Upload folder using huggingface_hub
- .gitattributes +2 -0
- ensemble/1/.tmp +0 -0
- ensemble/config.pbtxt +606 -0
- postprocessing/1/__pycache__/model.cpython-312.pyc +0 -0
- postprocessing/1/model.py +177 -0
- postprocessing/config.pbtxt +70 -0
- preprocessing/1/__pycache__/model.cpython-312.pyc +0 -0
- preprocessing/1/model.py +908 -0
- preprocessing/config.pbtxt +240 -0
- tensorrt_llm/1/.gitkeep +0 -0
- tensorrt_llm/1/config.json +362 -0
- tensorrt_llm/1/model.py +1386 -0
- tensorrt_llm/1/rank0.engine +3 -0
- tensorrt_llm/1/rank1.engine +3 -0
- tensorrt_llm/config.pbtxt +757 -0
- tensorrt_llm_bls/1/__pycache__/model.cpython-312.pyc +0 -0
- tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-312.pyc +0 -0
- tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-312.pyc +0 -0
- tensorrt_llm_bls/1/lib/decode.py +428 -0
- tensorrt_llm_bls/1/lib/triton_decoder.py +542 -0
- tensorrt_llm_bls/1/model.py +146 -0
- tensorrt_llm_bls/config.pbtxt +388 -0
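The listing above is the complete Triton model repository (ensemble, preprocessing, tensorrt_llm, postprocessing, tensorrt_llm_bls) pushed with huggingface_hub. As a minimal sketch of pulling it back down for serving, assuming huggingface_hub is installed; the repo id below is a placeholder, not the actual repository name:

```python
# Minimal sketch: download this Triton model repository from the Hub.
# "your-org/your-trtllm-repo" is a hypothetical repo id; substitute the real one.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="your-org/your-trtllm-repo",  # placeholder repo id
    local_dir="./triton_model_repo",      # downloaded layout mirrors this commit
)
print(local_dir)
```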
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tensorrt_llm/1/rank0.engine filter=lfs diff=lfs merge=lfs -text
+tensorrt_llm/1/rank1.engine filter=lfs diff=lfs merge=lfs -text
ensemble/1/.tmp
ADDED
File without changes
ensemble/config.pbtxt
ADDED
@@ -0,0 +1,606 @@
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "ensemble"
|
28 |
+
platform: "ensemble"
|
29 |
+
max_batch_size: 32
|
30 |
+
input [
|
31 |
+
{
|
32 |
+
name: "text_input"
|
33 |
+
data_type: TYPE_STRING
|
34 |
+
dims: [ 1 ]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
name: "decoder_text_input"
|
38 |
+
data_type: TYPE_STRING
|
39 |
+
dims: [ 1 ]
|
40 |
+
optional: true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
name: "max_tokens"
|
44 |
+
data_type: TYPE_INT32
|
45 |
+
dims: [ 1 ]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
name: "num_return_sequences"
|
49 |
+
data_type: TYPE_INT32
|
50 |
+
dims: [ 1 ]
|
51 |
+
optional: true
|
52 |
+
},
|
53 |
+
{
|
54 |
+
name: "bad_words"
|
55 |
+
data_type: TYPE_STRING
|
56 |
+
dims: [ -1 ]
|
57 |
+
optional: true
|
58 |
+
},
|
59 |
+
{
|
60 |
+
name: "stop_words"
|
61 |
+
data_type: TYPE_STRING
|
62 |
+
dims: [ -1 ]
|
63 |
+
optional: true
|
64 |
+
},
|
65 |
+
{
|
66 |
+
name: "exclude_input_in_output"
|
67 |
+
data_type: TYPE_BOOL
|
68 |
+
dims: [ 1 ]
|
69 |
+
optional: true
|
70 |
+
},
|
71 |
+
{
|
72 |
+
name: "end_id"
|
73 |
+
data_type: TYPE_INT32
|
74 |
+
dims: [ 1 ]
|
75 |
+
optional: true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
name: "pad_id"
|
79 |
+
data_type: TYPE_INT32
|
80 |
+
dims: [ 1 ]
|
81 |
+
optional: true
|
82 |
+
},
|
83 |
+
{
|
84 |
+
name: "top_k"
|
85 |
+
data_type: TYPE_INT32
|
86 |
+
dims: [ 1 ]
|
87 |
+
optional: true
|
88 |
+
},
|
89 |
+
{
|
90 |
+
name: "top_p"
|
91 |
+
data_type: TYPE_FP32
|
92 |
+
dims: [ 1 ]
|
93 |
+
optional: true
|
94 |
+
},
|
95 |
+
{
|
96 |
+
name: "temperature"
|
97 |
+
data_type: TYPE_FP32
|
98 |
+
dims: [ 1 ]
|
99 |
+
optional: true
|
100 |
+
},
|
101 |
+
{
|
102 |
+
name: "length_penalty"
|
103 |
+
data_type: TYPE_FP32
|
104 |
+
dims: [ 1 ]
|
105 |
+
optional: true
|
106 |
+
},
|
107 |
+
{
|
108 |
+
name: "repetition_penalty"
|
109 |
+
data_type: TYPE_FP32
|
110 |
+
dims: [ 1 ]
|
111 |
+
optional: true
|
112 |
+
},
|
113 |
+
{
|
114 |
+
name: "min_length"
|
115 |
+
data_type: TYPE_INT32
|
116 |
+
dims: [ 1 ]
|
117 |
+
optional: true
|
118 |
+
},
|
119 |
+
{
|
120 |
+
name: "presence_penalty"
|
121 |
+
data_type: TYPE_FP32
|
122 |
+
dims: [ 1 ]
|
123 |
+
optional: true
|
124 |
+
},
|
125 |
+
{
|
126 |
+
name: "frequency_penalty"
|
127 |
+
data_type: TYPE_FP32
|
128 |
+
dims: [ 1 ]
|
129 |
+
optional: true
|
130 |
+
},
|
131 |
+
{
|
132 |
+
name: "random_seed"
|
133 |
+
data_type: TYPE_UINT64
|
134 |
+
dims: [ 1 ]
|
135 |
+
optional: true
|
136 |
+
},
|
137 |
+
{
|
138 |
+
name: "return_log_probs"
|
139 |
+
data_type: TYPE_BOOL
|
140 |
+
dims: [ 1 ]
|
141 |
+
optional: true
|
142 |
+
},
|
143 |
+
{
|
144 |
+
name: "return_context_logits"
|
145 |
+
data_type: TYPE_BOOL
|
146 |
+
dims: [ 1 ]
|
147 |
+
optional: true
|
148 |
+
},
|
149 |
+
{
|
150 |
+
name: "return_generation_logits"
|
151 |
+
data_type: TYPE_BOOL
|
152 |
+
dims: [ 1 ]
|
153 |
+
optional: true
|
154 |
+
},
|
155 |
+
{
|
156 |
+
name: "return_kv_cache_reuse_stats"
|
157 |
+
data_type: TYPE_BOOL
|
158 |
+
dims: [ 1 ]
|
159 |
+
optional: true
|
160 |
+
},
|
161 |
+
{
|
162 |
+
name: "beam_width"
|
163 |
+
data_type: TYPE_INT32
|
164 |
+
dims: [ 1 ]
|
165 |
+
optional: true
|
166 |
+
},
|
167 |
+
{
|
168 |
+
name: "stream"
|
169 |
+
data_type: TYPE_BOOL
|
170 |
+
dims: [ 1 ]
|
171 |
+
optional: true
|
172 |
+
},
|
173 |
+
{
|
174 |
+
name: "prompt_embedding_table"
|
175 |
+
data_type: TYPE_FP16
|
176 |
+
dims: [ -1, -1 ]
|
177 |
+
optional: true
|
178 |
+
},
|
179 |
+
{
|
180 |
+
name: "prompt_table_extra_id"
|
181 |
+
data_type: TYPE_UINT64
|
182 |
+
dims: [ 1 ]
|
183 |
+
optional: true
|
184 |
+
},
|
185 |
+
{
|
186 |
+
name: "prompt_vocab_size"
|
187 |
+
data_type: TYPE_INT32
|
188 |
+
dims: [ 1 ]
|
189 |
+
optional: true
|
190 |
+
},
|
191 |
+
{
|
192 |
+
name: "embedding_bias_words"
|
193 |
+
data_type: TYPE_STRING
|
194 |
+
dims: [ -1 ]
|
195 |
+
optional: true
|
196 |
+
},
|
197 |
+
{
|
198 |
+
name: "embedding_bias_weights"
|
199 |
+
data_type: TYPE_FP32
|
200 |
+
dims: [ -1 ]
|
201 |
+
optional: true
|
202 |
+
},
|
203 |
+
# the unique task ID for the given LoRA.
|
204 |
+
# To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
|
205 |
+
# The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
|
206 |
+
# If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
|
207 |
+
{
|
208 |
+
name: "lora_task_id"
|
209 |
+
data_type: TYPE_UINT64
|
210 |
+
dims: [ 1 ]
|
211 |
+
optional: true
|
212 |
+
},
|
213 |
+
# weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
|
214 |
+
# where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
|
215 |
+
# each of the in / out tensors are first flattened and then concatenated together in the format above.
|
216 |
+
# D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
|
217 |
+
{
|
218 |
+
name: "lora_weights"
|
219 |
+
data_type: TYPE_FP16
|
220 |
+
dims: [ -1, -1 ]
|
221 |
+
optional: true
|
222 |
+
allow_ragged_batch: true
|
223 |
+
},
|
224 |
+
# module identifier (same size a first dimension of lora_weights)
|
225 |
+
# See LoraModule::ModuleType for model id mapping
|
226 |
+
#
|
227 |
+
# "attn_qkv": 0 # compbined qkv adapter
|
228 |
+
# "attn_q": 1 # q adapter
|
229 |
+
# "attn_k": 2 # k adapter
|
230 |
+
# "attn_v": 3 # v adapter
|
231 |
+
# "attn_dense": 4 # adapter for the dense layer in attention
|
232 |
+
# "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
|
233 |
+
# "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
|
234 |
+
# "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
|
235 |
+
#
|
236 |
+
# last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
|
237 |
+
{
|
238 |
+
name: "lora_config"
|
239 |
+
data_type: TYPE_INT32
|
240 |
+
dims: [ -1, 3 ]
|
241 |
+
optional: true
|
242 |
+
allow_ragged_batch: true
|
243 |
+
},
|
244 |
+
{
|
245 |
+
name: "guided_decoding_guide_type"
|
246 |
+
data_type: TYPE_STRING
|
247 |
+
dims: [ 1 ]
|
248 |
+
optional: true
|
249 |
+
allow_ragged_batch: true
|
250 |
+
},
|
251 |
+
{
|
252 |
+
name: "guided_decoding_guide"
|
253 |
+
data_type: TYPE_STRING
|
254 |
+
dims: [ 1 ]
|
255 |
+
optional: true
|
256 |
+
allow_ragged_batch: true
|
257 |
+
}
|
258 |
+
]
|
259 |
+
output [
|
260 |
+
{
|
261 |
+
name: "text_output"
|
262 |
+
data_type: TYPE_STRING
|
263 |
+
dims: [ -1 ]
|
264 |
+
},
|
265 |
+
{
|
266 |
+
name: "cum_log_probs"
|
267 |
+
data_type: TYPE_FP32
|
268 |
+
dims: [ -1 ]
|
269 |
+
},
|
270 |
+
{
|
271 |
+
name: "output_log_probs"
|
272 |
+
data_type: TYPE_FP32
|
273 |
+
dims: [ -1, -1 ]
|
274 |
+
},
|
275 |
+
{
|
276 |
+
name: "context_logits"
|
277 |
+
data_type: TYPE_FP16
|
278 |
+
dims: [ -1, -1 ]
|
279 |
+
},
|
280 |
+
{
|
281 |
+
name: "generation_logits"
|
282 |
+
data_type: TYPE_FP16
|
283 |
+
dims: [ -1, -1, -1 ]
|
284 |
+
},
|
285 |
+
{
|
286 |
+
name: "batch_index"
|
287 |
+
data_type: TYPE_INT32
|
288 |
+
dims: [ 1 ]
|
289 |
+
},
|
290 |
+
{
|
291 |
+
name: "sequence_index"
|
292 |
+
data_type: TYPE_INT32
|
293 |
+
dims: [ 1 ]
|
294 |
+
},
|
295 |
+
{
|
296 |
+
name: "kv_cache_alloc_new_blocks"
|
297 |
+
data_type: TYPE_INT32
|
298 |
+
dims: [ 1 ]
|
299 |
+
},
|
300 |
+
{
|
301 |
+
name: "kv_cache_reused_blocks"
|
302 |
+
data_type: TYPE_INT32
|
303 |
+
dims: [ 1 ]
|
304 |
+
},
|
305 |
+
{
|
306 |
+
name: "kv_cache_alloc_total_blocks"
|
307 |
+
data_type: TYPE_INT32
|
308 |
+
dims: [ 1 ]
|
309 |
+
}
|
310 |
+
]
|
311 |
+
ensemble_scheduling {
|
312 |
+
step [
|
313 |
+
{
|
314 |
+
model_name: "preprocessing"
|
315 |
+
model_version: -1
|
316 |
+
input_map {
|
317 |
+
key: "QUERY"
|
318 |
+
value: "text_input"
|
319 |
+
}
|
320 |
+
input_map {
|
321 |
+
key: "DECODER_QUERY"
|
322 |
+
value: "decoder_text_input"
|
323 |
+
}
|
324 |
+
input_map {
|
325 |
+
key: "REQUEST_OUTPUT_LEN"
|
326 |
+
value: "max_tokens"
|
327 |
+
}
|
328 |
+
input_map {
|
329 |
+
key: "BAD_WORDS_DICT"
|
330 |
+
value: "bad_words"
|
331 |
+
}
|
332 |
+
input_map {
|
333 |
+
key: "STOP_WORDS_DICT"
|
334 |
+
value: "stop_words"
|
335 |
+
}
|
336 |
+
input_map {
|
337 |
+
key: "EMBEDDING_BIAS_WORDS"
|
338 |
+
value: "embedding_bias_words"
|
339 |
+
}
|
340 |
+
input_map {
|
341 |
+
key: "EMBEDDING_BIAS_WEIGHTS"
|
342 |
+
value: "embedding_bias_weights"
|
343 |
+
}
|
344 |
+
input_map {
|
345 |
+
key: "END_ID"
|
346 |
+
value: "end_id"
|
347 |
+
}
|
348 |
+
input_map {
|
349 |
+
key: "PAD_ID"
|
350 |
+
value: "pad_id"
|
351 |
+
}
|
352 |
+
input_map {
|
353 |
+
key: "PROMPT_TABLE_EXTRA_ID"
|
354 |
+
value: "prompt_table_extra_id"
|
355 |
+
}
|
356 |
+
output_map {
|
357 |
+
key: "REQUEST_INPUT_LEN"
|
358 |
+
value: "_REQUEST_INPUT_LEN"
|
359 |
+
}
|
360 |
+
output_map {
|
361 |
+
key: "INPUT_ID"
|
362 |
+
value: "_INPUT_ID"
|
363 |
+
}
|
364 |
+
output_map {
|
365 |
+
key: "REQUEST_DECODER_INPUT_LEN"
|
366 |
+
value: "_REQUEST_DECODER_INPUT_LEN"
|
367 |
+
}
|
368 |
+
output_map {
|
369 |
+
key: "DECODER_INPUT_ID"
|
370 |
+
value: "_DECODER_INPUT_ID"
|
371 |
+
}
|
372 |
+
output_map {
|
373 |
+
key: "REQUEST_OUTPUT_LEN"
|
374 |
+
value: "_REQUEST_OUTPUT_LEN"
|
375 |
+
}
|
376 |
+
output_map {
|
377 |
+
key: "STOP_WORDS_IDS"
|
378 |
+
value: "_STOP_WORDS_IDS"
|
379 |
+
}
|
380 |
+
output_map {
|
381 |
+
key: "BAD_WORDS_IDS"
|
382 |
+
value: "_BAD_WORDS_IDS"
|
383 |
+
}
|
384 |
+
output_map {
|
385 |
+
key: "EMBEDDING_BIAS"
|
386 |
+
value: "_EMBEDDING_BIAS"
|
387 |
+
}
|
388 |
+
output_map {
|
389 |
+
key: "OUT_END_ID"
|
390 |
+
value: "_PREPROCESSOR_END_ID"
|
391 |
+
}
|
392 |
+
output_map {
|
393 |
+
key: "OUT_PAD_ID"
|
394 |
+
value: "_PREPROCESSOR_PAD_ID"
|
395 |
+
}
|
396 |
+
output_map {
|
397 |
+
key: "OUT_PROMPT_TABLE_EXTRA_IDS"
|
398 |
+
value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
|
399 |
+
}
|
400 |
+
},
|
401 |
+
{
|
402 |
+
model_name: "tensorrt_llm"
|
403 |
+
model_version: -1
|
404 |
+
input_map {
|
405 |
+
key: "input_ids"
|
406 |
+
value: "_INPUT_ID"
|
407 |
+
}
|
408 |
+
input_map {
|
409 |
+
key: "decoder_input_ids"
|
410 |
+
value: "_DECODER_INPUT_ID"
|
411 |
+
}
|
412 |
+
input_map {
|
413 |
+
key: "input_lengths"
|
414 |
+
value: "_REQUEST_INPUT_LEN"
|
415 |
+
}
|
416 |
+
input_map {
|
417 |
+
key: "decoder_input_lengths"
|
418 |
+
value: "_REQUEST_DECODER_INPUT_LEN"
|
419 |
+
}
|
420 |
+
input_map {
|
421 |
+
key: "exclude_input_in_output"
|
422 |
+
value: "exclude_input_in_output"
|
423 |
+
}
|
424 |
+
input_map {
|
425 |
+
key: "request_output_len"
|
426 |
+
value: "_REQUEST_OUTPUT_LEN"
|
427 |
+
}
|
428 |
+
input_map {
|
429 |
+
key: "end_id"
|
430 |
+
value: "_PREPROCESSOR_END_ID"
|
431 |
+
}
|
432 |
+
input_map {
|
433 |
+
key: "pad_id"
|
434 |
+
value: "_PREPROCESSOR_PAD_ID"
|
435 |
+
}
|
436 |
+
input_map {
|
437 |
+
key: "embedding_bias"
|
438 |
+
value: "_EMBEDDING_BIAS"
|
439 |
+
}
|
440 |
+
input_map {
|
441 |
+
key: "runtime_top_k"
|
442 |
+
value: "top_k"
|
443 |
+
}
|
444 |
+
input_map {
|
445 |
+
key: "runtime_top_p"
|
446 |
+
value: "top_p"
|
447 |
+
}
|
448 |
+
input_map {
|
449 |
+
key: "temperature"
|
450 |
+
value: "temperature"
|
451 |
+
}
|
452 |
+
input_map {
|
453 |
+
key: "len_penalty"
|
454 |
+
value: "length_penalty"
|
455 |
+
}
|
456 |
+
input_map {
|
457 |
+
key: "repetition_penalty"
|
458 |
+
value: "repetition_penalty"
|
459 |
+
}
|
460 |
+
input_map {
|
461 |
+
key: "min_length"
|
462 |
+
value: "min_length"
|
463 |
+
}
|
464 |
+
input_map {
|
465 |
+
key: "presence_penalty"
|
466 |
+
value: "presence_penalty"
|
467 |
+
}
|
468 |
+
input_map {
|
469 |
+
key: "frequency_penalty"
|
470 |
+
value: "frequency_penalty"
|
471 |
+
}
|
472 |
+
input_map {
|
473 |
+
key: "random_seed"
|
474 |
+
value: "random_seed"
|
475 |
+
}
|
476 |
+
input_map {
|
477 |
+
key: "return_log_probs"
|
478 |
+
value: "return_log_probs"
|
479 |
+
}
|
480 |
+
input_map {
|
481 |
+
key: "return_context_logits"
|
482 |
+
value: "return_context_logits"
|
483 |
+
}
|
484 |
+
input_map {
|
485 |
+
key: "return_generation_logits"
|
486 |
+
value: "return_generation_logits"
|
487 |
+
}
|
488 |
+
input_map {
|
489 |
+
key: "return_kv_cache_reuse_stats"
|
490 |
+
value: "return_kv_cache_reuse_stats"
|
491 |
+
}
|
492 |
+
input_map {
|
493 |
+
key: "num_return_sequences"
|
494 |
+
value: "num_return_sequences"
|
495 |
+
}
|
496 |
+
input_map {
|
497 |
+
key: "beam_width"
|
498 |
+
value: "beam_width"
|
499 |
+
}
|
500 |
+
input_map {
|
501 |
+
key: "streaming"
|
502 |
+
value: "stream"
|
503 |
+
}
|
504 |
+
input_map {
|
505 |
+
key: "prompt_embedding_table"
|
506 |
+
value: "prompt_embedding_table"
|
507 |
+
}
|
508 |
+
input_map {
|
509 |
+
key: "prompt_vocab_size"
|
510 |
+
value: "prompt_vocab_size"
|
511 |
+
}
|
512 |
+
input_map {
|
513 |
+
key: "stop_words_list"
|
514 |
+
value: "_STOP_WORDS_IDS"
|
515 |
+
}
|
516 |
+
input_map {
|
517 |
+
key: "bad_words_list"
|
518 |
+
value: "_BAD_WORDS_IDS"
|
519 |
+
}
|
520 |
+
input_map {
|
521 |
+
key: "prompt_table_extra_ids"
|
522 |
+
value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
|
523 |
+
},
|
524 |
+
input_map {
|
525 |
+
key: "lora_task_id",
|
526 |
+
value: "lora_task_id"
|
527 |
+
},
|
528 |
+
input_map {
|
529 |
+
key: "lora_weights",
|
530 |
+
value: "lora_weights"
|
531 |
+
},
|
532 |
+
input_map {
|
533 |
+
key: "lora_config",
|
534 |
+
value: "lora_config"
|
535 |
+
},
|
536 |
+
input_map {
|
537 |
+
key: "guided_decoding_guide_type",
|
538 |
+
value: "guided_decoding_guide_type"
|
539 |
+
},
|
540 |
+
input_map {
|
541 |
+
key: "guided_decoding_guide",
|
542 |
+
value: "guided_decoding_guide"
|
543 |
+
}
|
544 |
+
output_map {
|
545 |
+
key: "output_ids"
|
546 |
+
value: "_TOKENS_BATCH"
|
547 |
+
}
|
548 |
+
output_map {
|
549 |
+
key: "sequence_length"
|
550 |
+
value: "_SEQUENCE_LENGTH"
|
551 |
+
},
|
552 |
+
output_map {
|
553 |
+
key: "cum_log_probs"
|
554 |
+
value: "cum_log_probs"
|
555 |
+
}
|
556 |
+
output_map {
|
557 |
+
key: "output_log_probs"
|
558 |
+
value: "output_log_probs"
|
559 |
+
},
|
560 |
+
output_map {
|
561 |
+
key: "context_logits"
|
562 |
+
value: "context_logits"
|
563 |
+
},
|
564 |
+
output_map {
|
565 |
+
key: "generation_logits"
|
566 |
+
value: "generation_logits"
|
567 |
+
},
|
568 |
+
output_map {
|
569 |
+
key: "batch_index"
|
570 |
+
value: "batch_index"
|
571 |
+
},
|
572 |
+
output_map {
|
573 |
+
key: "sequence_index"
|
574 |
+
value: "sequence_index"
|
575 |
+
},
|
576 |
+
output_map {
|
577 |
+
key: "kv_cache_alloc_new_blocks"
|
578 |
+
value: "kv_cache_alloc_new_blocks"
|
579 |
+
},
|
580 |
+
output_map {
|
581 |
+
key: "kv_cache_reused_blocks"
|
582 |
+
value: "kv_cache_reused_blocks"
|
583 |
+
},
|
584 |
+
output_map {
|
585 |
+
key: "kv_cache_alloc_total_blocks"
|
586 |
+
value: "kv_cache_alloc_total_blocks"
|
587 |
+
}
|
588 |
+
},
|
589 |
+
{
|
590 |
+
model_name: "postprocessing"
|
591 |
+
model_version: -1
|
592 |
+
input_map {
|
593 |
+
key: "TOKENS_BATCH"
|
594 |
+
value: "_TOKENS_BATCH"
|
595 |
+
}
|
596 |
+
input_map {
|
597 |
+
key: "SEQUENCE_LENGTH"
|
598 |
+
value: "_SEQUENCE_LENGTH"
|
599 |
+
}
|
600 |
+
output_map {
|
601 |
+
key: "OUTPUT"
|
602 |
+
value: "text_output"
|
603 |
+
}
|
604 |
+
}
|
605 |
+
]
|
606 |
+
}
|
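The ensemble config above exposes text_input and max_tokens as required inputs (plus many optional sampling controls) and returns text_output, routing each request through preprocessing, tensorrt_llm, and postprocessing. A minimal client sketch, assuming a Triton server is already serving this repository at localhost:8000 and that the tritonclient package is installed:

```python
# Minimal sketch of calling the "ensemble" model over HTTP; assumes Triton is
# running this repository at localhost:8000 and tritonclient[http] is installed.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

text = np.array([["What is Triton Inference Server?"]], dtype=object)  # text_input: TYPE_STRING, dims [1]
max_tokens = np.array([[64]], dtype=np.int32)                          # max_tokens: TYPE_INT32, dims [1]

inputs = [
    httpclient.InferInput("text_input", list(text.shape), "BYTES"),
    httpclient.InferInput("max_tokens", list(max_tokens.shape), "INT32"),
]
inputs[0].set_data_from_numpy(text)
inputs[1].set_data_from_numpy(max_tokens)

result = client.infer(model_name="ensemble", inputs=inputs)
print(result.as_numpy("text_output"))
```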
postprocessing/1/__pycache__/model.cpython-312.pyc
ADDED
Binary file (6.52 kB).
postprocessing/1/model.py
ADDED
@@ -0,0 +1,177 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json

import numpy as np
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer


class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.
        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Parse model configs
        model_config = json.loads(args['model_config'])
        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
            'string_value']

        skip_special_tokens = model_config['parameters'].get(
            'skip_special_tokens')
        if skip_special_tokens is not None:
            skip_special_tokens_str = skip_special_tokens[
                'string_value'].lower()
            if skip_special_tokens_str in [
                    'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
            ]:
                self.skip_special_tokens = skip_special_tokens_str in [
                    'true', '1', 't', 'y', 'yes'
                ]
            else:
                print(
                    f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default."
                )
                self.skip_special_tokens = True
        else:
            print(
                f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
            )
            self.skip_special_tokens = True

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                       legacy=False,
                                                       padding_side='left',
                                                       trust_remote_code=True)
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Parse model output configs
        output_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT")

        # Convert Triton types to numpy types
        self.output_dtype = pb_utils.triton_string_to_numpy(
            output_config['data_type'])

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model, must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.
        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest
        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        tokens_batch = []
        sequence_lengths = []
        for idx, request in enumerate(requests):
            for input_tensor in request.inputs():
                if input_tensor.name() == "TOKENS_BATCH":
                    tokens_batch.append(input_tensor.as_numpy())
                elif input_tensor.name() == "SEQUENCE_LENGTH":
                    sequence_lengths.append(input_tensor.as_numpy())
                else:
                    raise ValueError(f"unknown input {input_tensor.name}")

        # batch decode
        list_of_tokens = []
        req_idx_offset = 0
        req_idx_offsets = [req_idx_offset]
        for idx, token_batch in enumerate(tokens_batch):
            for batch_idx, beam_tokens in enumerate(token_batch):
                for beam_idx, tokens in enumerate(beam_tokens):
                    seq_len = sequence_lengths[idx][batch_idx][beam_idx]
                    list_of_tokens.append(tokens[:seq_len])
                    req_idx_offset += 1

            req_idx_offsets.append(req_idx_offset)

        all_outputs = self.tokenizer.batch_decode(
            list_of_tokens, skip_special_tokens=self.skip_special_tokens)

        # construct responses
        responses = []
        for idx, request in enumerate(requests):
            req_outputs = [
                x.encode('utf8')
                for x in all_outputs[req_idx_offsets[idx]:req_idx_offsets[idx +
                                                                          1]]
            ]

            output_tensor = pb_utils.Tensor(
                'OUTPUT',
                np.array(req_outputs).astype(self.output_dtype))

            outputs = [output_tensor]

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #    output_tensors=..., TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=outputs)
            responses.append(inference_response)
        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
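For intuition, the decode path above expects TOKENS_BATCH shaped [batch, beams, max_len] and SEQUENCE_LENGTH shaped [batch, beams], and trims each beam to its real length before batch_decode. A standalone sketch of that behaviour, using a small public tokenizer as a stand-in for the Llama tokenizer named in config.pbtxt:

```python
# Standalone sketch of the postprocessing decode logic; "gpt2" is only a
# stand-in tokenizer so the example runs without the configured model.
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# TOKENS_BATCH: [batch=1, beams=1, max_len=6]; SEQUENCE_LENGTH: [batch=1, beams=1]
tokens_batch = np.array([[[15496, 995, 0, 0, 0, 0]]], dtype=np.int32)
sequence_length = np.array([[2]], dtype=np.int32)

trimmed = []
for b, beams in enumerate(tokens_batch):
    for k, tokens in enumerate(beams):
        seq_len = sequence_length[b][k]      # keep only the generated tokens
        trimmed.append(tokens[:seq_len])

print(tokenizer.batch_decode(trimmed, skip_special_tokens=True))
```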
postprocessing/config.pbtxt
ADDED
@@ -0,0 +1,70 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "postprocessing"
backend: "python"
max_batch_size: 32
dynamic_batching {}
input [
  {
    name: "TOKENS_BATCH"
    data_type: TYPE_INT32
    dims: [ -1, -1 ]
  },
  {
    name: "SEQUENCE_LENGTH"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "OUTPUT"
    data_type: TYPE_STRING
    dims: [ -1 ]
  }
]

parameters {
  key: "tokenizer_dir"
  value: {
    string_value: "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
  }
}

parameters {
  key: "skip_special_tokens"
  value: {
    string_value: "True"
  }
}

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
preprocessing/1/__pycache__/model.cpython-312.pyc
ADDED
Binary file (39.3 kB).
preprocessing/1/model.py
ADDED
@@ -0,0 +1,908 @@
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
import base64
|
28 |
+
import io
|
29 |
+
import json
|
30 |
+
import os
|
31 |
+
from typing import List
|
32 |
+
|
33 |
+
import numpy as np
|
34 |
+
import requests
|
35 |
+
import triton_python_backend_utils as pb_utils
|
36 |
+
from PIL import Image
|
37 |
+
from transformers import AutoProcessor, AutoTokenizer, T5Tokenizer
|
38 |
+
|
39 |
+
|
40 |
+
class TritonPythonModel:
|
41 |
+
"""Your Python model must use the same class name. Every Python model
|
42 |
+
that is created must have "TritonPythonModel" as the class name.
|
43 |
+
"""
|
44 |
+
|
45 |
+
def initialize(self, args):
|
46 |
+
"""`initialize` is called only once when the model is being loaded.
|
47 |
+
Implementing `initialize` function is optional. This function allows
|
48 |
+
the model to initialize any state associated with this model.
|
49 |
+
Parameters
|
50 |
+
----------
|
51 |
+
args : dict
|
52 |
+
Both keys and values are strings. The dictionary keys and values are:
|
53 |
+
* model_config: A JSON string containing the model configuration
|
54 |
+
* model_instance_kind: A string containing model instance kind
|
55 |
+
* model_instance_device_id: A string containing model instance device ID
|
56 |
+
* model_repository: Model repository path
|
57 |
+
* model_version: Model version
|
58 |
+
* model_name: Model name
|
59 |
+
"""
|
60 |
+
# Parse model configs
|
61 |
+
model_config = json.loads(args['model_config'])
|
62 |
+
tokenizer_dir = model_config['parameters']['tokenizer_dir'][
|
63 |
+
'string_value']
|
64 |
+
|
65 |
+
add_special_tokens = model_config['parameters'].get(
|
66 |
+
'add_special_tokens')
|
67 |
+
visual_model_path = model_config['parameters']['visual_model_path'][
|
68 |
+
'string_value']
|
69 |
+
max_num_images = model_config['parameters'].get('max_num_images')
|
70 |
+
|
71 |
+
if max_num_images is not None:
|
72 |
+
max_num_images_str = max_num_images['string_value']
|
73 |
+
if max_num_images_str.isdigit():
|
74 |
+
self.max_num_images = int(max_num_images_str)
|
75 |
+
else:
|
76 |
+
print(
|
77 |
+
f"[TensorRT-LLM][WARNING] 'max_num_images' parameter is not set correctly (value is {max_num_images_str}). Will be set to None"
|
78 |
+
)
|
79 |
+
self.max_num_images = None
|
80 |
+
else:
|
81 |
+
print(
|
82 |
+
f"[TensorRT-LLM][WARNING] Don't setup 'max_num_images'. Set it as None by default."
|
83 |
+
)
|
84 |
+
self.max_num_images = None
|
85 |
+
if visual_model_path == "${visual_model_path}" or visual_model_path == "":
|
86 |
+
visual_model_path = None
|
87 |
+
|
88 |
+
if add_special_tokens is not None:
|
89 |
+
add_special_tokens_str = add_special_tokens['string_value'].lower()
|
90 |
+
if add_special_tokens_str in [
|
91 |
+
'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
|
92 |
+
]:
|
93 |
+
self.add_special_tokens = add_special_tokens_str in [
|
94 |
+
'true', '1', 't', 'y', 'yes'
|
95 |
+
]
|
96 |
+
else:
|
97 |
+
print(
|
98 |
+
f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default."
|
99 |
+
)
|
100 |
+
self.add_special_tokens = True
|
101 |
+
else:
|
102 |
+
print(
|
103 |
+
f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default."
|
104 |
+
)
|
105 |
+
self.add_special_tokens = True
|
106 |
+
|
107 |
+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
|
108 |
+
legacy=False,
|
109 |
+
padding_side='left',
|
110 |
+
trust_remote_code=True)
|
111 |
+
|
112 |
+
if isinstance(self.tokenizer, T5Tokenizer):
|
113 |
+
self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
|
114 |
+
|
115 |
+
if not self.tokenizer.pad_token:
|
116 |
+
self.tokenizer.pad_token = self.tokenizer.eos_token
|
117 |
+
|
118 |
+
self.tokenizer_end_id = self.tokenizer.encode(
|
119 |
+
self.tokenizer.eos_token, add_special_tokens=False)[0]
|
120 |
+
self.tokenizer_pad_id = self.tokenizer.encode(
|
121 |
+
self.tokenizer.pad_token, add_special_tokens=False)[0]
|
122 |
+
self.vocab_size = self.tokenizer.vocab_size
|
123 |
+
|
124 |
+
self.is_multimodal = False
|
125 |
+
self.model_type = None
|
126 |
+
self.vision_preprocessor = None
|
127 |
+
|
128 |
+
if visual_model_path is not None:
|
129 |
+
self.is_multimodal = True
|
130 |
+
visual_model_path = os.path.join(visual_model_path, 'config.json')
|
131 |
+
with open(visual_model_path, 'r') as f:
|
132 |
+
visual_model_config = json.load(f)
|
133 |
+
self.model_type = visual_model_config['builder_config'][
|
134 |
+
'model_type']
|
135 |
+
|
136 |
+
assert self.model_type in [
|
137 |
+
'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision'
|
138 |
+
], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila, mllama and llava_onevision. Got {self.model_type}."
|
139 |
+
|
140 |
+
assert self.model_type != 'llava_onevison' or self.max_num_images is None or self.max_num_images <= 1, f"LLaVA-OneVsion is not support multi image inference currently."
|
141 |
+
|
142 |
+
llm_model_path = model_config['parameters']['gpt_model_path'][
|
143 |
+
'string_value']
|
144 |
+
llm_model_path = os.path.join(llm_model_path, 'config.json')
|
145 |
+
with open(llm_model_path, 'r') as f:
|
146 |
+
llm_model_config = json.load(f)
|
147 |
+
self.vocab_size = int(
|
148 |
+
llm_model_config["pretrained_config"]["vocab_size"])
|
149 |
+
self._setup_ptable_shape(llm_model_config)
|
150 |
+
|
151 |
+
if self.model_type == 'mllama' or self.model_type == 'llava_onevision':
|
152 |
+
self.vision_preprocessor = VisionPreProcessor(
|
153 |
+
self.model_type,
|
154 |
+
AutoProcessor.from_pretrained(tokenizer_dir), model_config)
|
155 |
+
|
156 |
+
# Parse model output configs and convert Triton types to numpy types
|
157 |
+
output_names = [
|
158 |
+
"INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
|
159 |
+
"REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
|
160 |
+
"OUT_END_ID", "OUT_PAD_ID", "OUT_PROMPT_TABLE_EXTRA_IDS",
|
161 |
+
"PIXEL_VALUES", "IMAGE_SIZES"
|
162 |
+
]
|
163 |
+
input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
|
164 |
+
for input_name in input_names:
|
165 |
+
setattr(
|
166 |
+
self,
|
167 |
+
input_name.lower() + "_dtype",
|
168 |
+
pb_utils.triton_string_to_numpy(
|
169 |
+
pb_utils.get_input_config_by_name(
|
170 |
+
model_config, input_name)['data_type']))
|
171 |
+
|
172 |
+
for output_name in output_names:
|
173 |
+
setattr(
|
174 |
+
self,
|
175 |
+
output_name.lower() + "_dtype",
|
176 |
+
pb_utils.triton_string_to_numpy(
|
177 |
+
pb_utils.get_output_config_by_name(
|
178 |
+
model_config, output_name)['data_type']))
|
179 |
+
|
180 |
+
def _setup_ptable_shape(self, llm_model_config):
|
181 |
+
max_prompt_embedding_table_size = llm_model_config['build_config'][
|
182 |
+
'max_prompt_embedding_table_size']
|
183 |
+
max_batch_size = llm_model_config['build_config']['max_batch_size']
|
184 |
+
|
185 |
+
num_visual_features = max_prompt_embedding_table_size // max_batch_size
|
186 |
+
hidden_size = llm_model_config['pretrained_config']['hidden_size']
|
187 |
+
if self.max_num_images is not None:
|
188 |
+
num_visual_features = num_visual_features // self.max_num_images
|
189 |
+
|
190 |
+
self.ptable_shape = (-1, num_visual_features, hidden_size)
|
191 |
+
|
192 |
+
def execute(self, requests):
|
193 |
+
"""`execute` must be implemented in every Python model. `execute`
|
194 |
+
function receives a list of pb_utils.InferenceRequest as the only
|
195 |
+
argument. This function is called when an inference is requested
|
196 |
+
for this model. Depending on the batching configuration (e.g. Dynamic
|
197 |
+
Batching) used, `requests` may contain multiple requests. Every
|
198 |
+
Python model, must create one pb_utils.InferenceResponse for every
|
199 |
+
pb_utils.InferenceRequest in `requests`. If there is an error, you can
|
200 |
+
set the error argument when creating a pb_utils.InferenceResponse.
|
201 |
+
Parameters
|
202 |
+
----------
|
203 |
+
requests : list
|
204 |
+
A list of pb_utils.InferenceRequest
|
205 |
+
Returns
|
206 |
+
-------
|
207 |
+
list
|
208 |
+
A list of pb_utils.InferenceResponse. The length of this list must
|
209 |
+
be the same as `requests`
|
210 |
+
"""
|
211 |
+
|
212 |
+
responses = []
|
213 |
+
|
214 |
+
# Every Python backend must iterate over everyone of the requests
|
215 |
+
# and create a pb_utils.InferenceResponse for each of them.
|
216 |
+
for idx, request in enumerate(requests):
|
217 |
+
# Get input tensors
|
218 |
+
query = pb_utils.get_input_tensor_by_name(request,
|
219 |
+
'QUERY').as_numpy()
|
220 |
+
batch_size = query.shape[0]
|
221 |
+
|
222 |
+
decoder_query = pb_utils.get_input_tensor_by_name(
|
223 |
+
request, 'DECODER_QUERY')
|
224 |
+
if decoder_query is not None:
|
225 |
+
decoder_query = decoder_query.as_numpy()
|
226 |
+
|
227 |
+
request_output_len = pb_utils.get_input_tensor_by_name(
|
228 |
+
request, 'REQUEST_OUTPUT_LEN').as_numpy()
|
229 |
+
|
230 |
+
bad_words_dict = pb_utils.get_input_tensor_by_name(
|
231 |
+
request, 'BAD_WORDS_DICT')
|
232 |
+
if bad_words_dict is not None:
|
233 |
+
bad_words_dict = bad_words_dict.as_numpy()
|
234 |
+
|
235 |
+
stop_words_dict = pb_utils.get_input_tensor_by_name(
|
236 |
+
request, 'STOP_WORDS_DICT')
|
237 |
+
if stop_words_dict is not None:
|
238 |
+
stop_words_dict = stop_words_dict.as_numpy()
|
239 |
+
|
240 |
+
embedding_bias_words = pb_utils.get_input_tensor_by_name(
|
241 |
+
request, 'EMBEDDING_BIAS_WORDS')
|
242 |
+
if embedding_bias_words is not None:
|
243 |
+
embedding_bias_words = embedding_bias_words.as_numpy()
|
244 |
+
|
245 |
+
embedding_bias_weights = pb_utils.get_input_tensor_by_name(
|
246 |
+
request, 'EMBEDDING_BIAS_WEIGHTS')
|
247 |
+
if embedding_bias_weights is not None:
|
248 |
+
embedding_bias_weights = embedding_bias_weights.as_numpy()
|
249 |
+
|
250 |
+
# Take the end_id from the input tensors
|
251 |
+
# If not specified, use tokenizer to get end_id
|
252 |
+
end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID')
|
253 |
+
if end_id is not None:
|
254 |
+
end_id = end_id.as_numpy()
|
255 |
+
else:
|
256 |
+
end_id = [[self.tokenizer_end_id]] * batch_size
|
257 |
+
|
258 |
+
# Take the pad_id from the input tensors
|
259 |
+
# If not specified, use tokenizer to get pad_id
|
260 |
+
pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID')
|
261 |
+
if pad_id is not None:
|
262 |
+
pad_id = pad_id.as_numpy()
|
263 |
+
else:
|
264 |
+
pad_id = [[self.tokenizer_pad_id]] * batch_size
|
265 |
+
|
266 |
+
# Take the extra_id from the input tensors
|
267 |
+
# Extra id is used in kv cache reuse for p-tuning
|
268 |
+
prompt_table_extra_id = pb_utils.get_input_tensor_by_name(
|
269 |
+
request, 'PROMPT_TABLE_EXTRA_ID')
|
270 |
+
if prompt_table_extra_id is not None:
|
271 |
+
prompt_table_extra_id = prompt_table_extra_id.as_numpy()
|
272 |
+
assert prompt_table_extra_id.shape[
|
273 |
+
0] == batch_size, "Prompt table extra id must have the same batch size as Query"
|
274 |
+
assert prompt_table_extra_id.shape[
|
275 |
+
1] == 1, "Multiple IDs cannot be provided for a single image"
|
276 |
+
|
277 |
+
# Preprocessing vision input passed as a url or bytes tensor
|
278 |
+
img_urls = pb_utils.get_input_tensor_by_name(request, 'IMAGE_URL')
|
279 |
+
image_bytes = pb_utils.get_input_tensor_by_name(
|
280 |
+
request, 'IMAGE_BYTES')
|
281 |
+
video_bytes = pb_utils.get_input_tensor_by_name(
|
282 |
+
request, 'VIDEO_BYTES')
|
283 |
+
vision_processed_tensors = []
|
284 |
+
visual_tokens = []
|
285 |
+
if self.is_multimodal and (img_urls or image_bytes or video_bytes):
|
286 |
+
assert self.vision_preprocessor != None, "Vision preprocessor for preparing images before encoding is None"
|
287 |
+
processed_tensors = {}
|
288 |
+
if self.model_type == 'mllama':
|
289 |
+
processed_tensors = self.vision_preprocessor.mllama_process(
|
290 |
+
queries=query.astype(str).tolist(),
|
291 |
+
img_urls=img_urls,
|
292 |
+
image_bytes=image_bytes,
|
293 |
+
)
|
294 |
+
elif self.model_type == 'llava_onevision':
|
295 |
+
if video_bytes is None:
|
296 |
+
processed_tensors, visual_tokens = self.vision_preprocessor.llava_onevision_process_image(
|
297 |
+
queries=query.astype(str).tolist(),
|
298 |
+
img_urls=img_urls,
|
299 |
+
image_bytes=image_bytes,
|
300 |
+
)
|
301 |
+
else:
|
302 |
+
processed_tensors, visual_tokens = self.vision_preprocessor.llava_onevision_process_video(
|
303 |
+
queries=query.astype(str).tolist(),
|
304 |
+
video_bytes=video_bytes,
|
305 |
+
)
|
306 |
+
else:
|
307 |
+
raise ValueError(
|
308 |
+
"Unsupported model type for IMAGE_BYTES or IMAGE_URL inputs"
|
309 |
+
)
|
310 |
+
vision_processed_tensors = [
|
311 |
+
pb_utils.Tensor.from_dlpack(k, v)
|
312 |
+
for k, v in processed_tensors.items()
|
313 |
+
]
|
314 |
+
else:
|
315 |
+
assert self.model_type != "llava_onevision", "Image processing requires IMAGE_BYTES or IMAGE_URL to be provided"
|
316 |
+
|
317 |
+
# Preprocessing input data.
|
318 |
+
# For the LLaVA_OneVision model, num_visual_features is not a fixed value
|
319 |
+
input_id, request_input_len = self._create_request(
|
320 |
+
query, visual_tokens)
|
321 |
+
if decoder_query is not None:
|
322 |
+
decoder_input_id, request_decoder_input_len = self._create_request(
|
323 |
+
decoder_query)
|
324 |
+
else:
|
325 |
+
decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32)
|
326 |
+
request_decoder_input_len = 1 * np.ones(
|
327 |
+
(batch_size, 1), np.int32)
|
328 |
+
|
329 |
+
bad_words = self._to_word_list_format(bad_words_dict, batch_size)
|
330 |
+
stop_words = self._to_word_list_format(stop_words_dict, batch_size)
|
331 |
+
|
332 |
+
embedding_bias = self._get_embedding_bias(
|
333 |
+
embedding_bias_words, embedding_bias_weights,
|
334 |
+
self.embedding_bias_weights_dtype, batch_size)
|
335 |
+
|
336 |
+
if prompt_table_extra_id is not None:
|
337 |
+
prompt_table_extra_ids = np.zeros_like(input_id)
|
338 |
+
for i in range(batch_size):
|
339 |
+
prompt_table_extra_ids[i] = np.where(
|
340 |
+
input_id[i] >= self.vocab_size,
|
341 |
+
prompt_table_extra_id[i], 0)
|
342 |
+
|
343 |
+
# Create output tensors. You need pb_utils.Tensor
|
344 |
+
# objects to create pb_utils.InferenceResponse.
|
345 |
+
input_id_tensor = pb_utils.Tensor(
|
346 |
+
'INPUT_ID', input_id.astype(self.input_id_dtype))
|
347 |
+
request_input_len_tensor = pb_utils.Tensor(
|
348 |
+
'REQUEST_INPUT_LEN',
|
349 |
+
request_input_len.astype(self.request_input_len_dtype))
|
350 |
+
decoder_input_id_tensor = pb_utils.Tensor(
|
351 |
+
'DECODER_INPUT_ID',
|
352 |
+
decoder_input_id.astype(self.decoder_input_id_dtype))
|
353 |
+
request_decoder_input_len_tensor = pb_utils.Tensor(
|
354 |
+
'REQUEST_DECODER_INPUT_LEN',
|
355 |
+
request_decoder_input_len.astype(
|
356 |
+
                    self.request_decoder_input_len_dtype))
            request_output_len_tensor = pb_utils.Tensor(
                'REQUEST_OUTPUT_LEN', request_output_len)
            bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
            stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
                                                    stop_words)
            embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
                                                    embedding_bias)
            end_id_tensor = pb_utils.Tensor('OUT_END_ID',
                                            np.array(end_id, dtype=np.int32))
            pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
                                            np.array(pad_id, dtype=np.int32))

            if prompt_table_extra_id is not None:
                prompt_table_extra_ids_tensor = pb_utils.Tensor(
                    'OUT_PROMPT_TABLE_EXTRA_IDS',
                    np.array(prompt_table_extra_ids,
                             dtype=self.out_prompt_table_extra_ids_dtype))
                inference_response = pb_utils.InferenceResponse(output_tensors=[
                    input_id_tensor, decoder_input_id_tensor,
                    bad_words_ids_tensor, stop_words_ids_tensor,
                    request_input_len_tensor, request_decoder_input_len_tensor,
                    request_output_len_tensor, embedding_bias_tensor,
                    end_id_tensor, pad_id_tensor, prompt_table_extra_ids_tensor
                ] + vision_processed_tensors)
            else:
                inference_response = pb_utils.InferenceResponse(
                    output_tensors=[
                        input_id_tensor, decoder_input_id_tensor,
                        bad_words_ids_tensor, stop_words_ids_tensor,
                        request_input_len_tensor,
                        request_decoder_input_len_tensor,
                        request_output_len_tensor, embedding_bias_tensor,
                        end_id_tensor, pad_id_tensor
                    ] + vision_processed_tensors)
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')

    def _split_prompt_by_images(self,
                                concatenated_ids,
                                image_token_index=-200):
        """
        Splits tokenized prompts by image placeholders for each sample in the batch.

        Args:
            concatenated_ids (np.ndarray): A batch of concatenated token IDs, where image placeholders are indicated by `image_token_index`.

        Returns:
            List[List[np.ndarray]]: A list containing lists of token ID arrays for each prompt segment, per batch sample.
        """
        batch_splits = []
        for batch in concatenated_ids:
            zero_indices = np.where(batch == image_token_index)[0]
            start_idx = 0
            splits = []
            for idx in zero_indices:
                if start_idx != idx:
                    splits.append(batch[start_idx:idx].reshape(1, -1))
                start_idx = idx + 1
            if start_idx < len(batch):
                splits.append(batch[start_idx:].reshape(1, -1))

            splits = [split for split in splits if split.size > 0]
            batch_splits.append(splits)

        return batch_splits

    def _setup_fake_prompts(self, batch_size, batch_split_prompts):
        """
        Replaces image placeholders with unique fake prompt IDs for multi-image inputs.

        Args:
            batch_size (int): The number of samples in the batch.
            batch_split_prompts (List[List[np.ndarray]]): Tokenized prompt segments for each batch sample.

        Returns:
            np.ndarray: An array of input IDs with image placeholders replaced by fake prompt IDs.
        """

        num_visual_features = self.ptable_shape[1]
        input_ids_list = []

        for batch_idx in range(batch_size):
            splits = batch_split_prompts[batch_idx]
            sample_input_ids = [splits[0]]
            sample_fake_prompt_counter = self.vocab_size

            for split_idx in range(len(splits) - 1):
                fake_prompt_id = np.arange(
                    sample_fake_prompt_counter,
                    sample_fake_prompt_counter + num_visual_features)
                sample_fake_prompt_counter += num_visual_features
                fake_prompt_id = np.expand_dims(fake_prompt_id, axis=0)
                sample_input_ids.append(fake_prompt_id)
                sample_input_ids.append(splits[split_idx + 1])

            sample_input_ids = np.concatenate(sample_input_ids, axis=1)
            input_ids_list.append(sample_input_ids)

        # Pad the input_ids to the same length for bs > 1
        max_seq_len = max(
            [sample_input_ids.shape[1] for sample_input_ids in input_ids_list])
        input_ids_padded = []
        for sample_input_ids in input_ids_list:
            seq_len = sample_input_ids.shape[1]
            pad_width = max_seq_len - seq_len
            if pad_width > 0:
                sample_input_ids_padded = np.pad(
                    sample_input_ids, ((0, 0), (0, pad_width)),
                    'constant',
                    constant_values=self.tokenizer_pad_id)
            else:
                sample_input_ids_padded = sample_input_ids
            input_ids_padded.append(sample_input_ids_padded)

        input_ids = np.stack(input_ids_padded)
        input_ids = input_ids.reshape(batch_size, -1).astype(np.int32)

        return input_ids

    def _process_multi_image_inputs(self, query, image_token_index=-200):
        """
        Processes input queries that contain multiple images by tokenizing the input strings and inserting image_token_index between the parts.

        Args:
            query (np.ndarray): Batch of input strings.

        Returns:
            List[np.ndarray]: List of tokenized input IDs for each sample.
        """
        start_ids = []
        for s in query:
            parts = s[0].decode().split('<image>')
            num_images = len(parts) - 1
            if num_images > self.max_num_images:
                raise ValueError(
                    f"The number of images in the request ({num_images}) exceeds the maximum allowed ({self.max_num_images})."
                )
            tokenized_parts = [
                self.tokenizer.encode(part, add_special_tokens=False)
                for part in parts
            ]

            # Insert `image_token_index` between the parts to represent <image>
            final_ids = []
            for i, part in enumerate(tokenized_parts):
                final_ids.extend(part)
                if i < len(tokenized_parts) - 1:
                    final_ids.append(image_token_index)

            start_ids.append(np.array(final_ids).astype(int))

        return start_ids

    def _create_request(self, query, visual_tokens=None):
        """
        query : batch string (2D numpy array)
        """
        if isinstance(self.tokenizer, T5Tokenizer):
            start_ids = [
                np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
                    s[0].decode(), add_special_tokens=self.add_special_tokens)
                         ).astype(int) for s in query
            ]
        else:
            if self.is_multimodal and self.max_num_images and self.max_num_images > 1:
                start_ids = self._process_multi_image_inputs(query)

            else:
                start_ids = [
                    np.array(
                        self.tokenizer.encode(s[0].decode(),
                                              add_special_tokens=self.
                                              add_special_tokens)).astype(int)
                    for s in query
                ]

        if self.is_multimodal:
            if 'blip2' in self.model_type or 'mllama' == self.model_type:
                pre_prompt = None
                post_prompt = None
            elif 'llava' == self.model_type:
                pre_prompt = "USER:\n"
                post_prompt = " ASSISTANT:"
            elif 'vila' == self.model_type:
                pre_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: "
                post_prompt = " ASSISTANT:"
            elif 'llava_onevision' == self.model_type:
                pre_prompt = "<|im_start|>user "
                post_prompt = "<|im_end|><|im_start|>assistant\n"

            pre_prompt_id = np.array(
                self.tokenizer.encode(
                    pre_prompt,
                    add_special_tokens=self.add_special_tokens,
                    padding=True)) if pre_prompt is not None else np.array(
                        [], dtype=int)

            post_prompt_id = np.array(
                self.tokenizer.encode(
                    post_prompt,
                    add_special_tokens=self.add_special_tokens,
                    padding=True)) if post_prompt is not None else np.array(
                        [], dtype=int)

            if self.max_num_images and self.max_num_images > 1:
                concatenated_ids = [
                    np.concatenate((pre_prompt_id, ids, post_prompt_id),
                                   axis=0) for ids in start_ids
                ]
                batch_split_prompts = self._split_prompt_by_images(
                    concatenated_ids)
                start_ids = self._setup_fake_prompts(query.shape[0],
                                                     batch_split_prompts)
            elif self.model_type == 'llava_onevision':
                fake_prompt_ids = []
                extra_id = np.array(
                    self.tokenizer.encode(
                        '\n',
                        add_special_tokens=self.add_special_tokens,
                        padding=True))
                for tokens in visual_tokens:
                    prompt_id = np.arange(self.vocab_size,
                                          self.vocab_size + tokens)
                    fake_prompt_ids.append(prompt_id)
                start_ids = [
                    np.concatenate((pre_prompt_id, prompt_id, extra_id, ids,
                                    post_prompt_id),
                                   axis=0)
                    for prompt_id, ids in zip(fake_prompt_ids, start_ids)
                ]
            else:
                fake_prompt_id = np.arange(
                    self.vocab_size, self.vocab_size + self.ptable_shape[1])
                start_ids = [
                    np.concatenate(
                        (pre_prompt_id, fake_prompt_id, ids, post_prompt_id),
                        axis=0) for ids in start_ids
                ]

        start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)

        max_len = 0
        for seq in start_ids:
            max_len = max(max_len, seq.shape[0])
        start_ids = np.stack([
            np.pad(seq, (0, max_len - seq.shape[0]),
                   'constant',
                   constant_values=(0, self.tokenizer_pad_id))
            for seq in start_ids
        ])

        return start_ids, start_lengths

    def _to_word_list_format(self, word_lists: List[List[str | bytes]],
                             batch_size):
        '''
        word_lists format:
            len(word_lists) == batch_size
            word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
        '''
        assert self.tokenizer != None, "need to set tokenizer"

        if word_lists is None:
            # Return an empty array of shape (1,2,0)
            return np.empty([batch_size, 2, 0], dtype="int32")

        flat_ids = []
        offsets = []
        for word_list in word_lists:
            item_flat_ids = []
            item_offsets = []

            for word in word_list:
                if isinstance(word, bytes):
                    word = word.decode()

                ids = self.tokenizer.encode(word, add_special_tokens=False)
                if len(ids) == 0:
                    continue

                item_flat_ids += ids
                item_offsets.append(len(ids))

            flat_ids.append(np.array(item_flat_ids))
            offsets.append(np.cumsum(np.array(item_offsets)))

        pad_to = max(1, max(len(ids) for ids in flat_ids))

        for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
            flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
                                 constant_values=0)
            offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
                                constant_values=-1)

        return np.array([flat_ids, offsets], dtype="int32").transpose(
            (1, 0, 2))

    def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
                            bias_dtype, batch_size):

        assert self.tokenizer != None, "need to set tokenizer"

        if embedding_bias_words is None or embedding_bias_weights is None:
            return np.empty([batch_size, 0],
                            dtype=self.embedding_bias_weights_dtype)

        batch_embedding_bias = []
        for words, weights in zip(embedding_bias_words,
                                  embedding_bias_weights):

            vocab_size = len(self.tokenizer.vocab)
            embedding_bias = [0.] * vocab_size

            assert len(words) == len(
                weights
            ), "Embedding bias words must have same dimension as embedding bias weights"

            for word, weight in zip(words, weights):
                if isinstance(word, bytes):
                    word = word.decode()
                ids = self.tokenizer.encode(word)

                if len(ids) == 0:
                    continue

                for id in ids:
                    embedding_bias[id] += weight

            batch_embedding_bias.append(np.array(embedding_bias))

        return np.array(batch_embedding_bias, dtype=bias_dtype)


class VisionPreProcessor:
    """ A class that can load images from url requests, and process them via a vision model processor,
    in preparation for the vision encoder.
    """

    def __init__(self,
                 vision_model_type,
                 vision_model_processor,
                 preprocessor_model_config={}):
        # import libraries that are only relevant for multimodal models
        import torch
        from torch.utils.dlpack import from_dlpack

        # NOTE: Due to the behavior of MPI initialization, it is recommended to avoid using import tensorrt_llm
        # except for the specific modules tensorrt_llm and multimodal_encoders.
        # As a result, the function str_dtype_to_torch has been copied directly from tensorrt_llm._utils.
        _str_to_torch_dtype_dict = dict(
            bfloat16=torch.bfloat16,
            float16=torch.float16,
            float32=torch.float32,
            int64=torch.int64,
            int32=torch.int32,
            int8=torch.int8,
            bool=torch.bool,
            fp8=torch.float8_e4m3fn,
        )

        def str_dtype_to_torch(dtype):
            ret = _str_to_torch_dtype_dict.get(dtype)
            assert ret is not None, f'Unsupported dtype: {dtype}'
            return ret

        self.load_images_tensor = lambda tensor: tensor if not hasattr(
            tensor, 'to_dlpack') else from_dlpack(tensor.to_dlpack())

        # extract expected output tensor dtype
        self.output_str_dtypes = {}
        for properties in preprocessor_model_config.get('output', []):
            dtype = properties['data_type']
            self.output_str_dtypes[properties['name']] = np.dtype(
                pb_utils.triton_string_to_numpy(dtype)).name

        # create method for converting output tensors batch to the expected type
        self.convert_tensor_list_to_tensor = lambda tensor_list: torch.concat(
            [
                torch.from_numpy(x) if isinstance(x, np.ndarray) else x
                for x in tensor_list
            ],
            dim=0)
        self.convert_tensor_to_str_dtype = lambda tensor, dtype: tensor.to(
            str_dtype_to_torch(dtype))

        # create model-specific processor
        self.vision_model_processor = vision_model_processor
        self.vision_model_type = vision_model_type

    def load_images_from_urls(self, img_urls):
        images = []
        for img_url in img_urls:
            img_url = img_url.decode()
            if img_url.startswith("data:image/jpeg;base64,"):
                image_base64 = img_url.split(",")[1]
                # Decode the base64 string
                image_data = base64.b64decode(image_base64)
                # Create a BytesIO object from the decoded data
                image_buffer = io.BytesIO(image_data)
                images.append(Image.open(image_buffer))
            else:
                images.append(
                    Image.open(requests.get(img_url, stream=True).raw))
        return images

    def mllama_process(self, queries, img_urls=None, image_bytes=None):
        vision_processed_tensors = {}
        if img_urls is not None or image_bytes is not None:
            if img_urls is not None:
                # download and read images
                images = [
                    self.load_images_from_urls(urls)
                    for urls in img_urls.as_numpy()
                ]
            else:
                images = [
                    img for img_list in self.load_images_tensor(image_bytes)
                    for img in img_list
                ]

            batch_size = len(images)

            preprocessor_outputs = {}
            possible_output_names = [
                'PIXEL_VALUES', 'ASPECT_RATIO_IDS', 'ASPECT_RATIO_MASK',
                'CROSS_ATTENTION_MASK'
            ]
            for batch_id in range(batch_size):
                # Preprocess images and query
                processed_vision_data = self.vision_model_processor(
                    images=images[batch_id],
                    text=queries[batch_id],
                    return_tensors="pt")

                # Reshape pixel_values to [num_images, *HWC/CHW]
                val = processed_vision_data["pixel_values"]

                val = val.reshape(1, -1, *(val.shape[-3:]))
                processed_vision_data["pixel_values"] = val
                # Create vision output tensors
                for key in possible_output_names:
                    val = processed_vision_data.get(key.lower())
                    if val is not None:
                        if key not in preprocessor_outputs:
                            preprocessor_outputs[key] = []
                        preprocessor_outputs[key].append(val)

            for key, tensor_list in preprocessor_outputs.items():
                val = self.convert_tensor_list_to_tensor(tensor_list)
                if key in self.output_str_dtypes:
                    val = self.convert_tensor_to_str_dtype(
                        val, self.output_str_dtypes[key])
                vision_processed_tensors[key] = val
        return vision_processed_tensors

    def llava_onevision_process_image(self,
                                      queries,
                                      img_urls=None,
                                      image_bytes=None):

        import torch
        vision_processed_tensors = {}
        if img_urls is not None:
            # download and read images
            images = [
                self.load_images_from_urls(urls)
                for urls in img_urls.as_numpy()
            ]
        else:
            images = [
                img for img_list in self.load_images_tensor(image_bytes)
                for img in img_list
            ]

        batch_size = len(images)
        assert len(
            queries
        ) == batch_size, f"Image must have the same batch size as Query."
        preprocessor_outputs = {}
        possible_output_names = ['PIXEL_VALUES', 'IMAGE_SIZES']
        visual_tokens = []
        for batch_id in range(batch_size):
            # Preprocess images and query
            processed_vision_data = self.vision_model_processor(
                images=images[batch_id], text='<image>', return_tensors="pt")
            visual_tokens.append(processed_vision_data['input_ids'].shape[1])

            # Create vision output tensors
            for key in possible_output_names:
                val = processed_vision_data.get(key.lower())
                if val is not None:
                    if key not in preprocessor_outputs:
                        preprocessor_outputs[key] = []
                    preprocessor_outputs[key].append(val)

        max_patch = max(x.shape[1]
                        for x in preprocessor_outputs['PIXEL_VALUES'])
        preprocessor_outputs['PIXEL_VALUES'] = [
            torch.nn.functional.pad(
                image, (0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[1], 0, 0),
                mode='constant')
            for image in preprocessor_outputs['PIXEL_VALUES']
        ]
        for key, tensor_list in preprocessor_outputs.items():
            val = self.convert_tensor_list_to_tensor(tensor_list)
            if key in self.output_str_dtypes:
                val = self.convert_tensor_to_str_dtype(
                    val, self.output_str_dtypes[key])
            vision_processed_tensors[key] = val
        return vision_processed_tensors, visual_tokens

    def llava_onevision_process_video(self, queries, video_bytes=None):
        import torch
        vision_processed_tensors = {}
        videos = [video for video in self.load_images_tensor(video_bytes)]

        batch_size = len(videos)
        assert len(
            queries
        ) == batch_size, f"Video must have the same batch size as Query."
        preprocessor_outputs = {}
        preprocessor_outputs['PIXEL_VALUES'] = []
        preprocessor_outputs['IS_VIDEO_INPUT'] = []
        visual_tokens = []
        for batch_id in range(len(queries)):
            processed_vision_data = self.vision_model_processor(
                videos=list(videos[batch_id]),
                text='<video>',
                return_tensors="pt")
            visual_tokens.append(processed_vision_data['input_ids'].shape[1])
            preprocessor_outputs['PIXEL_VALUES'].append(
                processed_vision_data['pixel_values_videos'])
            preprocessor_outputs['IS_VIDEO_INPUT'].append(
                torch.ones((1, 1), dtype=torch.bool))

        for key, tensor_list in preprocessor_outputs.items():
            val = self.convert_tensor_list_to_tensor(tensor_list)
            if key in self.output_str_dtypes:
                val = self.convert_tensor_to_str_dtype(
                    val, self.output_str_dtypes[key])
            vision_processed_tensors[key] = val
        return vision_processed_tensors, visual_tokens
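Side note (not part of the uploaded file): the packed layout that _to_word_list_format above produces for BAD_WORDS_IDS / STOP_WORDS_IDS can be hard to picture, so here is a minimal standalone sketch of it. The toy_encode function is a hypothetical stand-in for the HF tokenizer used in the model; everything else mirrors the packing logic shown above.

import numpy as np

def toy_encode(word):
    # hypothetical tokenizer: one id per character, for illustration only
    return [ord(c) for c in word]

def to_word_list_format(word_lists, batch_size):
    flat_ids, offsets = [], []
    for word_list in word_lists:
        item_flat_ids, item_offsets = [], []
        for word in word_list:
            ids = toy_encode(word)
            item_flat_ids += ids
            item_offsets.append(len(ids))
        flat_ids.append(np.array(item_flat_ids))
        offsets.append(np.cumsum(np.array(item_offsets)))
    pad_to = max(1, max(len(ids) for ids in flat_ids))
    for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
        flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
        offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)
    # row 0: concatenated token ids, row 1: cumulative end offsets (-1 marks padding)
    return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))

packed = to_word_list_format([["bad", "words"], ["stop"]], batch_size=2)
print(packed.shape)  # (2, 2, 8): [batch, ids/offsets, pad_to]

The same [batch, 2, pad_to] tensor is what the BAD_WORDS_IDS and STOP_WORDS_IDS outputs declared in the config below carry downstream to the tensorrt_llm model.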
preprocessing/config.pbtxt
ADDED
@@ -0,0 +1,240 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "preprocessing"
backend: "python"
max_batch_size: 32


input [
    {
        name: "QUERY"
        data_type: TYPE_STRING
        dims: [ 1 ]
    },
    {
        name: "DECODER_QUERY"
        data_type: TYPE_STRING
        dims: [ 1 ]
        optional: true
    },
    {
        name: "IMAGE_BYTES"
        data_type: TYPE_UINT8
        dims: [ -1, -1, -1, -1 ]
        optional: true
    },
    {
        name: "IMAGE_URL"
        data_type: TYPE_STRING
        dims: [ 1 ]
        optional: true
    },
    {
        name: "VIDEO_BYTES"
        data_type: TYPE_UINT8
        dims: [ -1, -1, -1, -1 ]
        optional: true
    },
    {
        name: "REQUEST_OUTPUT_LEN"
        data_type: TYPE_INT32
        dims: [ 1 ]
    },
    {
        name: "BAD_WORDS_DICT"
        data_type: TYPE_STRING
        dims: [ -1 ]
        optional: true
    },
    {
        name: "STOP_WORDS_DICT"
        data_type: TYPE_STRING
        dims: [ -1 ]
        optional: true
    },
    {
        name: "EMBEDDING_BIAS_WORDS"
        data_type: TYPE_STRING
        dims: [ -1 ]
        optional: true
    },
    {
        name: "EMBEDDING_BIAS_WEIGHTS"
        data_type: TYPE_FP32
        dims: [ -1 ]
        optional: true
    },
    {
        name: "END_ID"
        data_type: TYPE_INT32
        dims: [ 1 ]
        optional: true
    },
    {
        name: "PAD_ID"
        data_type: TYPE_INT32
        dims: [ 1 ]
        optional: true
    },
    {
        name: "PROMPT_TABLE_EXTRA_ID"
        data_type: TYPE_UINT64
        dims: [ 1 ]
        optional: true
    }
]
output [
    {
        name: "INPUT_ID"
        data_type: TYPE_INT32
        dims: [ -1 ]
    },
    {
        name: "REQUEST_INPUT_LEN"
        data_type: TYPE_INT32
        dims: [ 1 ]
    },
    {
        name: "DECODER_INPUT_ID"
        data_type: TYPE_INT32
        dims: [ -1 ]
    },
    {
        name: "REQUEST_DECODER_INPUT_LEN"
        data_type: TYPE_INT32
        dims: [ 1 ]
    },
    {
        name: "BAD_WORDS_IDS"
        data_type: TYPE_INT32
        dims: [ 2, -1 ]
    },
    {
        name: "STOP_WORDS_IDS"
        data_type: TYPE_INT32
        dims: [ 2, -1 ]
    },
    {
        name: "EMBEDDING_BIAS"
        data_type: TYPE_FP32
        dims: [ -1 ]
    },
    {
        name: "REQUEST_OUTPUT_LEN"
        data_type: TYPE_INT32
        dims: [ -1 ]
    },
    {
        name: "OUT_END_ID"
        data_type: TYPE_INT32
        dims: [ 1 ]
    },
    {
        name: "OUT_PAD_ID"
        data_type: TYPE_INT32
        dims: [ 1 ]
    },
    {
        name: "OUT_PROMPT_TABLE_EXTRA_IDS"
        data_type: TYPE_UINT64
        dims: [ -1 ]
    },
    {
        name: "PIXEL_VALUES"
        data_type: TYPE_FP16
        dims: [ -1, -1, -1, -1 ]
    },
    {
        name: "ASPECT_RATIO_IDS"
        data_type: TYPE_INT64
        dims: [ -1 ]
    },
    {
        name: "ASPECT_RATIO_MASK"
        data_type: TYPE_INT64
        dims: [ -1, -1 ]
    },
    {
        name: "CROSS_ATTENTION_MASK"
        data_type: TYPE_INT64
        dims: [ -1, -1, -1 ]
    },
    # Required for image postprocessing in the llava_onevision model
    {
        name: "IMAGE_SIZES"
        data_type: TYPE_INT64
        dims: [ 2 ]
    },
    # Indicates if the input is video in the llava_onevision model
    {
        name: "IS_VIDEO_INPUT"
        data_type: TYPE_BOOL
        dims: [ 1 ]
    }
]

parameters {
  key: "tokenizer_dir"
  value: {
    string_value: "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
  }
}

parameters {
  key: "add_special_tokens"
  value: {
    string_value: "False"
  }
}

parameters {
  key: "visual_model_path"
  value: {
    string_value: "${visual_model_path}"
  }
}

parameters: {
  key: "gpt_model_path"
  value: {
    string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
  }
}

parameters: {
  key: "max_num_images"
  value: {
    string_value: "${max_num_images}"
  }
}

instance_group [
    {
        count: 1
        kind: KIND_CPU
    }
]
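Illustrative client call (not part of the repository): a minimal sketch of how the "preprocessing" model defined above could be exercised on its own through Triton's HTTP API. It assumes a Triton server at localhost:8000 with this model repository loaded; only QUERY and REQUEST_OUTPUT_LEN are set, since every other input is declared optional.

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# TYPE_STRING inputs are sent as BYTES with an object-dtype numpy array
query = httpclient.InferInput("QUERY", [1, 1], "BYTES")
query.set_data_from_numpy(np.array([["What is Triton Inference Server?"]], dtype=object))

output_len = httpclient.InferInput("REQUEST_OUTPUT_LEN", [1, 1], "INT32")
output_len.set_data_from_numpy(np.array([[128]], dtype=np.int32))

result = client.infer("preprocessing", inputs=[query, output_len])
print(result.as_numpy("INPUT_ID"))           # tokenized prompt ids
print(result.as_numpy("REQUEST_INPUT_LEN"))  # prompt length

In the normal flow these outputs are not consumed by a client directly; the ensemble and tensorrt_llm_bls models route them into the tensorrt_llm engine.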
tensorrt_llm/1/.gitkeep
ADDED
File without changes
tensorrt_llm/1/config.json
ADDED
@@ -0,0 +1,362 @@
{
    "version": "0.18.0.dev2025020400",
    "pretrained_config": {
        "mlp_bias": false,
        "attn_bias": false,
        "rotary_base": 500000.0,
        "rotary_scaling": {
            "factor": 8.0,
            "high_freq_factor": 4.0,
            "low_freq_factor": 1.0,
            "original_max_position_embeddings": 8192,
            "rope_type": "llama3"
        },
        "residual_mlp": false,
        "disable_weight_only_quant_plugin": false,
        "moe": {
            "num_experts": 0,
            "shared_expert_intermediate_size": 0,
            "top_k": 0,
            "normalization_mode": null,
            "sparse_mixer_epsilon": 0.01,
            "tp_mode": 0,
            "device_limited_n_group": 0,
            "device_limited_topk_group": 0,
            "device_limited_routed_scaling_factor": 1.0
        },
        "remove_duplicated_kv_heads": false,
        "fc_after_embed": false,
        "use_input_layernorm_in_first_layer": true,
        "use_last_layernorm": true,
        "layer_idx_offset": 0,
        "embedding_multiplier": 1.0,
        "attention_multiplier": 1.0,
        "residual_multiplier": 1.0,
        "output_multiplier_scale": 1.0,
        "has_partial_lora_mask": false,
        "architecture": "LlamaForCausalLM",
        "dtype": "float16",
        "vocab_size": 128256,
        "hidden_size": 8192,
        "num_hidden_layers": 80,
        "num_attention_heads": 64,
        "hidden_act": "silu",
        "logits_dtype": "float16",
        "norm_epsilon": 1e-05,
        "runtime_defaults": null,
        "position_embedding_type": "rope_gpt_neox",
        "num_key_value_heads": 8,
        "intermediate_size": 28672,
        "max_position_embeddings": 131072,
        "mapping": {
            "world_size": 2,
            "gpus_per_node": 8,
            "cp_size": 1,
            "tp_size": 2,
            "pp_size": 1,
            "moe_tp_size": 2,
            "moe_ep_size": 1,
            "auto_parallel": false
        },
        "quantization": {
            "quant_algo": "FP8",
            "kv_cache_quant_algo": "FP8",
            "group_size": 128,
            "smoothquant_val": 0.5,
            "clamp_val": null,
            "use_meta_recipe": false,
            "has_zero_point": false,
            "pre_quant_scale": false,
            "exclude_modules": [
                "transformer.layers.33.input_layernorm",
                "transformer.layers.58.post_layernorm",
                "transformer.layers.43.post_layernorm",
                "transformer.layers.45.input_layernorm",
                "transformer.layers.8.post_layernorm",
                "transformer.layers.79.input_layernorm",
                "transformer.layers.70.post_layernorm",
                "transformer.layers.73.input_layernorm",
                "transformer.layers.19.input_layernorm",
                "transformer.layers.46.input_layernorm",
                "transformer.layers.48.input_layernorm",
                "transformer.layers.67.post_layernorm",
                "transformer.layers.12.input_layernorm",
                "transformer.layers.60.post_layernorm",
                "transformer.layers.17.post_layernorm",
                "transformer.layers.57.input_layernorm",
                "transformer.layers.0.input_layernorm",
                "transformer.layers.49.input_layernorm",
                "transformer.layers.4.post_layernorm",
                "transformer.layers.39.post_layernorm",
                "transformer.layers.73.post_layernorm",
                "transformer.layers.44.post_layernorm",
                "transformer.layers.13.input_layernorm",
                "transformer.layers.56.post_layernorm",
                "transformer.layers.62.post_layernorm",
                "transformer.layers.42.post_layernorm",
                "transformer.layers.27.input_layernorm",
                "transformer.layers.22.post_layernorm",
                "transformer.layers.77.input_layernorm",
                "transformer.layers.51.input_layernorm",
                "transformer.layers.21.post_layernorm",
                "transformer.layers.54.post_layernorm",
                "transformer.layers.22.input_layernorm",
                "transformer.layers.47.input_layernorm",
                "transformer.layers.15.input_layernorm",
                "transformer.layers.7.input_layernorm",
                "transformer.layers.63.input_layernorm",
                "transformer.layers.70.input_layernorm",
                "transformer.layers.5.input_layernorm",
                "transformer.layers.29.post_layernorm",
                "transformer.vocab_embedding",
                "transformer.layers.2.post_layernorm",
                "transformer.layers.11.post_layernorm",
                "transformer.layers.54.input_layernorm",
                "transformer.layers.45.post_layernorm",
                "transformer.layers.78.post_layernorm",
                "transformer.layers.23.post_layernorm",
                "transformer.layers.30.input_layernorm",
                "transformer.layers.58.input_layernorm",
                "transformer.layers.18.input_layernorm",
                "transformer.layers.3.input_layernorm",
                "transformer.layers.7.post_layernorm",
                "transformer.layers.77.post_layernorm",
                "transformer.layers.47.post_layernorm",
                "transformer.layers.38.input_layernorm",
                "transformer.layers.41.post_layernorm",
                "transformer.layers.55.post_layernorm",
                "transformer.layers.64.post_layernorm",
                "transformer.layers.57.post_layernorm",
                "transformer.layers.29.input_layernorm",
                "transformer.layers.28.input_layernorm",
                "transformer.layers.9.input_layernorm",
                "transformer.layers.43.input_layernorm",
                "transformer.layers.28.post_layernorm",
                "transformer.layers.52.post_layernorm",
                "transformer.layers.17.input_layernorm",
                "transformer.layers.19.post_layernorm",
                "transformer.layers.15.post_layernorm",
                "transformer.layers.25.post_layernorm",
                "transformer.layers.32.input_layernorm",
                "transformer.layers.76.post_layernorm",
                "transformer.layers.16.input_layernorm",
                "transformer.layers.75.post_layernorm",
                "transformer.layers.62.input_layernorm",
                "transformer.layers.50.input_layernorm",
                "transformer.layers.35.input_layernorm",
                "transformer.layers.59.input_layernorm",
                "transformer.layers.68.post_layernorm",
                "transformer.layers.40.post_layernorm",
                "transformer.layers.10.post_layernorm",
                "transformer.layers.50.post_layernorm",
                "transformer.layers.14.input_layernorm",
                "transformer.layers.61.post_layernorm",
                "transformer.layers.41.input_layernorm",
                "transformer.layers.3.post_layernorm",
                "transformer.layers.69.input_layernorm",
                "transformer.layers.2.input_layernorm",
                "transformer.layers.1.post_layernorm",
                "transformer.layers.14.post_layernorm",
                "transformer.layers.1.input_layernorm",
                "transformer.layers.53.input_layernorm",
                "transformer.layers.65.input_layernorm",
                "lm_head",
                "transformer.layers.32.post_layernorm",
                "transformer.layers.11.input_layernorm",
                "transformer.layers.59.post_layernorm",
                "transformer.layers.37.input_layernorm",
                "transformer.ln_f",
                "transformer.layers.4.input_layernorm",
                "transformer.layers.34.post_layernorm",
                "transformer.layers.78.input_layernorm",
                "transformer.layers.44.input_layernorm",
                "transformer.layers.48.post_layernorm",
                "transformer.layers.20.post_layernorm",
                "transformer.layers.49.post_layernorm",
                "transformer.layers.42.input_layernorm",
                "transformer.layers.66.post_layernorm",
                "transformer.layers.74.input_layernorm",
                "transformer.layers.20.input_layernorm",
                "transformer.layers.5.post_layernorm",
                "transformer.layers.69.post_layernorm",
                "transformer.layers.35.post_layernorm",
                "transformer.layers.56.input_layernorm",
                "transformer.layers.79.post_layernorm",
                "transformer.layers.31.post_layernorm",
                "transformer.layers.60.input_layernorm",
                "transformer.layers.36.post_layernorm",
                "transformer.layers.23.input_layernorm",
                "transformer.layers.26.post_layernorm",
                "transformer.layers.66.input_layernorm",
                "transformer.layers.68.input_layernorm",
                "transformer.layers.52.input_layernorm",
                "transformer.layers.72.input_layernorm",
                "transformer.layers.26.input_layernorm",
                "transformer.layers.9.post_layernorm",
                "transformer.layers.71.post_layernorm",
                "transformer.layers.72.post_layernorm",
                "transformer.layers.18.post_layernorm",
                "transformer.layers.6.input_layernorm",
                "transformer.layers.33.post_layernorm",
                "transformer.layers.51.post_layernorm",
                "transformer.layers.76.input_layernorm",
                "transformer.layers.64.input_layernorm",
                "transformer.layers.16.post_layernorm",
                "transformer.layers.25.input_layernorm",
                "transformer.layers.0.post_layernorm",
                "transformer.layers.38.post_layernorm",
                "transformer.layers.63.post_layernorm",
                "transformer.layers.12.post_layernorm",
                "transformer.layers.30.post_layernorm",
                "transformer.layers.67.input_layernorm",
                "transformer.layers.46.post_layernorm",
                "transformer.layers.24.input_layernorm",
                "transformer.layers.53.post_layernorm",
                "transformer.layers.74.post_layernorm",
                "transformer.layers.71.input_layernorm",
                "transformer.layers.55.input_layernorm",
                "transformer.layers.6.post_layernorm",
                "transformer.layers.40.input_layernorm",
                "transformer.layers.13.post_layernorm",
                "transformer.layers.27.post_layernorm",
                "transformer.layers.8.input_layernorm",
                "transformer.layers.24.post_layernorm",
                "transformer.layers.37.post_layernorm",
                "transformer.layers.61.input_layernorm",
                "transformer.layers.34.input_layernorm",
                "transformer.layers.36.input_layernorm",
                "transformer.layers.31.input_layernorm",
                "transformer.layers.65.post_layernorm",
                "transformer.layers.21.input_layernorm",
                "transformer.layers.39.input_layernorm",
                "transformer.layers.10.input_layernorm",
                "transformer.layers.75.input_layernorm"
            ]
        },
        "use_parallel_embedding": true,
        "embedding_sharding_dim": 0,
        "head_size": 128,
        "qk_layernorm": false,
        "rotary_embedding_dim": 128,
        "producer": {
            "name": "modelopt",
            "version": "0.23.0"
        },
        "share_embedding_table": false,
        "bias": false,
        "rotary_pct": 1.0,
        "rank": 1,
        "decoder": "llama",
        "rmsnorm": true,
        "lm_head_bias": false,
        "tie_word_embeddings": false,
        "model_type": "llama"
    },
    "build_config": {
        "max_input_len": 124000,
        "max_seq_len": 131072,
        "opt_batch_size": 8,
        "max_batch_size": 32,
        "max_beam_width": 1,
        "max_num_tokens": 128000,
        "opt_num_tokens": null,
        "max_prompt_embedding_table_size": 0,
        "kv_cache_type": "PAGED",
        "gather_context_logits": false,
        "gather_generation_logits": false,
        "strongly_typed": true,
        "force_num_profiles": null,
        "profiling_verbosity": "layer_names_only",
        "enable_debug_output": false,
        "max_draft_len": 0,
        "speculative_decoding_mode": 1,
        "use_refit": false,
        "input_timing_cache": null,
        "output_timing_cache": "model.cache",
        "lora_config": {
            "lora_dir": [],
            "lora_ckpt_source": "hf",
            "max_lora_rank": 64,
            "lora_target_modules": [],
            "trtllm_modules_to_hf_modules": {}
        },
        "auto_parallel_config": {
            "world_size": 1,
            "gpus_per_node": 8,
            "cluster_key": "H100-PCIe",
            "cluster_info": null,
            "sharding_cost_model": "alpha_beta",
            "comm_cost_model": "alpha_beta",
            "enable_pipeline_parallelism": false,
            "enable_shard_unbalanced_shape": false,
            "enable_shard_dynamic_shape": false,
            "enable_reduce_scatter": true,
            "builder_flags": null,
            "debug_mode": false,
            "infer_shape": true,
            "validation_mode": false,
            "same_buffer_io": {
                "past_key_value_(\\d+)": "present_key_value_\\1"
            },
            "same_spec_io": {},
            "sharded_io_allowlist": [
                "past_key_value_\\d+",
                "present_key_value_\\d*"
            ],
            "fill_weights": false,
            "parallel_config_cache": null,
            "profile_cache": null,
            "dump_path": null,
            "debug_outputs": []
        },
        "weight_sparsity": false,
        "weight_streaming": false,
        "plugin_config": {
            "dtype": "float16",
            "bert_attention_plugin": "auto",
            "gpt_attention_plugin": "auto",
            "gemm_plugin": "fp8",
            "explicitly_disable_gemm_plugin": false,
            "gemm_swiglu_plugin": null,
            "fp8_rowwise_gemm_plugin": null,
            "qserve_gemm_plugin": null,
            "identity_plugin": null,
            "nccl_plugin": "float16",
            "lora_plugin": null,
            "dora_plugin": false,
            "weight_only_groupwise_quant_matmul_plugin": null,
            "weight_only_quant_matmul_plugin": null,
            "smooth_quant_plugins": true,
            "smooth_quant_gemm_plugin": null,
            "layernorm_quantization_plugin": null,
            "rmsnorm_quantization_plugin": null,
            "quantize_per_token_plugin": false,
            "quantize_tensor_plugin": false,
            "moe_plugin": "auto",
            "mamba_conv1d_plugin": "auto",
            "low_latency_gemm_plugin": null,
            "low_latency_gemm_swiglu_plugin": null,
            "gemm_allreduce_plugin": null,
            "context_fmha": true,
            "bert_context_fmha_fp32_acc": false,
            "paged_kv_cache": true,
            "remove_input_padding": true,
            "reduce_fusion": false,
            "user_buffer": false,
            "tokens_per_block": 32,
            "use_paged_context_fmha": true,
            "use_fp8_context_fmha": true,
            "fuse_fp4_quant": false,
            "multiple_profiles": true,
            "paged_state": false,
            "streamingllm": false,
            "manage_weights": false,
            "use_fused_mlp": true,
            "pp_reduce_scatter": false
        },
        "use_strip_plan": false,
        "max_encoder_input_len": 1024,
        "monitor_memory": false,
        "use_mrope": false
    }
}
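Quick sanity check (illustrative only, not part of the repository): the engine config above fixes the tensor-parallel layout and the sequence budget. A short script like the following can read it back and confirm them; the path is an assumption, adjust it to wherever the engine directory lives.

import json

with open("tensorrt_llm/1/config.json") as f:
    cfg = json.load(f)

mapping = cfg["pretrained_config"]["mapping"]
build = cfg["build_config"]

# world_size=2 here, matching the two rank*.engine files (tp=2, pp=1)
print(f"world_size={mapping['world_size']} (tp={mapping['tp_size']}, pp={mapping['pp_size']})")

# With max_input_len=124000 and max_seq_len=131072, a full-length prompt
# still leaves 131072 - 124000 = 7072 tokens of generation headroom.
print("max new tokens at max input:", build["max_seq_len"] - build["max_input_len"])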
tensorrt_llm/1/model.py
ADDED
@@ -0,0 +1,1386 @@
import datetime
import json
import os
import sys
import time
from dataclasses import dataclass
from random import randint
from threading import Lock, Thread
from typing import Any, List

import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from torch import from_numpy
from torch.utils.dlpack import from_dlpack

import tensorrt_llm.bindings.executor as trtllm
from tensorrt_llm.llmapi.tokenizer import _xgrammar_tokenizer_info

METRIC_TOTAL_OUTPUT_TOKENS = "total_output_tokens"
METRIC_TOTAL_INPUT_TOKENS = "total_input_tokens"
import tensorrt_llm.logger as logger

# From https://github.com/pytorch/pytorch/blob/39425feac799905402abe4d15667fa47c344f2d7/torch/testing/_internal/common_utils.py#L1761
# Dict of NumPy dtype -> torch dtype (when the correspondence exists)
numpy_to_torch_dtype_dict = {
    np.bool_: torch.bool,
    np.uint8: torch.uint8,
    np.uint16: torch.uint16,
    np.uint32: torch.uint32,
    np.uint64: torch.uint64,
    np.int8: torch.int8,
    np.int16: torch.int16,
    np.int32: torch.int32,
    np.int64: torch.int64,
    np.float16: torch.float16,
    np.float32: torch.float32,
    np.float64: torch.float64,
    np.complex64: torch.complex64,
    np.complex128: torch.complex128
}

# Dict of torch dtype -> NumPy dtype
torch_to_numpy_dtype_dict = {
    value: key
    for (key, value) in numpy_to_torch_dtype_dict.items()
}
torch_to_numpy_dtype_dict.update({
    torch.bfloat16: np.float32,
    torch.complex32: np.complex64
})


@dataclass
class RequestData:
    triton_req_id: int
    triton_user_id: str
    batch_index: int
    batch_size: int
    num_return_sequences: int
    num_input_tokens: int
    num_output_tokens: int
    response_sender: Any


def mpi_comm():
    from mpi4py import MPI
    return MPI.COMM_WORLD


def mpi_rank():
    return mpi_comm().Get_rank()


def get_input_tensor_by_name(request,
                             name,
                             expected_batch_size=None,
                             batch_index=None,
                             force_on_torch=False):
    tensor = pb_utils.get_input_tensor_by_name(request, name)
    if tensor is None:
        return None

    if tensor.is_cpu() and not force_on_torch:
        tensor = tensor.as_numpy()
    else:
        tensor = from_dlpack(tensor.to_dlpack())

    if expected_batch_size is not None and tensor.shape[
            0] != expected_batch_size:
        raise pb_utils.TritonModelException(
            f"Expected batch size doesn't match batch size for tensor {name}. Expected {expected_batch_size} got {tensor.shape[0]}"
        )

    if batch_index is not None and expected_batch_size is not None and batch_index >= expected_batch_size:
        raise pb_utils.TritonModelException(
            f"Invalid batch index in get_input_tensor_by_name for {name}")

    if batch_index is not None:
        # Add leading 1 batch dimension
        if isinstance(tensor, np.ndarray):
            return np.expand_dims(tensor[batch_index], axis=0)
        elif isinstance(tensor, torch.Tensor):
            return torch.unsqueeze(tensor[batch_index], dim=0)
    else:
        return tensor


def get_input_scalar_by_name(request,
                             name,
                             expected_batch_size=1,
                             batch_index=0):
    tensor = pb_utils.get_input_tensor_by_name(request, name)
    if tensor is None:
        return None
    tensor = tensor.as_numpy()

    if tensor.size != expected_batch_size:
        raise pb_utils.TritonModelException(
            f"Expected a scalar tensor for tensor {name}")

    return tensor.item(batch_index)


def read_parameter_as_type(value, name, pytype=str):
    if value == "":
        return None
    if value.startswith("${") and value.endswith("}"):
        return None
    if pytype is bool:
        return value.lower() in ["1", "true"]
    try:
        result = pytype(value)
        return result
    except:
        pb_utils.Logger.log_warning(
            f"Could not read parameter '{name}' with value '{value}', will use default."
        )
        return None


def get_parameter(model_config, name, pytype=str):
    if name not in model_config['parameters']:
        return None
    return read_parameter_as_type(
        model_config['parameters'][name]['string_value'], name, pytype)


def convert_word_list(word_list):
    if word_list is None:
        return None
    word_list = word_list.tolist()
    if len(word_list) == 0 or len(word_list[0]) != 2:
        raise pb_utils.TritonModelException(f"Invalid format for word list.")
    words, indices = word_list[0]
    result = []
    current_index = 0
    for i in indices:
        if i == -1:
            continue
        if i > len(words):
            raise pb_utils.TritonModelException(
                f"Invalid format for word list.")
        current_word = []
        while current_index < i:
            current_word.append(words[current_index])
            current_index += 1
        result.append(current_word)
    return result


def parse_medusa_choices(medusa_choices):
    if medusa_choices is None:
        return None
    try:
        result = json.loads(
            "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
        assert isinstance(result, list) and len(result) > 0
        assert all([isinstance(x, list) for x in result])
        assert all([isinstance(y, int) for x in result for y in x])
    except Exception:
        raise pb_utils.TritonModelException(
            "Invalid format for medusa_choices")
    return result


def parse_eagle_choices(eagle_choices):
    return parse_medusa_choices(eagle_choices)


def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
    kwargs = {}
    kwargs['beam_width'] = get_input_scalar_by_name(
        request, 'beam_width', batch_size, batch_index) or 1
    kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k',
                                               batch_size, batch_index)
    kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p',
                                               batch_size, batch_index)
    kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[
        'top_p'] <= 0 else kwargs['top_p']
    kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed',
                                                     batch_size, batch_index)
    kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature',
                                                     batch_size, batch_index)
    kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length',
                                                    batch_size, batch_index)
    kwargs['repetition_penalty'] = get_input_scalar_by_name(
        request, 'repetition_penalty', batch_size, batch_index)
    kwargs['presence_penalty'] = get_input_scalar_by_name(
        request, 'presence_penalty', batch_size, batch_index)
    kwargs['frequency_penalty'] = get_input_scalar_by_name(
        request, 'frequency_penalty', batch_size, batch_index)
    kwargs['length_penalty'] = get_input_scalar_by_name(
        request, 'len_penalty', batch_size, batch_index)
    kwargs['top_p_min'] = get_input_scalar_by_name(request,
                                                   'runtime_top_p_min',
                                                   batch_size, batch_index)
    kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
        request, 'runtime_top_p_reset_ids', batch_size, batch_index)
    kwargs['top_p_decay'] = get_input_scalar_by_name(request,
                                                     'runtime_top_p_decay',
                                                     batch_size, batch_index)
    kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
        request, 'beam_search_diversity_rate', batch_size, batch_index)
    kwargs['early_stopping'] = get_input_scalar_by_name(
        request, 'early_stopping', batch_size, batch_index)
    kwargs['num_return_sequences'] = get_input_scalar_by_name(
        request, 'num_return_sequences', batch_size, batch_index) or 1
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    return trtllm.SamplingConfig(**kwargs)


def get_output_config_from_request(request, batch_size=1, batch_index=0):
    kwargs = {}
    kwargs["return_log_probs"] = get_input_scalar_by_name(
        request, 'return_log_probs', batch_size, batch_index)
    kwargs["return_context_logits"] = get_input_scalar_by_name(
        request, 'return_context_logits', batch_size, batch_index)
    kwargs["return_generation_logits"] = get_input_scalar_by_name(
        request, 'return_generation_logits', batch_size, batch_index)
    kwargs["return_perf_metrics"] = get_input_scalar_by_name(
        request, 'return_kv_cache_reuse_stats', batch_size, batch_index)
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    return trtllm.OutputConfig(**kwargs)


def get_external_draft_tokens_config_from_request(request,
                                                  batch_size=1,
                                                  batch_index=0):
    kwargs = {}
    draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids',
                                               batch_size, batch_index)
    if draft_input_ids is not None:
        kwargs['tokens'] = draft_input_ids[0].tolist()
    draft_logits = get_input_tensor_by_name(request, 'draft_logits',
                                            batch_size, batch_index)
    if draft_logits is not None:
        kwargs['logits'] = from_numpy(draft_logits).squeeze(dim=0)
    kwargs['acceptance_threshold'] = get_input_scalar_by_name(
        request, 'draft_acceptance_threshold', batch_size, batch_index)
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    if len(kwargs) > 0:
        return trtllm.ExternalDraftTokensConfig(**kwargs)
    return None


def get_prompt_tuning_config_from_request(request,
                                          batch_size=1,
                                          batch_index=0,
                                          input_length=0):
    # prompt_vocab_size is unused by executor.
    kwargs = {}
    prompt_embedding_table = get_input_tensor_by_name(
        request, 'prompt_embedding_table', batch_size, batch_index)
    prompt_table_extra_ids = get_input_tensor_by_name(
        request, 'prompt_table_extra_ids', batch_size, batch_index)
    if prompt_embedding_table is not None:
        if isinstance(prompt_embedding_table, np.ndarray):
            kwargs["embedding_table"] = from_numpy(
                prompt_embedding_table).squeeze(dim=0)
        elif isinstance(prompt_embedding_table, torch.Tensor):
            kwargs["embedding_table"] = prompt_embedding_table.squeeze(dim=0)

    if prompt_table_extra_ids is not None:
        prompt_table_extra_ids = prompt_table_extra_ids[0].tolist()
        if len(prompt_table_extra_ids) != 0:
            kwargs["input_token_extra_ids"] = prompt_table_extra_ids[
                0:input_length]
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    if len(kwargs) > 0:
        return trtllm.PromptTuningConfig(**kwargs)
    return None


def get_lora_config_from_request(request, batch_size=1, batch_index=0):
    kwargs = {}
    kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id',
                                                 batch_size, batch_index)
    lora_weights = get_input_tensor_by_name(request, 'lora_weights',
|
300 |
+
batch_size, batch_index)
|
301 |
+
if lora_weights is not None:
|
302 |
+
kwargs["weights"] = from_numpy(lora_weights).squeeze(dim=0)
|
303 |
+
lora_config = get_input_tensor_by_name(request, 'lora_config', batch_size,
|
304 |
+
batch_index)
|
305 |
+
if lora_config is not None:
|
306 |
+
kwargs["config"] = from_numpy(lora_config).squeeze(dim=0)
|
307 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
308 |
+
if len(kwargs) > 0:
|
309 |
+
return trtllm.LoraConfig(**kwargs)
|
310 |
+
return None
|
311 |
+
|
312 |
+
|
313 |
+
def get_guided_decoding_params_from_request(request,
|
314 |
+
batch_size=1,
|
315 |
+
batch_index=0):
|
316 |
+
kwargs = {}
|
317 |
+
guided_decoding_guide_type = get_input_tensor_by_name(
|
318 |
+
request, 'guided_decoding_guide_type', batch_size, batch_index)
|
319 |
+
if guided_decoding_guide_type is not None:
|
320 |
+
guided_decoding_guide_type = guided_decoding_guide_type.squeeze(
|
321 |
+
axis=0)[0].decode()
|
322 |
+
guided_decoding_guide_type_mapping = {
|
323 |
+
"json": trtllm.GuidedDecodingParams.GuideType.JSON,
|
324 |
+
"json_schema": trtllm.GuidedDecodingParams.GuideType.JSON_SCHEMA,
|
325 |
+
"regex": trtllm.GuidedDecodingParams.GuideType.REGEX,
|
326 |
+
"ebnf_grammar": trtllm.GuidedDecodingParams.GuideType.EBNF_GRAMMAR
|
327 |
+
}
|
328 |
+
guided_decoding_guide_type = guided_decoding_guide_type_mapping.get(
|
329 |
+
guided_decoding_guide_type)
|
330 |
+
kwargs['guide_type'] = guided_decoding_guide_type
|
331 |
+
|
332 |
+
guided_decoding_guide = get_input_tensor_by_name(request,
|
333 |
+
'guided_decoding_guide',
|
334 |
+
batch_size, batch_index)
|
335 |
+
if guided_decoding_guide is not None:
|
336 |
+
kwargs['guide'] = guided_decoding_guide.squeeze(axis=0)[0].decode()
|
337 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
338 |
+
if len(kwargs) > 0:
|
339 |
+
return trtllm.GuidedDecodingParams(**kwargs)
|
340 |
+
return None
|
341 |
+
|
342 |
+
|
343 |
+
def get_kv_cache_retention_config_from_request(request,
|
344 |
+
batch_size=1,
|
345 |
+
batch_index=0):
|
346 |
+
|
347 |
+
def get_tensor_and_check_length(name: str, expected_length: int):
|
348 |
+
tensor = get_input_tensor_by_name(request, name, batch_size,
|
349 |
+
batch_index)
|
350 |
+
|
351 |
+
if tensor is None:
|
352 |
+
raise RuntimeError(f"{name} must be provided.")
|
353 |
+
|
354 |
+
tensor = np.squeeze(tensor, axis=0)
|
355 |
+
|
356 |
+
if len(tensor) != expected_length:
|
357 |
+
raise RuntimeError(
|
358 |
+
f"Invalid {name} length. Expected length {expected_length}, got length {len(tensor)}"
|
359 |
+
)
|
360 |
+
|
361 |
+
return tensor
|
362 |
+
|
363 |
+
token_range_starts = get_input_tensor_by_name(
|
364 |
+
request, "retention_token_range_starts", batch_size, batch_index)
|
365 |
+
|
366 |
+
if token_range_starts is not None:
|
367 |
+
token_range_starts = np.squeeze(token_range_starts, axis=0)
|
368 |
+
|
369 |
+
token_range_ends = get_tensor_and_check_length(
|
370 |
+
"retention_token_range_ends", len(token_range_starts))
|
371 |
+
token_range_ends = [
|
372 |
+
None if end == -1 else end for end in token_range_ends
|
373 |
+
]
|
374 |
+
|
375 |
+
token_range_priorities = get_tensor_and_check_length(
|
376 |
+
"retention_token_range_priorities", len(token_range_starts))
|
377 |
+
|
378 |
+
token_range_durations_ms = get_input_tensor_by_name(
|
379 |
+
request, "retention_token_range_durations_ms", batch_size,
|
380 |
+
batch_index)
|
381 |
+
|
382 |
+
if token_range_durations_ms is None:
|
383 |
+
token_range_durations_ms = [None] * len(token_range_starts)
|
384 |
+
else:
|
385 |
+
token_range_durations_ms = np.squeeze(token_range_durations_ms,
|
386 |
+
axis=0)
|
387 |
+
token_range_durations_ms = [
|
388 |
+
None if duration == -1 else duration
|
389 |
+
for duration in token_range_durations_ms
|
390 |
+
]
|
391 |
+
|
392 |
+
if len(token_range_durations_ms) != len(token_range_starts):
|
393 |
+
raise RuntimeError(
|
394 |
+
f"Invalid retention_token_range_durations length. Expected length {len(token_range_starts)}, got length {len(token_range_durations_ms)}"
|
395 |
+
)
|
396 |
+
|
397 |
+
ranges = []
|
398 |
+
|
399 |
+
for start, end, priority, duration_ms in zip(token_range_starts,
|
400 |
+
token_range_ends,
|
401 |
+
token_range_priorities,
|
402 |
+
token_range_durations_ms):
|
403 |
+
ranges.append(
|
404 |
+
trtllm.KvCacheRetentionConfig.TokenRangeRetentionConfig(
|
405 |
+
token_start=start,
|
406 |
+
token_end=end,
|
407 |
+
priority=priority.item(),
|
408 |
+
duration_ms=None if duration_ms is None else
|
409 |
+
datetime.timedelta(milliseconds=duration_ms.item())))
|
410 |
+
|
411 |
+
decode_args = {}
|
412 |
+
|
413 |
+
decode_priority = get_input_scalar_by_name(
|
414 |
+
request, "retention_decode_priority", batch_size, batch_index)
|
415 |
+
if decode_priority is not None:
|
416 |
+
decode_args['decode_retention_priority'] = decode_priority
|
417 |
+
|
418 |
+
decode_duration_ms = get_input_scalar_by_name(
|
419 |
+
request, "retention_decode_duration_ms", batch_size, batch_index)
|
420 |
+
if decode_duration_ms is not None:
|
421 |
+
decode_args[
|
422 |
+
'decode_duration_ms'] = decode_duration_ms if decode_duration_ms != -1 else None
|
423 |
+
|
424 |
+
return trtllm.KvCacheRetentionConfig(
|
425 |
+
token_range_retention_configs=ranges, **decode_args)
|
426 |
+
|
427 |
+
return None
|
428 |
+
|
429 |
+
|
430 |
+
def build_1_2_5_buckets(max_value: int) -> List[int]:
    """
    Builds a list of buckets with increasing powers of 10 multiplied by
    mantissa values (1, 5), starting from 10 until the value exceeds
    the specified maximum.

    Example:
    >>> build_1_2_5_buckets(1000)
    [10, 50, 100, 500, 1000]
    """
    mantissa_lst = [1, 5]
    exponent = 1  # Start from exponent 1 instead of 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1
|
451 |
+
|
452 |
+
|
453 |
+
def convert_request(request, exclude_input_from_output, decoupled):
|
454 |
+
inputs = {}
|
455 |
+
input_token_ids = get_input_tensor_by_name(request, 'input_ids')
|
456 |
+
if input_token_ids is None:
|
457 |
+
raise pb_utils.TritonModelException(
|
458 |
+
"A value is required for input_ids")
|
459 |
+
if len(input_token_ids.shape) != 2:
|
460 |
+
raise pb_utils.TritonModelException(f"Invalid format for input_ids")
|
461 |
+
batch_size = input_token_ids.shape[0]
|
462 |
+
requests = []
|
463 |
+
for batch_index in range(0, batch_size):
|
464 |
+
input_token_ids = get_input_tensor_by_name(request, 'input_ids',
|
465 |
+
batch_size, batch_index)[0]
|
466 |
+
if input_token_ids is None:
|
467 |
+
raise pb_utils.TritonModelException(
|
468 |
+
"A value is required for input_ids")
|
469 |
+
input_token_ids = input_token_ids.tolist()
|
470 |
+
if len(input_token_ids) == 0:
|
471 |
+
raise pb_utils.TritonModelException(
|
472 |
+
f"Invalid format for input_ids")
|
473 |
+
|
474 |
+
input_length = get_input_scalar_by_name(request, 'input_lengths',
|
475 |
+
batch_size, batch_index)
|
476 |
+
if input_length is None:
|
477 |
+
input_length = len(input_token_ids)
|
478 |
+
# Trim input token ids with input_lengths
|
479 |
+
inputs['input_token_ids'] = input_token_ids[0:input_length]
|
480 |
+
inputs['max_new_tokens'] = get_input_scalar_by_name(
|
481 |
+
request, 'request_output_len', batch_size, batch_index)
|
482 |
+
if inputs['max_new_tokens'] is None:
|
483 |
+
raise pb_utils.TritonModelException(
|
484 |
+
"A value is required for request_output_len")
|
485 |
+
inputs['streaming'] = get_input_scalar_by_name(request, 'streaming',
|
486 |
+
batch_size, batch_index)
|
487 |
+
if inputs['streaming'] and not decoupled:
|
488 |
+
raise pb_utils.TritonModelException(
|
489 |
+
"Streaming is only supported in decoupled mode.")
|
490 |
+
|
491 |
+
inputs['end_id'] = get_input_scalar_by_name(request, 'end_id',
|
492 |
+
batch_size, batch_index)
|
493 |
+
inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id',
|
494 |
+
batch_size, batch_index)
|
495 |
+
inputs['stop_words'] = convert_word_list(
|
496 |
+
get_input_tensor_by_name(request, 'stop_words_list', batch_size,
|
497 |
+
batch_index))
|
498 |
+
inputs['bad_words'] = convert_word_list(
|
499 |
+
get_input_tensor_by_name(request, 'bad_words_list', batch_size,
|
500 |
+
batch_index))
|
501 |
+
embedding_bias = get_input_tensor_by_name(request, 'embedding_bias',
|
502 |
+
batch_size, batch_index)
|
503 |
+
if embedding_bias is not None and embedding_bias.size != 0:
|
504 |
+
inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze(
|
505 |
+
dim=0)
|
506 |
+
|
507 |
+
sampling_config = get_sampling_config_from_request(
|
508 |
+
request, batch_size, batch_index)
|
509 |
+
output_config = get_output_config_from_request(request, batch_size,
|
510 |
+
batch_index)
|
511 |
+
req_exclude_input_from_output = get_input_scalar_by_name(
|
512 |
+
request, 'exclude_input_in_output', batch_size, batch_index)
|
513 |
+
if req_exclude_input_from_output is None:
|
514 |
+
# if request doesn't specify exclude_input_from_output, try to use the parameter
|
515 |
+
output_config.exclude_input_from_output = (
|
516 |
+
exclude_input_from_output
|
517 |
+
if exclude_input_from_output is not None else False)
|
518 |
+
else:
|
519 |
+
output_config.exclude_input_from_output = req_exclude_input_from_output
|
520 |
+
|
521 |
+
external_draft_tokens_config = get_external_draft_tokens_config_from_request(
|
522 |
+
request, batch_size, batch_index)
|
523 |
+
prompt_tuning_config = get_prompt_tuning_config_from_request(
|
524 |
+
request, batch_size, batch_index, input_length)
|
525 |
+
lora_config = get_lora_config_from_request(request, batch_size,
|
526 |
+
batch_index)
|
527 |
+
kv_cache_retention_config = get_kv_cache_retention_config_from_request(
|
528 |
+
request, batch_size, batch_index)
|
529 |
+
|
530 |
+
# Inputs for mllama support
|
531 |
+
encoder_input_features = get_input_tensor_by_name(
|
532 |
+
request, 'encoder_input_features', batch_size, batch_index)
|
533 |
+
if encoder_input_features is not None:
|
534 |
+
if isinstance(encoder_input_features, np.ndarray):
|
535 |
+
encoder_input_features = from_numpy(
|
536 |
+
encoder_input_features).squeeze(dim=0)
|
537 |
+
elif isinstance(encoder_input_features, torch.Tensor):
|
538 |
+
encoder_input_features = encoder_input_features.squeeze(dim=0)
|
539 |
+
inputs['encoder_input_features'] = encoder_input_features
|
540 |
+
logger.debug(
|
541 |
+
f"inputs to llm: encoder_input_features ({encoder_input_features.shape}"
|
542 |
+
)
|
543 |
+
|
544 |
+
encoder_output_length = get_input_tensor_by_name(
|
545 |
+
request, 'encoder_output_lengths', batch_size, batch_index)
|
546 |
+
if encoder_output_length is not None:
|
547 |
+
inputs['encoder_output_length'] = np.squeeze(
|
548 |
+
encoder_output_length, axis=0)
|
549 |
+
|
550 |
+
cross_attention_mask = get_input_tensor_by_name(
|
551 |
+
request, 'cross_attention_mask', batch_size, batch_index)
|
552 |
+
if cross_attention_mask is not None:
|
553 |
+
inputs['cross_attention_mask'] = cross_attention_mask[0]
|
554 |
+
logger.debug(
|
555 |
+
f"inputs to llm: cross_attention_mask ({ cross_attention_mask.shape})"
|
556 |
+
)
|
557 |
+
|
558 |
+
skip_cross_attn_blocks = get_input_tensor_by_name(
|
559 |
+
request,
|
560 |
+
'skip_cross_attn_blocks',
|
561 |
+
batch_size,
|
562 |
+
batch_index,
|
563 |
+
force_on_torch=True)
|
564 |
+
if skip_cross_attn_blocks is not None:
|
565 |
+
inputs['skip_cross_attn_blocks'] = skip_cross_attn_blocks[0]
|
566 |
+
logger.debug(
|
567 |
+
f"inputs to llm: skip_cross_attn_blocks ({ skip_cross_attn_blocks.shape})"
|
568 |
+
)
|
569 |
+
|
570 |
+
guided_decoding_params = get_guided_decoding_params_from_request(
|
571 |
+
request, batch_size, batch_index)
|
572 |
+
|
573 |
+
requests.append(
|
574 |
+
trtllm.Request(
|
575 |
+
**inputs,
|
576 |
+
sampling_config=sampling_config,
|
577 |
+
output_config=output_config,
|
578 |
+
external_draft_tokens_config=external_draft_tokens_config,
|
579 |
+
prompt_tuning_config=prompt_tuning_config,
|
580 |
+
lora_config=lora_config,
|
581 |
+
guided_decoding_params=guided_decoding_params,
|
582 |
+
kv_cache_retention_config=kv_cache_retention_config))
|
583 |
+
return requests
|
584 |
+
|
585 |
+
|
586 |
+
def convert_response(response,
|
587 |
+
batch_index,
|
588 |
+
batch_size,
|
589 |
+
num_return_sequences,
|
590 |
+
expected_logits_dtype=torch.float32):
|
591 |
+
|
592 |
+
if response.has_error():
|
593 |
+
return pb_utils.InferenceResponse(output_tensors=[],
|
594 |
+
error=pb_utils.TritonError(
|
595 |
+
response.error_msg)), True, 0
|
596 |
+
result = response.result
|
597 |
+
beam_lengths = np.expand_dims(
|
598 |
+
np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
|
599 |
+
max_beam_length = max([len(beam) for beam in result.output_token_ids])
|
600 |
+
output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
|
601 |
+
-1, np.int32)
|
602 |
+
for idx, beam in enumerate(result.output_token_ids):
|
603 |
+
output_ids[0, idx, :len(beam)] = beam
|
604 |
+
|
605 |
+
output_lengths = output_ids.size
|
606 |
+
output_tensors = [
|
607 |
+
pb_utils.Tensor("output_ids", output_ids),
|
608 |
+
pb_utils.Tensor("sequence_length", beam_lengths),
|
609 |
+
]
|
610 |
+
|
611 |
+
if result.cum_log_probs is not None:
|
612 |
+
output_tensors.append(
|
613 |
+
pb_utils.Tensor(
|
614 |
+
"cum_log_probs",
|
615 |
+
np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)))
|
616 |
+
|
617 |
+
if result.log_probs is not None:
|
618 |
+
output_tensors.append(
|
619 |
+
pb_utils.Tensor(
|
620 |
+
"output_log_probs",
|
621 |
+
np.expand_dims(np.array(result.log_probs, np.float32), 0)))
|
622 |
+
|
623 |
+
if result.context_logits is not None:
|
624 |
+
assert (result.context_logits.dtype is expected_logits_dtype)
|
625 |
+
output_tensors.append(
|
626 |
+
pb_utils.Tensor(
|
627 |
+
"context_logits",
|
628 |
+
np.expand_dims(
|
629 |
+
np.array(
|
630 |
+
result.context_logits, torch_to_numpy_dtype_dict[
|
631 |
+
result.context_logits.dtype]), 0)))
|
632 |
+
|
633 |
+
if result.generation_logits is not None:
|
634 |
+
assert (result.generation_logits.dtype is expected_logits_dtype)
|
635 |
+
output_tensors.append(
|
636 |
+
pb_utils.Tensor(
|
637 |
+
"generation_logits",
|
638 |
+
np.expand_dims(
|
639 |
+
np.array(
|
640 |
+
result.generation_logits, torch_to_numpy_dtype_dict[
|
641 |
+
result.generation_logits.dtype]), 0)))
|
642 |
+
|
643 |
+
if batch_size > 1:
|
644 |
+
output_tensors.append(
|
645 |
+
pb_utils.Tensor(
|
646 |
+
"batch_index",
|
647 |
+
np.expand_dims(np.array([batch_index], np.int32), 0)))
|
648 |
+
|
649 |
+
if num_return_sequences > 1:
|
650 |
+
output_tensors.append(
|
651 |
+
pb_utils.Tensor(
|
652 |
+
"sequence_index",
|
653 |
+
np.expand_dims(np.array([result.sequence_index], np.int32),
|
654 |
+
0)))
|
655 |
+
|
656 |
+
if result.request_perf_metrics is not None:
|
657 |
+
kv_cache_metrics = result.request_perf_metrics.kv_cache_metrics
|
658 |
+
output_tensors.append(
|
659 |
+
pb_utils.Tensor(
|
660 |
+
"kv_cache_alloc_new_blocks",
|
661 |
+
np.expand_dims(
|
662 |
+
np.array([kv_cache_metrics.num_new_allocated_blocks],
|
663 |
+
np.int32), 0)))
|
664 |
+
output_tensors.append(
|
665 |
+
pb_utils.Tensor(
|
666 |
+
"kv_cache_reused_blocks",
|
667 |
+
np.expand_dims(
|
668 |
+
np.array([kv_cache_metrics.num_reused_blocks], np.int32),
|
669 |
+
0)))
|
670 |
+
output_tensors.append(
|
671 |
+
pb_utils.Tensor(
|
672 |
+
"kv_cache_alloc_total_blocks",
|
673 |
+
np.expand_dims(
|
674 |
+
np.array([kv_cache_metrics.num_total_allocated_blocks],
|
675 |
+
np.int32), 0)))
|
676 |
+
|
677 |
+
return pb_utils.InferenceResponse(
|
678 |
+
output_tensors), result.is_final, output_lengths
|
679 |
+
|
680 |
+
|
681 |
+
def convert_scheduler_policy(batch_scheduler_policy: str):
|
682 |
+
if batch_scheduler_policy.lower() == "max_utilization":
|
683 |
+
return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
|
684 |
+
elif batch_scheduler_policy.lower() == "guaranteed_no_evict":
|
685 |
+
return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
|
686 |
+
raise pb_utils.TritonModelException(
|
687 |
+
f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
|
688 |
+
)
|
689 |
+
|
690 |
+
|
691 |
+
def convert_batching_type(gpt_model_type: str):
|
692 |
+
if gpt_model_type is None:
|
693 |
+
return None
|
694 |
+
if gpt_model_type.lower(
|
695 |
+
) == "inflight_fused_batching" or gpt_model_type.lower(
|
696 |
+
) == "inflight_batching":
|
697 |
+
return trtllm.BatchingType.INFLIGHT
|
698 |
+
elif gpt_model_type.lower() == "v1":
|
699 |
+
return trtllm.BatchingType.STATIC
|
700 |
+
raise pb_utils.TritonModelException(
|
701 |
+
f"gpt_model_type value of '{gpt_model_type}' is not supported.")
|
702 |
+
|
703 |
+
|
704 |
+
def convert_decoding_mode(decoding_mode: str):
|
705 |
+
if decoding_mode is None:
|
706 |
+
return None
|
707 |
+
elif decoding_mode == "auto":
|
708 |
+
return trtllm.DecodingMode.Auto()
|
709 |
+
elif decoding_mode == "top_k":
|
710 |
+
return trtllm.DecodingMode.TopK()
|
711 |
+
elif decoding_mode == "top_p":
|
712 |
+
return trtllm.DecodingMode.TopP()
|
713 |
+
elif decoding_mode == "top_k_top_p":
|
714 |
+
return trtllm.DecodingMode.TopKTopP()
|
715 |
+
elif decoding_mode == "beam_search":
|
716 |
+
return trtllm.DecodingMode.BeamSearch()
|
717 |
+
elif decoding_mode == "medusa":
|
718 |
+
return trtllm.DecodingMode.Medusa()
|
719 |
+
elif decoding_mode == "redrafter":
|
720 |
+
return trtllm.DecodingMode.ExplicitDraftTokens()
|
721 |
+
elif decoding_mode == "lookahead":
|
722 |
+
return trtllm.DecodingMode.Lookahead()
|
723 |
+
elif decoding_mode == "eagle":
|
724 |
+
return trtllm.DecodingMode.Eagle()
|
725 |
+
raise pb_utils.TritonModelException(
|
726 |
+
f"decoding_mode value of '{decoding_mode}' is not supported.")
|
727 |
+
|
728 |
+
|
729 |
+
def convert_timestamp_to_seconds(timestamp: str):
    return int(
        datetime.datetime.strptime(timestamp,
                                   "%m-%d-%Y %H:%M:%S.%f").timestamp())


def triton_string_to_torch(dtype):
    type_map = {
        "TYPE_BOOL": torch.bool,
        "TYPE_UINT8": torch.uint8,
        "TYPE_INT8": torch.int8,
        "TYPE_INT16": torch.int16,
        "TYPE_INT32": torch.int32,
        "TYPE_INT64": torch.int64,
        "TYPE_FP16": torch.float16,
        "TYPE_FP32": torch.float32,
        "TYPE_FP64": torch.float64,
        "TYPE_BF16": torch.bfloat16
    }
    return type_map[dtype]


class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

def get_scheduler_config(self, model_config):
|
757 |
+
batch_scheduler_policy = get_parameter(model_config,
|
758 |
+
"batch_scheduler_policy")
|
759 |
+
if batch_scheduler_policy is None:
|
760 |
+
return trtllm.SchedulerConfig()
|
761 |
+
return trtllm.SchedulerConfig(
|
762 |
+
convert_scheduler_policy(batch_scheduler_policy))
|
763 |
+
|
764 |
+
def get_kv_cache_config(self, model_config):
|
765 |
+
kwargs = {
|
766 |
+
"enable_block_reuse":
|
767 |
+
get_parameter(model_config, "enable_kv_cache_reuse", bool),
|
768 |
+
"max_tokens":
|
769 |
+
get_parameter(model_config, "max_tokens_in_paged_kv_cache", int),
|
770 |
+
"sink_token_length":
|
771 |
+
get_parameter(model_config, "sink_token_length", int),
|
772 |
+
"free_gpu_memory_fraction":
|
773 |
+
get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
|
774 |
+
float),
|
775 |
+
"cross_kv_cache_fraction":
|
776 |
+
get_parameter(model_config, "cross_kv_cache_fraction", float),
|
777 |
+
"host_cache_size":
|
778 |
+
get_parameter(model_config, "kv_cache_host_memory_bytes", int),
|
779 |
+
"onboard_blocks":
|
780 |
+
get_parameter(model_config, "kv_cache_onboard_blocks", bool),
|
781 |
+
}
|
782 |
+
max_attention_window_size = get_parameter(model_config,
|
783 |
+
"max_attention_window_size")
|
784 |
+
if max_attention_window_size:
|
785 |
+
kwargs["max_attention_window"] = [
|
786 |
+
int(x) for x in max_attention_window_size.split(",")
|
787 |
+
]
|
788 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
789 |
+
return trtllm.KvCacheConfig(**kwargs)
|
790 |
+
|
791 |
+
def get_parallel_config(self, model_config):
|
792 |
+
kwargs = {}
|
793 |
+
gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
|
794 |
+
if gpu_device_ids:
|
795 |
+
kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
|
796 |
+
self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
|
797 |
+
"0") == "1"
|
798 |
+
if self.use_orchestrator_mode:
|
799 |
+
kwargs[
|
800 |
+
"communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
|
801 |
+
worker_path = get_parameter(model_config, "worker_path")
|
802 |
+
spawn_processes = os.environ.get(
|
803 |
+
"TRTLLM_ORCHESTRATOR_SPAWN_PROCESSES", "1") == "1"
|
804 |
+
if not spawn_processes:
|
805 |
+
raise pb_utils.TritonModelException(
|
806 |
+
"Orchestrator mode with --disable-spawn-processes is not supported in the Python backend."
|
807 |
+
)
|
808 |
+
is_orchestrator = (mpi_rank() == 0) if spawn_processes else True
|
809 |
+
if worker_path is not None:
|
810 |
+
raise pb_utils.TritonModelException(
|
811 |
+
"worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
|
812 |
+
)
|
813 |
+
executor_worker_path = get_parameter(model_config,
|
814 |
+
"executor_worker_path")
|
815 |
+
kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
|
816 |
+
is_orchestrator, executor_worker_path)
|
817 |
+
if len(kwargs) > 0:
|
818 |
+
return trtllm.ParallelConfig(**kwargs)
|
819 |
+
return None
|
820 |
+
|
821 |
+
def get_peft_cache_config(self, model_config):
|
822 |
+
kwargs = {
|
823 |
+
"optimal_adapter_size":
|
824 |
+
get_parameter(model_config, "lora_cache_optimal_adapter_size",
|
825 |
+
int),
|
826 |
+
"max_adapter_size":
|
827 |
+
get_parameter(model_config, "lora_cache_max_adapter_size", int),
|
828 |
+
"device_cache_percent":
|
829 |
+
get_parameter(model_config, "lora_cache_gpu_memory_fraction",
|
830 |
+
float),
|
831 |
+
"host_cache_size":
|
832 |
+
get_parameter(model_config, "lora_cache_host_memory_bytes", int),
|
833 |
+
}
|
834 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
835 |
+
return trtllm.PeftCacheConfig(**kwargs)
|
836 |
+
|
837 |
+
def get_decoding_config(self, model_config):
|
838 |
+
eagle_choices = parse_eagle_choices(
|
839 |
+
get_parameter(model_config, "eagle_choices"))
|
840 |
+
kwargs = {
|
841 |
+
"medusa_choices":
|
842 |
+
parse_medusa_choices(get_parameter(model_config,
|
843 |
+
"medusa_choices")),
|
844 |
+
"eagle_config":
|
845 |
+
None
|
846 |
+
if eagle_choices is None else trtllm.EagleConfig(eagle_choices),
|
847 |
+
"decoding_mode":
|
848 |
+
convert_decoding_mode(get_parameter(model_config,
|
849 |
+
"decoding_mode")),
|
850 |
+
}
|
851 |
+
print(kwargs)
|
852 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
853 |
+
return trtllm.DecodingConfig(**kwargs)
|
854 |
+
|
855 |
+
def get_extended_runtime_perf_knob_config(self, model_config):
|
856 |
+
kwargs = {
|
857 |
+
"multi_block_mode":
|
858 |
+
get_parameter(model_config, "multi_block_mode", bool),
|
859 |
+
"enable_context_fmha_fp32_acc":
|
860 |
+
get_parameter(model_config, "enable_context_fmha_fp32_acc", bool),
|
861 |
+
"cuda_graph_mode":
|
862 |
+
get_parameter(model_config, "cuda_graph_mode", bool),
|
863 |
+
"cuda_graph_cache_size":
|
864 |
+
get_parameter(model_config, "cuda_graph_cache_size", int),
|
865 |
+
}
|
866 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
867 |
+
return trtllm.ExtendedRuntimePerfKnobConfig(**kwargs)
|
868 |
+
|
869 |
+
def get_guided_decoding_config(self, model_config):
|
870 |
+
|
871 |
+
guided_decoding_backend = get_parameter(model_config,
|
872 |
+
"guided_decoding_backend", str)
|
873 |
+
|
874 |
+
tokenizer_dir = get_parameter(model_config, "tokenizer_dir", str)
|
875 |
+
if guided_decoding_backend not in ['xgrammar']:
|
876 |
+
if tokenizer_dir:
|
877 |
+
pb_utils.Logger.log_warn(
|
878 |
+
f"Guided decoding backend has not been set but tokenizer_dir is given. Tokenizer_dir will be ignored."
|
879 |
+
)
|
880 |
+
return None
|
881 |
+
|
882 |
+
if guided_decoding_backend == 'xgrammar':
|
883 |
+
guided_decoding_backend = trtllm.GuidedDecodingConfig.GuidedDecodingBackend.XGRAMMAR
|
884 |
+
|
885 |
+
if not tokenizer_dir:
|
886 |
+
raise ValueError(
|
887 |
+
"Guided decoding requires tokenizer's information. Please provide 'tokenizer_dir'."
|
888 |
+
)
|
889 |
+
from transformers import AutoTokenizer
|
890 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
|
891 |
+
pb_utils.Logger.log_info(
|
892 |
+
f"Guided decoding has been set with {guided_decoding_backend} backend"
|
893 |
+
)
|
894 |
+
return trtllm.GuidedDecodingConfig(
|
895 |
+
backend=guided_decoding_backend,
|
896 |
+
**_xgrammar_tokenizer_info(tokenizer))
|
897 |
+
|
898 |
+
def get_executor_config(self, model_config):
|
899 |
+
kwargs = {
|
900 |
+
"max_beam_width":
|
901 |
+
get_parameter(model_config, "max_beam_width", int),
|
902 |
+
"scheduler_config":
|
903 |
+
self.get_scheduler_config(model_config),
|
904 |
+
"kv_cache_config":
|
905 |
+
self.get_kv_cache_config(model_config),
|
906 |
+
"enable_chunked_context":
|
907 |
+
get_parameter(model_config, "enable_chunked_context", bool),
|
908 |
+
"normalize_log_probs":
|
909 |
+
get_parameter(model_config, "normalize_log_probs", bool),
|
910 |
+
"batching_type":
|
911 |
+
convert_batching_type(get_parameter(model_config,
|
912 |
+
"gpt_model_type")),
|
913 |
+
"parallel_config":
|
914 |
+
self.get_parallel_config(model_config),
|
915 |
+
"peft_cache_config":
|
916 |
+
self.get_peft_cache_config(model_config),
|
917 |
+
"decoding_config":
|
918 |
+
self.get_decoding_config(model_config),
|
919 |
+
"max_queue_size":
|
920 |
+
model_config.get(
|
921 |
+
"dynamic_batching",
|
922 |
+
{},
|
923 |
+
).get(
|
924 |
+
"default_queue_policy",
|
925 |
+
{},
|
926 |
+
).get("max_queue_size"),
|
927 |
+
"extended_runtime_perf_knob_config":
|
928 |
+
self.get_extended_runtime_perf_knob_config(model_config),
|
929 |
+
"guided_decoding_config":
|
930 |
+
self.get_guided_decoding_config(model_config)
|
931 |
+
}
|
932 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
933 |
+
return trtllm.ExecutorConfig(**kwargs)
|
934 |
+
|
935 |
+
def create_metrics(self, model: str, version: str, is_v1_model: bool):
|
936 |
+
self.request_metric_family = pb_utils.MetricFamily(
|
937 |
+
name="nv_trt_llm_request_metrics",
|
938 |
+
description="TRT LLM request metrics",
|
939 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
940 |
+
)
|
941 |
+
self.runtime_memory_metric_family = pb_utils.MetricFamily(
|
942 |
+
name="nv_trt_llm_runtime_memory_metrics",
|
943 |
+
description="TRT LLM runtime memory metrics",
|
944 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
945 |
+
)
|
946 |
+
self.kv_cache_metric_family = pb_utils.MetricFamily(
|
947 |
+
name="nv_trt_llm_kv_cache_block_metrics",
|
948 |
+
description="TRT LLM KV cache block metrics",
|
949 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
950 |
+
)
|
951 |
+
model_type = "v1" if is_v1_model else "inflight_batcher"
|
952 |
+
self.model_type_metric_family = pb_utils.MetricFamily(
|
953 |
+
name=f"nv_trt_llm_{model_type}_metrics",
|
954 |
+
description=f"TRT LLM {model_type}-specific metrics",
|
955 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
956 |
+
)
|
957 |
+
self.general_metric_family = pb_utils.MetricFamily(
|
958 |
+
name="nv_trt_llm_general_metrics",
|
959 |
+
description="General TRT LLM metrics",
|
960 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
961 |
+
)
|
962 |
+
# Set the metric using self.general_metric_output_family.observe(string_size)
|
963 |
+
self.request_tokens_metric_family = pb_utils.MetricFamily(
|
964 |
+
name="nv_llm_input_token_len",
|
965 |
+
description="TRT LLM response metrics",
|
966 |
+
kind=pb_utils.MetricFamily.HISTOGRAM,
|
967 |
+
)
|
968 |
+
self.response_tokens_metric_family = pb_utils.MetricFamily(
|
969 |
+
name="nv_llm_output_token_len",
|
970 |
+
description="TRT LLM response metrics",
|
971 |
+
kind=pb_utils.MetricFamily.HISTOGRAM,
|
972 |
+
)
|
973 |
+
common_labels = {"model": model, "version": version}
|
974 |
+
self.all_metrics = {
|
975 |
+
# Request metrics
|
976 |
+
"num_active_requests":
|
977 |
+
self.request_metric_family.Metric(labels={
|
978 |
+
"request_type": "active",
|
979 |
+
**common_labels
|
980 |
+
}),
|
981 |
+
"max_num_active_requests":
|
982 |
+
self.request_metric_family.Metric(labels={
|
983 |
+
"request_type": "max",
|
984 |
+
**common_labels
|
985 |
+
}),
|
986 |
+
"num_scheduled_requests":
|
987 |
+
self.request_metric_family.Metric(labels={
|
988 |
+
"request_type": "scheduled",
|
989 |
+
**common_labels
|
990 |
+
}),
|
991 |
+
"num_context_requests":
|
992 |
+
self.request_metric_family.Metric(labels={
|
993 |
+
"request_type": "context",
|
994 |
+
**common_labels
|
995 |
+
}),
|
996 |
+
# Runtime metrics
|
997 |
+
"cpu_mem_usage":
|
998 |
+
self.runtime_memory_metric_family.Metric(labels={
|
999 |
+
"memory_type": "cpu",
|
1000 |
+
**common_labels
|
1001 |
+
}),
|
1002 |
+
"gpu_mem_usage":
|
1003 |
+
self.runtime_memory_metric_family.Metric(labels={
|
1004 |
+
"memory_type": "gpu",
|
1005 |
+
**common_labels
|
1006 |
+
}),
|
1007 |
+
"pinned_mem_usage":
|
1008 |
+
self.runtime_memory_metric_family.Metric(labels={
|
1009 |
+
"memory_type": "pinned",
|
1010 |
+
**common_labels
|
1011 |
+
}),
|
1012 |
+
# KV cache metrics
|
1013 |
+
"max_num_blocks":
|
1014 |
+
self.kv_cache_metric_family.Metric(labels={
|
1015 |
+
"kv_cache_block_type": "max",
|
1016 |
+
**common_labels
|
1017 |
+
}),
|
1018 |
+
"free_num_blocks":
|
1019 |
+
self.kv_cache_metric_family.Metric(labels={
|
1020 |
+
"kv_cache_block_type": "free",
|
1021 |
+
**common_labels
|
1022 |
+
}),
|
1023 |
+
"used_num_blocks":
|
1024 |
+
self.kv_cache_metric_family.Metric(labels={
|
1025 |
+
"kv_cache_block_type": "used",
|
1026 |
+
**common_labels
|
1027 |
+
}),
|
1028 |
+
"tokens_per_block":
|
1029 |
+
self.kv_cache_metric_family.Metric(labels={
|
1030 |
+
"kv_cache_block_type": "tokens_per",
|
1031 |
+
**common_labels
|
1032 |
+
}),
|
1033 |
+
# General metrics
|
1034 |
+
"timestamp":
|
1035 |
+
self.general_metric_family.Metric(labels={
|
1036 |
+
"general_type": "timestamp",
|
1037 |
+
**common_labels
|
1038 |
+
}),
|
1039 |
+
"iter":
|
1040 |
+
self.general_metric_family.Metric(labels={
|
1041 |
+
"general_type": "iteration_counter",
|
1042 |
+
**common_labels
|
1043 |
+
}),
|
1044 |
+
METRIC_TOTAL_OUTPUT_TOKENS:
|
1045 |
+
self.response_tokens_metric_family.Metric(
|
1046 |
+
labels={
|
1047 |
+
"response_metric_type": METRIC_TOTAL_OUTPUT_TOKENS,
|
1048 |
+
**common_labels
|
1049 |
+
},
|
1050 |
+
buckets=build_1_2_5_buckets(1000)),
|
1051 |
+
METRIC_TOTAL_INPUT_TOKENS:
|
1052 |
+
self.request_tokens_metric_family.Metric(
|
1053 |
+
labels={
|
1054 |
+
"response_metric_type": METRIC_TOTAL_INPUT_TOKENS,
|
1055 |
+
**common_labels
|
1056 |
+
},
|
1057 |
+
buckets=build_1_2_5_buckets(1000)),
|
1058 |
+
}
|
1059 |
+
if is_v1_model:
|
1060 |
+
self.all_metrics.update({
|
1061 |
+
"num_ctx_tokens":
|
1062 |
+
self.model_type_metric_family.Metric(labels={
|
1063 |
+
"v1_specific_metric": "total_context_tokens",
|
1064 |
+
**common_labels
|
1065 |
+
}),
|
1066 |
+
"num_gen_tokens":
|
1067 |
+
self.model_type_metric_family.Metric(
|
1068 |
+
labels={
|
1069 |
+
"v1_specific_metric": "total_generation_tokens",
|
1070 |
+
**common_labels
|
1071 |
+
}),
|
1072 |
+
"empty_gen_slots":
|
1073 |
+
self.model_type_metric_family.Metric(
|
1074 |
+
labels={
|
1075 |
+
"v1_specific_metric": "empty_generation_slots",
|
1076 |
+
**common_labels
|
1077 |
+
}),
|
1078 |
+
})
|
1079 |
+
else:
|
1080 |
+
self.all_metrics.update({
|
1081 |
+
"num_ctx_tokens":
|
1082 |
+
self.model_type_metric_family.Metric(
|
1083 |
+
labels={
|
1084 |
+
"inflight_batcher_specific_metric":
|
1085 |
+
"total_context_tokens",
|
1086 |
+
**common_labels
|
1087 |
+
}),
|
1088 |
+
"num_gen_requests":
|
1089 |
+
self.model_type_metric_family.Metric(
|
1090 |
+
labels={
|
1091 |
+
"inflight_batcher_specific_metric":
|
1092 |
+
"generation_requests",
|
1093 |
+
**common_labels
|
1094 |
+
}),
|
1095 |
+
"micro_batch_id":
|
1096 |
+
self.model_type_metric_family.Metric(
|
1097 |
+
labels={
|
1098 |
+
"inflight_batcher_specific_metric": "micro_batch_id",
|
1099 |
+
**common_labels
|
1100 |
+
}),
|
1101 |
+
"num_paused_requests":
|
1102 |
+
self.model_type_metric_family.Metric(
|
1103 |
+
labels={
|
1104 |
+
"inflight_batcher_specific_metric": "paused_requests",
|
1105 |
+
**common_labels
|
1106 |
+
}),
|
1107 |
+
})
|
1108 |
+
|
1109 |
+
def initialize(self, args):
|
1110 |
+
"""`initialize` is called only once when the model is being loaded.
|
1111 |
+
Implementing `initialize` function is optional. This function allows
|
1112 |
+
the model to initialize any state associated with this model.
|
1113 |
+
|
1114 |
+
Parameters
|
1115 |
+
----------
|
1116 |
+
args : dict
|
1117 |
+
Both keys and values are strings. The dictionary keys and values are:
|
1118 |
+
* model_config: A JSON string containing the model configuration
|
1119 |
+
* model_instance_kind: A string containing model instance kind
|
1120 |
+
* model_instance_device_id: A string containing model instance device ID
|
1121 |
+
* model_repository: Model repository path
|
1122 |
+
* model_version: Model version
|
1123 |
+
* model_name: Model name
|
1124 |
+
"""
|
1125 |
+
model_config = json.loads(args['model_config'])
|
1126 |
+
gpt_model_path = get_parameter(model_config, "gpt_model_path")
|
1127 |
+
if get_parameter(model_config, "enable_trt_overlap", bool):
|
1128 |
+
raise pb_utils.TritonModelException(
|
1129 |
+
f"enable_trt_overlap=true is not supported.")
|
1130 |
+
self.exclude_input_from_output = get_parameter(
|
1131 |
+
model_config, "exclude_input_in_output", bool)
|
1132 |
+
executor_config = self.get_executor_config(model_config)
|
1133 |
+
self.executor = trtllm.Executor(gpt_model_path,
|
1134 |
+
trtllm.ModelType.DECODER_ONLY,
|
1135 |
+
executor_config)
|
1136 |
+
self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
|
1137 |
+
model_config)
|
1138 |
+
self.cancellation_check_period_ms = get_parameter(
|
1139 |
+
model_config, "cancellation_check_period_ms", int) or 100
|
1140 |
+
self.stats_check_period_ms = get_parameter(
|
1141 |
+
model_config, "stats_check_period_ms", int) or 100
|
1142 |
+
|
1143 |
+
self.logits_dtype = None
|
1144 |
+
for output in model_config['output']:
|
1145 |
+
if output['name'] == 'context_logits' or output[
|
1146 |
+
'name'] == 'generation_logits':
|
1147 |
+
self.logits_dtype = triton_string_to_torch(output['data_type'])
|
1148 |
+
|
1149 |
+
self.create_metrics(args["model_name"],
|
1150 |
+
args["model_version"],
|
1151 |
+
is_v1_model=executor_config.batching_type ==
|
1152 |
+
trtllm.BatchingType.STATIC)
|
1153 |
+
self.triton_user_id_to_req_ids = {}
|
1154 |
+
self.triton_req_id_to_req_ids = {}
|
1155 |
+
self.req_id_to_request_data = {}
|
1156 |
+
self.lock = Lock()
|
1157 |
+
self.running = False
|
1158 |
+
self.awaiter_thread = Thread(target=self.awaiter_loop)
|
1159 |
+
self.cancellation_thread = Thread(target=self.cancellation_loop)
|
1160 |
+
self.metrics_thread = Thread(target=self.metrics_loop)
|
1161 |
+
if self.executor.can_enqueue_requests():
|
1162 |
+
self.running = True
|
1163 |
+
self.awaiter_thread.start()
|
1164 |
+
self.cancellation_thread.start()
|
1165 |
+
self.metrics_thread.start()
|
1166 |
+
else:
|
1167 |
+
# In leader mode, worker ranks will wait here until leader is done.
|
1168 |
+
self.executor.shutdown()
|
1169 |
+
|
1170 |
+
def handle_stop_request(self, triton_user_id, response_sender):
|
1171 |
+
if triton_user_id is None or triton_user_id == "":
|
1172 |
+
response_sender.send(
|
1173 |
+
pb_utils.InferenceResponse(error=pb_utils.TritonError(
|
1174 |
+
"A request id must be provided for request cancellation")),
|
1175 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
1176 |
+
return
|
1177 |
+
|
1178 |
+
with self.lock:
|
1179 |
+
if triton_user_id in self.triton_user_id_to_req_ids:
|
1180 |
+
req_ids = self.triton_user_id_to_req_ids[triton_user_id]
|
1181 |
+
for req_id in req_ids:
|
1182 |
+
self.executor.cancel_request(req_id)
|
1183 |
+
|
1184 |
+
response_sender.send(
|
1185 |
+
pb_utils.InferenceResponse(),
|
1186 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
1187 |
+
|
1188 |
+
def execute(self, requests):
|
1189 |
+
"""`execute` must be implemented in every Python model. `execute`
|
1190 |
+
function receives a list of pb_utils.InferenceRequest as the only
|
1191 |
+
argument. This function is called when an inference is requested
|
1192 |
+
for this model.
|
1193 |
+
|
1194 |
+
Parameters
|
1195 |
+
----------
|
1196 |
+
requests : list
|
1197 |
+
A list of pb_utils.InferenceRequest
|
1198 |
+
|
1199 |
+
Returns
|
1200 |
+
-------
|
1201 |
+
list
|
1202 |
+
A list of pb_utils.InferenceResponse. The length of this list must
|
1203 |
+
be the same as `requests`
|
1204 |
+
"""
|
1205 |
+
if not self.executor.can_enqueue_requests():
|
1206 |
+
return
|
1207 |
+
|
1208 |
+
# Convert to executor requests.
|
1209 |
+
|
1210 |
+
triton_requests = []
|
1211 |
+
executor_requests = []
|
1212 |
+
batch_indices = []
|
1213 |
+
triton_user_ids = []
|
1214 |
+
triton_req_ids = []
|
1215 |
+
|
1216 |
+
for request in requests:
|
1217 |
+
|
1218 |
+
triton_user_id = request.request_id()
|
1219 |
+
|
1220 |
+
response_sender = request.get_response_sender()
|
1221 |
+
stop = get_input_scalar_by_name(request, 'stop')
|
1222 |
+
|
1223 |
+
if stop:
|
1224 |
+
self.handle_stop_request(triton_user_id, response_sender)
|
1225 |
+
else:
|
1226 |
+
#Unique request id used to identify each triton request
|
1227 |
+
triton_req_id = str(randint(0, sys.maxsize))
|
1228 |
+
self.triton_req_id_to_req_ids[triton_req_id] = set()
|
1229 |
+
if triton_user_id is not None and triton_user_id != "":
|
1230 |
+
self.triton_user_id_to_req_ids[triton_user_id] = set()
|
1231 |
+
|
1232 |
+
try:
|
1233 |
+
converted_reqs = convert_request(
|
1234 |
+
request, self.exclude_input_from_output,
|
1235 |
+
self.decoupled)
|
1236 |
+
except Exception as e:
|
1237 |
+
response_sender.send(
|
1238 |
+
pb_utils.InferenceResponse(error=pb_utils.TritonError(
|
1239 |
+
f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
|
1240 |
+
)),
|
1241 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
1242 |
+
else:
|
1243 |
+
for batch_index, converted_req in enumerate(
|
1244 |
+
converted_reqs):
|
1245 |
+
triton_requests.append(request)
|
1246 |
+
executor_requests.append(converted_req)
|
1247 |
+
triton_user_ids.append(triton_user_id)
|
1248 |
+
triton_req_ids.append(triton_req_id)
|
1249 |
+
batch_indices.append(batch_index)
|
1250 |
+
|
1251 |
+
with self.lock:
|
1252 |
+
request_ids = self.executor.enqueue_requests(executor_requests)
|
1253 |
+
for req_id, triton_req_id, triton_user_id, executor_request, triton_request, batch_index in zip(
|
1254 |
+
request_ids, triton_req_ids, triton_user_ids,
|
1255 |
+
executor_requests, triton_requests, batch_indices):
|
1256 |
+
|
1257 |
+
self.req_id_to_request_data[req_id] = RequestData(
|
1258 |
+
triton_req_id, triton_user_id, batch_index,
|
1259 |
+
len(batch_indices),
|
1260 |
+
executor_request.sampling_config.num_return_sequences, 0,
|
1261 |
+
0, triton_request.get_response_sender())
|
1262 |
+
self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
|
1263 |
+
input_len = len(
|
1264 |
+
executor_request.input_token_ids
|
1265 |
+
) if executor_request.input_token_ids is not None else 0
|
1266 |
+
self.req_id_to_request_data[
|
1267 |
+
req_id].num_input_tokens += input_len
|
1268 |
+
# This checks both request level and instance config level
|
1269 |
+
if executor_request.output_config.exclude_input_from_output == False and executor_request.streaming == False:
|
1270 |
+
self.req_id_to_request_data[
|
1271 |
+
req_id].num_output_tokens -= self.req_id_to_request_data[
|
1272 |
+
req_id].num_input_tokens * executor_request.sampling_config.beam_width
|
1273 |
+
if triton_user_id is not None and triton_user_id != "":
|
1274 |
+
self.triton_user_id_to_req_ids[triton_user_id].add(req_id)
|
1275 |
+
|
1276 |
+
return None
|
1277 |
+
|
1278 |
+
def awaiter_loop(self):
|
1279 |
+
"""Gets responses from executor and returns the results."""
|
1280 |
+
while self.running:
|
1281 |
+
for response in self.executor.await_responses(
|
1282 |
+
timeout=datetime.timedelta(milliseconds=1)):
|
1283 |
+
req_id = response.request_id
|
1284 |
+
request_data = None
|
1285 |
+
with self.lock:
|
1286 |
+
if req_id not in self.req_id_to_request_data:
|
1287 |
+
continue
|
1288 |
+
request_data = self.req_id_to_request_data[req_id]
|
1289 |
+
|
1290 |
+
triton_response, is_final, output_length = convert_response(
|
1291 |
+
response, request_data.batch_index,
|
1292 |
+
request_data.batch_size, request_data.num_return_sequences,
|
1293 |
+
self.logits_dtype)
|
1294 |
+
with self.lock:
|
1295 |
+
self.req_id_to_request_data[
|
1296 |
+
req_id].num_output_tokens += output_length
|
1297 |
+
triton_request_final = False
|
1298 |
+
if is_final:
|
1299 |
+
with self.lock:
|
1300 |
+
# Check if all executor requests part of that triton request are finished
|
1301 |
+
self.triton_req_id_to_req_ids[
|
1302 |
+
request_data.triton_req_id].remove(req_id)
|
1303 |
+
if len(self.triton_req_id_to_req_ids[
|
1304 |
+
request_data.triton_req_id]) == 0:
|
1305 |
+
pb_utils.Logger.log_info(
|
1306 |
+
f"DELETING Req id {req_id}, triton_req_id {request_data.triton_req_id} "
|
1307 |
+
)
|
1308 |
+
triton_request_final = True
|
1309 |
+
del self.triton_req_id_to_req_ids[
|
1310 |
+
request_data.triton_req_id]
|
1311 |
+
if request_data.triton_user_id is not None and request_data.triton_user_id != "":
|
1312 |
+
del self.triton_user_id_to_req_ids[
|
1313 |
+
request_data.triton_user_id]
|
1314 |
+
self.update_metrics_per_request(req_id)
|
1315 |
+
del self.req_id_to_request_data[req_id]
|
1316 |
+
|
1317 |
+
request_data.response_sender.send(
|
1318 |
+
triton_response,
|
1319 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
|
1320 |
+
if triton_request_final else 0)
|
1321 |
+
|
1322 |
+
def cancellation_loop(self):
|
1323 |
+
"""Checks if any pending requests have been cancelled."""
|
1324 |
+
while self.running:
|
1325 |
+
time.sleep(self.cancellation_check_period_ms / 1000.0)
|
1326 |
+
with self.lock:
|
1327 |
+
for req_id, request_data in self.req_id_to_request_data.items(
|
1328 |
+
):
|
1329 |
+
if request_data.response_sender.is_cancelled():
|
1330 |
+
self.executor.cancel_request(req_id)
|
1331 |
+
|
1332 |
+
def update_metrics_per_request(self, req_id):
|
1333 |
+
"""Updates triton metrics after completing one request"""
|
1334 |
+
output_tokens = self.req_id_to_request_data[req_id].num_output_tokens
|
1335 |
+
input_tokens = self.req_id_to_request_data[req_id].num_input_tokens
|
1336 |
+
|
1337 |
+
self.all_metrics[METRIC_TOTAL_OUTPUT_TOKENS].observe(output_tokens)
|
1338 |
+
self.all_metrics[METRIC_TOTAL_INPUT_TOKENS].observe(input_tokens)
|
1339 |
+
|
1340 |
+
def metrics_loop(self):
|
1341 |
+
"""Updates triton metrics using stats from the executor."""
|
1342 |
+
while self.running:
|
1343 |
+
time.sleep(self.stats_check_period_ms / 1000.0)
|
1344 |
+
for stat in self.executor.get_latest_iteration_stats():
|
1345 |
+
try:
|
1346 |
+
for key, metric in self.all_metrics.items():
|
1347 |
+
# Skip processing for both histogram metrics
|
1348 |
+
if isinstance(key, str) and key in [
|
1349 |
+
METRIC_TOTAL_OUTPUT_TOKENS,
|
1350 |
+
METRIC_TOTAL_INPUT_TOKENS
|
1351 |
+
]:
|
1352 |
+
continue
|
1353 |
+
value = None
|
1354 |
+
if hasattr(stat, key):
|
1355 |
+
value = getattr(stat, key)
|
1356 |
+
elif stat.kv_cache_stats is not None and hasattr(
|
1357 |
+
stat.kv_cache_stats, key):
|
1358 |
+
value = getattr(stat.kv_cache_stats, key)
|
1359 |
+
elif stat.static_batching_stats is not None and hasattr(
|
1360 |
+
stat.static_batching_stats, key):
|
1361 |
+
value = getattr(stat.static_batching_stats, key)
|
1362 |
+
elif stat.inflight_batching_stats is not None and hasattr(
|
1363 |
+
stat.inflight_batching_stats, key):
|
1364 |
+
value = getattr(stat.inflight_batching_stats, key)
|
1365 |
+
if value is not None:
|
1366 |
+
if key == "timestamp":
|
1367 |
+
value = convert_timestamp_to_seconds(value)
|
1368 |
+
metric.set(value)
|
1369 |
+
else:
|
1370 |
+
pb_utils.Logger.log_warn(
|
1371 |
+
f"Metric \"{key}\" not found.")
|
1372 |
+
except Exception as e:
|
1373 |
+
pb_utils.Logger.log_warn(
|
1374 |
+
f"Error while processing metrics: {e}")
|
1375 |
+
|
1376 |
+
    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        if self.executor.can_enqueue_requests():
            self.running = False
            self.awaiter_thread.join()
            self.cancellation_thread.join()
            self.metrics_thread.join()
            self.executor.shutdown()
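A minimal client-side sketch (not part of the upload, shown only for orientation): `convert_request` above requires `input_ids` and `request_output_len`, so a basic request against this model could look roughly like the following, assuming a Triton server for this repository is listening on localhost:8000 and that `tritonclient` and `numpy` are installed. Because this deployment sets a decoupled transaction policy (see tensorrt_llm/config.pbtxt below), a plain HTTP call may be rejected and real clients typically use the gRPC streaming API; treat this purely as an illustration.

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# Hypothetical, already-tokenized prompt; a real deployment would obtain these
# ids from the preprocessing/tokenizer model in this repository.
token_ids = np.array([[1, 15043, 3186]], dtype=np.int32)

inputs = [
    httpclient.InferInput("input_ids", list(token_ids.shape), "INT32"),
    httpclient.InferInput("input_lengths", [1, 1], "INT32"),
    httpclient.InferInput("request_output_len", [1, 1], "INT32"),
]
inputs[0].set_data_from_numpy(token_ids)
inputs[1].set_data_from_numpy(np.array([[token_ids.shape[1]]], dtype=np.int32))
inputs[2].set_data_from_numpy(np.array([[64]], dtype=np.int32))

# Returns the generated token ids produced by the executor.
result = client.infer("tensorrt_llm", inputs)
print(result.as_numpy("output_ids"))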
tensorrt_llm/1/rank0.engine
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7595e62baa9d736243148716820f6258fbc253d709a52778771f7593dfde37a6
size 36509691604
tensorrt_llm/1/rank1.engine
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ce45fa6b73f60436052b12754ccf229b02c319b94fecafe52d513b05900cf244
size 36509692228
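The two .engine entries above are Git LFS pointer files; the actual multi-gigabyte engine blobs (one per tensor-parallel rank) are stored through LFS, as declared in .gitattributes. A minimal sketch, assuming a hypothetical repository id, of fetching one of them with huggingface_hub:

from huggingface_hub import hf_hub_download

engine_path = hf_hub_download(
    repo_id="<org>/<repo>",  # hypothetical placeholder; use this repository's actual id
    filename="tensorrt_llm/1/rank0.engine",
)
print(engine_path)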
tensorrt_llm/config.pbtxt
ADDED
@@ -0,0 +1,757 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "tensorrt_llm"
backend: "tensorrtllm"
max_batch_size: 32

model_transaction_policy {
  decoupled: True
}

dynamic_batching {
    preferred_batch_size: [ 32 ]
    max_queue_delay_microseconds: 0
    default_queue_policy: { max_queue_size: 32 }
}
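A minimal sketch (illustrative only, assuming a running server and that the returned JSON field names mirror the pbtxt) of confirming these transaction-policy and batching settings from a client via tritonclient:

import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")
# Fetch the loaded model configuration as a dict and inspect the two blocks above.
config = client.get_model_config("tensorrt_llm")
print(config.get("model_transaction_policy"), config.get("dynamic_batching"))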
40 |
+
input [
|
41 |
+
{
|
42 |
+
name: "input_ids"
|
43 |
+
data_type: TYPE_INT32
|
44 |
+
dims: [ -1 ]
|
45 |
+
allow_ragged_batch: true
|
46 |
+
optional: true
|
47 |
+
},
|
48 |
+
{
|
49 |
+
name: "encoder_input_features"
|
50 |
+
data_type: TYPE_FP16
|
51 |
+
dims: [ -1, -1 ]
|
52 |
+
allow_ragged_batch: true
|
53 |
+
optional: true
|
54 |
+
},
|
55 |
+
{
|
56 |
+
name: "encoder_output_lengths"
|
57 |
+
data_type: TYPE_INT32
|
58 |
+
dims: [ 1 ]
|
59 |
+
reshape: { shape: [ ] }
|
60 |
+
optional: true
|
61 |
+
},
|
62 |
+
{
|
63 |
+
name: "input_lengths"
|
64 |
+
data_type: TYPE_INT32
|
65 |
+
dims: [ 1 ]
|
66 |
+
reshape: { shape: [ ] }
|
67 |
+
},
|
68 |
+
{
|
69 |
+
name: "request_output_len"
|
70 |
+
data_type: TYPE_INT32
|
71 |
+
dims: [ 1 ]
|
72 |
+
reshape: { shape: [ ] }
|
73 |
+
},
|
74 |
+
{
|
75 |
+
name: "num_return_sequences"
|
76 |
+
data_type: TYPE_INT32
|
77 |
+
dims: [ 1 ]
|
78 |
+
reshape: { shape: [ ] }
|
79 |
+
optional: true
|
80 |
+
},
|
81 |
+
{
|
82 |
+
name: "draft_input_ids"
|
83 |
+
data_type: TYPE_INT32
|
84 |
+
dims: [ -1 ]
|
85 |
+
optional: true
|
86 |
+
allow_ragged_batch: true
|
87 |
+
},
|
88 |
+
{
|
89 |
+
name: "decoder_input_ids"
|
90 |
+
data_type: TYPE_INT32
|
91 |
+
dims: [ -1 ]
|
92 |
+
optional: true
|
93 |
+
allow_ragged_batch: true
|
94 |
+
},
|
95 |
+
{
|
96 |
+
name: "decoder_input_lengths"
|
97 |
+
data_type: TYPE_INT32
|
98 |
+
dims: [ 1 ]
|
99 |
+
optional: true
|
100 |
+
reshape: { shape: [ ] }
|
101 |
+
},
|
102 |
+
{
|
103 |
+
name: "draft_logits"
|
104 |
+
data_type: TYPE_FP16
|
105 |
+
dims: [ -1, -1 ]
|
106 |
+
optional: true
|
107 |
+
allow_ragged_batch: true
|
108 |
+
},
|
109 |
+
{
|
110 |
+
name: "draft_acceptance_threshold"
|
111 |
+
data_type: TYPE_FP32
|
112 |
+
dims: [ 1 ]
|
113 |
+
reshape: { shape: [ ] }
|
114 |
+
optional: true
|
115 |
+
},
|
116 |
+
{
|
117 |
+
name: "end_id"
|
118 |
+
data_type: TYPE_INT32
|
119 |
+
dims: [ 1 ]
|
120 |
+
reshape: { shape: [ ] }
|
121 |
+
optional: true
|
122 |
+
},
|
123 |
+
{
|
124 |
+
name: "pad_id"
|
125 |
+
data_type: TYPE_INT32
|
126 |
+
dims: [ 1 ]
|
127 |
+
reshape: { shape: [ ] }
|
128 |
+
optional: true
|
129 |
+
},
|
130 |
+
{
|
131 |
+
name: "stop_words_list"
|
132 |
+
data_type: TYPE_INT32
|
133 |
+
dims: [ 2, -1 ]
|
134 |
+
optional: true
|
135 |
+
allow_ragged_batch: true
|
136 |
+
},
|
137 |
+
{
|
138 |
+
name: "bad_words_list"
|
139 |
+
data_type: TYPE_INT32
|
140 |
+
dims: [ 2, -1 ]
|
141 |
+
optional: true
|
142 |
+
allow_ragged_batch: true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
name: "embedding_bias"
|
146 |
+
data_type: TYPE_FP32
|
147 |
+
dims: [ -1 ]
|
148 |
+
optional: true
|
149 |
+
allow_ragged_batch: true
|
150 |
+
},
|
151 |
+
{
|
152 |
+
name: "beam_width"
|
153 |
+
data_type: TYPE_INT32
|
154 |
+
dims: [ 1 ]
|
155 |
+
reshape: { shape: [ ] }
|
156 |
+
optional: true
|
157 |
+
},
|
158 |
+
{
|
159 |
+
name: "temperature"
|
160 |
+
data_type: TYPE_FP32
|
161 |
+
dims: [ 1 ]
|
162 |
+
reshape: { shape: [ ] }
|
163 |
+
optional: true
|
164 |
+
},
|
165 |
+
{
|
166 |
+
name: "runtime_top_k"
|
167 |
+
data_type: TYPE_INT32
|
168 |
+
dims: [ 1 ]
|
169 |
+
reshape: { shape: [ ] }
|
170 |
+
optional: true
|
171 |
+
},
|
172 |
+
{
|
173 |
+
name: "runtime_top_p"
|
174 |
+
data_type: TYPE_FP32
|
175 |
+
dims: [ 1 ]
|
176 |
+
reshape: { shape: [ ] }
|
177 |
+
optional: true
|
178 |
+
},
|
179 |
+
{
|
180 |
+
name: "runtime_top_p_min"
|
181 |
+
data_type: TYPE_FP32
|
182 |
+
dims: [ 1 ]
|
183 |
+
reshape: { shape: [ ] }
|
184 |
+
optional: true
|
185 |
+
},
|
186 |
+
{
|
187 |
+
name: "runtime_top_p_decay"
|
188 |
+
data_type: TYPE_FP32
|
189 |
+
dims: [ 1 ]
|
190 |
+
reshape: { shape: [ ] }
|
191 |
+
optional: true
|
192 |
+
},
|
193 |
+
{
|
194 |
+
name: "runtime_top_p_reset_ids"
|
195 |
+
data_type: TYPE_INT32
|
196 |
+
dims: [ 1 ]
|
197 |
+
reshape: { shape: [ ] }
|
198 |
+
optional: true
|
199 |
+
},
|
200 |
+
{
|
201 |
+
name: "len_penalty"
|
202 |
+
data_type: TYPE_FP32
|
203 |
+
dims: [ 1 ]
|
204 |
+
reshape: { shape: [ ] }
|
205 |
+
optional: true
|
206 |
+
},
|
207 |
+
{
|
208 |
+
name: "early_stopping"
|
209 |
+
data_type: TYPE_BOOL
|
210 |
+
dims: [ 1 ]
|
211 |
+
reshape: { shape: [ ] }
|
212 |
+
optional: true
|
213 |
+
},
|
214 |
+
{
|
215 |
+
name: "repetition_penalty"
|
216 |
+
data_type: TYPE_FP32
|
217 |
+
dims: [ 1 ]
|
218 |
+
reshape: { shape: [ ] }
|
219 |
+
optional: true
|
220 |
+
},
|
221 |
+
{
|
222 |
+
name: "min_length"
|
223 |
+
data_type: TYPE_INT32
|
224 |
+
dims: [ 1 ]
|
225 |
+
reshape: { shape: [ ] }
|
226 |
+
optional: true
|
227 |
+
},
|
228 |
+
{
|
229 |
+
name: "beam_search_diversity_rate"
|
230 |
+
data_type: TYPE_FP32
|
231 |
+
dims: [ 1 ]
|
232 |
+
reshape: { shape: [ ] }
|
233 |
+
optional: true
|
234 |
+
},
|
235 |
+
{
|
236 |
+
name: "presence_penalty"
|
237 |
+
data_type: TYPE_FP32
|
238 |
+
dims: [ 1 ]
|
239 |
+
reshape: { shape: [ ] }
|
240 |
+
optional: true
|
241 |
+
},
|
242 |
+
{
|
243 |
+
name: "frequency_penalty"
|
244 |
+
data_type: TYPE_FP32
|
245 |
+
dims: [ 1 ]
|
246 |
+
reshape: { shape: [ ] }
|
247 |
+
optional: true
|
248 |
+
},
|
249 |
+
{
|
250 |
+
name: "random_seed"
|
251 |
+
data_type: TYPE_UINT64
|
252 |
+
dims: [ 1 ]
|
253 |
+
reshape: { shape: [ ] }
|
254 |
+
optional: true
|
255 |
+
},
|
256 |
+
{
|
257 |
+
name: "return_log_probs"
|
258 |
+
data_type: TYPE_BOOL
|
259 |
+
dims: [ 1 ]
|
260 |
+
reshape: { shape: [ ] }
|
261 |
+
optional: true
|
262 |
+
},
|
263 |
+
{
|
264 |
+
name: "return_context_logits"
|
265 |
+
data_type: TYPE_BOOL
|
266 |
+
dims: [ 1 ]
|
267 |
+
reshape: { shape: [ ] }
|
268 |
+
optional: true
|
269 |
+
},
|
270 |
+
{
|
271 |
+
name: "return_generation_logits"
|
272 |
+
data_type: TYPE_BOOL
|
273 |
+
dims: [ 1 ]
|
274 |
+
reshape: { shape: [ ] }
|
275 |
+
optional: true
|
276 |
+
},
|
277 |
+
{
|
278 |
+
name: "return_kv_cache_reuse_stats"
|
279 |
+
data_type: TYPE_BOOL
|
280 |
+
dims: [ 1 ]
|
281 |
+
reshape: { shape: [ ] }
|
282 |
+
optional: true
|
283 |
+
},
|
284 |
+
{
|
285 |
+
name: "exclude_input_in_output"
|
286 |
+
data_type: TYPE_BOOL
|
287 |
+
dims: [ 1 ]
|
288 |
+
reshape: { shape: [ ] }
|
289 |
+
optional: true
|
290 |
+
},
|
291 |
+
{
|
292 |
+
name: "stop"
|
293 |
+
data_type: TYPE_BOOL
|
294 |
+
dims: [ 1 ]
|
295 |
+
reshape: { shape: [ ] }
|
296 |
+
optional: true
|
297 |
+
},
|
298 |
+
{
|
299 |
+
name: "streaming"
|
300 |
+
data_type: TYPE_BOOL
|
301 |
+
dims: [ 1 ]
|
302 |
+
reshape: { shape: [ ] }
|
303 |
+
optional: true
|
304 |
+
},
|
305 |
+
{
|
306 |
+
name: "prompt_embedding_table"
|
307 |
+
data_type: TYPE_FP16
|
308 |
+
dims: [ -1, -1 ]
|
309 |
+
optional: true
|
310 |
+
allow_ragged_batch: true
|
311 |
+
},
|
312 |
+
{
|
313 |
+
name: "prompt_table_extra_ids"
|
314 |
+
data_type: TYPE_UINT64
|
315 |
+
dims: [ -1 ]
|
316 |
+
optional: true
|
317 |
+
allow_ragged_batch: true
|
318 |
+
},
|
319 |
+
{
|
320 |
+
name: "prompt_vocab_size"
|
321 |
+
data_type: TYPE_INT32
|
322 |
+
dims: [ 1 ]
|
323 |
+
reshape: { shape: [ ] }
|
324 |
+
optional: true
|
325 |
+
},
|
326 |
+
# cross_attention_mask shape `[bs, seq_len, num_images*num_tiles]`
|
327 |
+
{
|
328 |
+
name: "cross_attention_mask"
|
329 |
+
data_type: TYPE_BOOL
|
330 |
+
dims: [ -1, -1 ]
|
331 |
+
optional: true
|
332 |
+
allow_ragged_batch: true
|
333 |
+
},
|
334 |
+
# the unique task ID for the given LoRA.
|
335 |
+
# To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights` and `lora_config` must all be given.
|
336 |
+
# The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
|
337 |
+
# If the cache is full, the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
|
338 |
+
{
|
339 |
+
name: "lora_task_id"
|
340 |
+
data_type: TYPE_UINT64
|
341 |
+
dims: [ 1 ]
|
342 |
+
reshape: { shape: [ ] }
|
343 |
+
optional: true
|
344 |
+
},
|
345 |
+
# weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
|
346 |
+
# where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
|
347 |
+
# each of the in / out tensors is first flattened and then concatenated together in the format above.
|
348 |
+
# D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
|
349 |
+
{
|
350 |
+
name: "lora_weights"
|
351 |
+
data_type: TYPE_FP16
|
352 |
+
dims: [ -1, -1 ]
|
353 |
+
optional: true
|
354 |
+
allow_ragged_batch: true
|
355 |
+
},
|
356 |
+
# module identifier (same size as first dimension of lora_weights)
|
357 |
+
# See LoraModule::ModuleType for model id mapping
|
358 |
+
#
|
359 |
+
# "attn_qkv": 0 # compbined qkv adapter
|
360 |
+
# "attn_q": 1 # q adapter
|
361 |
+
# "attn_k": 2 # k adapter
|
362 |
+
# "attn_v": 3 # v adapter
|
363 |
+
# "attn_dense": 4 # adapter for the dense layer in attention
|
364 |
+
# "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
|
365 |
+
# "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
|
366 |
+
# "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
|
367 |
+
#
|
368 |
+
# last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
|
369 |
+
{
|
370 |
+
name: "lora_config"
|
371 |
+
data_type: TYPE_INT32
|
372 |
+
dims: [ -1, 3 ]
|
373 |
+
optional: true
|
374 |
+
allow_ragged_batch: true
|
375 |
+
},
|
376 |
+
{
|
377 |
+
name: "context_phase_params"
|
378 |
+
data_type: TYPE_UINT8
|
379 |
+
dims: [ -1 ]
|
380 |
+
optional: true
|
381 |
+
allow_ragged_batch: true
|
382 |
+
},
|
383 |
+
# skip_cross_attn_blocks shape `[bs, 1]`, only used in mllama
|
384 |
+
{
|
385 |
+
name: "skip_cross_attn_blocks"
|
386 |
+
data_type: TYPE_BOOL
|
387 |
+
dims: [ 1 ]
|
388 |
+
optional: true
|
389 |
+
allow_ragged_batch: true
|
390 |
+
},
|
391 |
+
{
|
392 |
+
name: "retention_token_range_starts"
|
393 |
+
data_type: TYPE_INT32
|
394 |
+
dims: [ -1 ]
|
395 |
+
optional: true
|
396 |
+
allow_ragged_batch: true
|
397 |
+
},
|
398 |
+
{
|
399 |
+
name: "retention_token_range_ends"
|
400 |
+
data_type: TYPE_INT32
|
401 |
+
dims: [ -1 ]
|
402 |
+
optional: true
|
403 |
+
allow_ragged_batch: true
|
404 |
+
},
|
405 |
+
{
|
406 |
+
name: "retention_token_range_priorities"
|
407 |
+
data_type: TYPE_INT32
|
408 |
+
dims: [ -1 ]
|
409 |
+
optional: true
|
410 |
+
allow_ragged_batch: true
|
411 |
+
},
|
412 |
+
{
|
413 |
+
name: "retention_token_range_durations_ms"
|
414 |
+
data_type: TYPE_INT32
|
415 |
+
dims: [ -1 ]
|
416 |
+
optional: true
|
417 |
+
allow_ragged_batch: true
|
418 |
+
},
|
419 |
+
{
|
420 |
+
name: "retention_decode_priority"
|
421 |
+
data_type: TYPE_INT32
|
422 |
+
dims: [ 1 ]
|
423 |
+
optional: true
|
424 |
+
allow_ragged_batch: true
|
425 |
+
},
|
426 |
+
{
|
427 |
+
name: "retention_decode_duration_ms"
|
428 |
+
data_type: TYPE_INT32
|
429 |
+
dims: [ 1 ]
|
430 |
+
optional: true
|
431 |
+
allow_ragged_batch: true
|
432 |
+
},
|
433 |
+
{
|
434 |
+
name: "guided_decoding_guide_type"
|
435 |
+
data_type: TYPE_STRING
|
436 |
+
dims: [ 1 ]
|
437 |
+
optional: true
|
438 |
+
allow_ragged_batch: true
|
439 |
+
},
|
440 |
+
{
|
441 |
+
name: "guided_decoding_guide"
|
442 |
+
data_type: TYPE_STRING
|
443 |
+
dims: [ 1 ]
|
444 |
+
optional: true
|
445 |
+
allow_ragged_batch: true
|
446 |
+
}
|
447 |
+
]
|
448 |
+
output [
|
449 |
+
{
|
450 |
+
name: "output_ids"
|
451 |
+
data_type: TYPE_INT32
|
452 |
+
dims: [ -1, -1 ]
|
453 |
+
},
|
454 |
+
{
|
455 |
+
name: "sequence_length"
|
456 |
+
data_type: TYPE_INT32
|
457 |
+
dims: [ -1 ]
|
458 |
+
},
|
459 |
+
{
|
460 |
+
name: "cum_log_probs"
|
461 |
+
data_type: TYPE_FP32
|
462 |
+
dims: [ -1 ]
|
463 |
+
},
|
464 |
+
{
|
465 |
+
name: "output_log_probs"
|
466 |
+
data_type: TYPE_FP32
|
467 |
+
dims: [ -1, -1 ]
|
468 |
+
},
|
469 |
+
{
|
470 |
+
name: "context_logits"
|
471 |
+
data_type: TYPE_FP16
|
472 |
+
dims: [ -1, -1 ]
|
473 |
+
},
|
474 |
+
{
|
475 |
+
name: "generation_logits"
|
476 |
+
data_type: TYPE_FP16
|
477 |
+
dims: [ -1, -1, -1 ]
|
478 |
+
},
|
479 |
+
{
|
480 |
+
name: "batch_index"
|
481 |
+
data_type: TYPE_INT32
|
482 |
+
dims: [ 1 ]
|
483 |
+
},
|
484 |
+
{
|
485 |
+
name: "sequence_index"
|
486 |
+
data_type: TYPE_INT32
|
487 |
+
dims: [ 1 ]
|
488 |
+
},
|
489 |
+
{
|
490 |
+
name: "context_phase_params"
|
491 |
+
data_type: TYPE_UINT8
|
492 |
+
dims: [ -1 ]
|
493 |
+
},
|
494 |
+
{
|
495 |
+
name: "kv_cache_alloc_new_blocks"
|
496 |
+
data_type: TYPE_INT32
|
497 |
+
dims: [ 1 ]
|
498 |
+
},
|
499 |
+
{
|
500 |
+
name: "kv_cache_reused_blocks"
|
501 |
+
data_type: TYPE_INT32
|
502 |
+
dims: [ 1 ]
|
503 |
+
},
|
504 |
+
{
|
505 |
+
name: "kv_cache_alloc_total_blocks"
|
506 |
+
data_type: TYPE_INT32
|
507 |
+
dims: [ 1 ]
|
508 |
+
}
|
509 |
+
]
|
510 |
+
instance_group [
|
511 |
+
{
|
512 |
+
count: 1
|
513 |
+
kind : KIND_CPU
|
514 |
+
}
|
515 |
+
]
|
516 |
+
parameters: {
|
517 |
+
key: "max_beam_width"
|
518 |
+
value: {
|
519 |
+
string_value: "1"
|
520 |
+
}
|
521 |
+
}
|
522 |
+
parameters: {
|
523 |
+
key: "FORCE_CPU_ONLY_INPUT_TENSORS"
|
524 |
+
value: {
|
525 |
+
string_value: "no"
|
526 |
+
}
|
527 |
+
}
|
528 |
+
parameters: {
|
529 |
+
key: "gpt_model_type"
|
530 |
+
value: {
|
531 |
+
string_value: "inflight_fused_batching"
|
532 |
+
}
|
533 |
+
}
|
534 |
+
parameters: {
|
535 |
+
key: "gpt_model_path"
|
536 |
+
value: {
|
537 |
+
string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
|
538 |
+
}
|
539 |
+
}
|
540 |
+
parameters: {
|
541 |
+
key: "encoder_model_path"
|
542 |
+
value: {
|
543 |
+
string_value: "${encoder_engine_dir}"
|
544 |
+
}
|
545 |
+
}
|
546 |
+
parameters: {
|
547 |
+
key: "max_tokens_in_paged_kv_cache"
|
548 |
+
value: {
|
549 |
+
string_value: "${max_tokens_in_paged_kv_cache}"
|
550 |
+
}
|
551 |
+
}
|
552 |
+
parameters: {
|
553 |
+
key: "max_attention_window_size"
|
554 |
+
value: {
|
555 |
+
string_value: "${max_attention_window_size}"
|
556 |
+
}
|
557 |
+
}
|
558 |
+
parameters: {
|
559 |
+
key: "sink_token_length"
|
560 |
+
value: {
|
561 |
+
string_value: "${sink_token_length}"
|
562 |
+
}
|
563 |
+
}
|
564 |
+
parameters: {
|
565 |
+
key: "batch_scheduler_policy"
|
566 |
+
value: {
|
567 |
+
string_value: "guaranteed_no_evict"
|
568 |
+
}
|
569 |
+
}
|
570 |
+
parameters: {
|
571 |
+
key: "kv_cache_free_gpu_mem_fraction"
|
572 |
+
value: {
|
573 |
+
string_value: "${kv_cache_free_gpu_mem_fraction}"
|
574 |
+
}
|
575 |
+
}
|
576 |
+
parameters: {
|
577 |
+
key: "cross_kv_cache_fraction"
|
578 |
+
value: {
|
579 |
+
string_value: "${cross_kv_cache_fraction}"
|
580 |
+
}
|
581 |
+
}
|
582 |
+
parameters: {
|
583 |
+
key: "kv_cache_host_memory_bytes"
|
584 |
+
value: {
|
585 |
+
string_value: "${kv_cache_host_memory_bytes}"
|
586 |
+
}
|
587 |
+
}
|
588 |
+
# kv_cache_onboard_blocks is for internal implementation.
|
589 |
+
parameters: {
|
590 |
+
key: "kv_cache_onboard_blocks"
|
591 |
+
value: {
|
592 |
+
string_value: "${kv_cache_onboard_blocks}"
|
593 |
+
}
|
594 |
+
}
|
595 |
+
# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
|
596 |
+
# parameters: {
|
597 |
+
# key: "enable_trt_overlap"
|
598 |
+
# value: {
|
599 |
+
# string_value: "${enable_trt_overlap}"
|
600 |
+
# }
|
601 |
+
# }
|
602 |
+
parameters: {
|
603 |
+
key: "exclude_input_in_output"
|
604 |
+
value: {
|
605 |
+
string_value: "True"
|
606 |
+
}
|
607 |
+
}
|
608 |
+
parameters: {
|
609 |
+
key: "cancellation_check_period_ms"
|
610 |
+
value: {
|
611 |
+
string_value: "${cancellation_check_period_ms}"
|
612 |
+
}
|
613 |
+
}
|
614 |
+
parameters: {
|
615 |
+
key: "stats_check_period_ms"
|
616 |
+
value: {
|
617 |
+
string_value: "${stats_check_period_ms}"
|
618 |
+
}
|
619 |
+
}
|
620 |
+
parameters: {
|
621 |
+
key: "iter_stats_max_iterations"
|
622 |
+
value: {
|
623 |
+
string_value: "${iter_stats_max_iterations}"
|
624 |
+
}
|
625 |
+
}
|
626 |
+
parameters: {
|
627 |
+
key: "request_stats_max_iterations"
|
628 |
+
value: {
|
629 |
+
string_value: "${request_stats_max_iterations}"
|
630 |
+
}
|
631 |
+
}
|
632 |
+
parameters: {
|
633 |
+
key: "enable_kv_cache_reuse"
|
634 |
+
value: {
|
635 |
+
string_value: "True"
|
636 |
+
}
|
637 |
+
}
|
638 |
+
parameters: {
|
639 |
+
key: "normalize_log_probs"
|
640 |
+
value: {
|
641 |
+
string_value: "${normalize_log_probs}"
|
642 |
+
}
|
643 |
+
}
|
644 |
+
parameters: {
|
645 |
+
key: "enable_chunked_context"
|
646 |
+
value: {
|
647 |
+
string_value: "${enable_chunked_context}"
|
648 |
+
}
|
649 |
+
}
|
650 |
+
parameters: {
|
651 |
+
key: "gpu_device_ids"
|
652 |
+
value: {
|
653 |
+
string_value: "${gpu_device_ids}"
|
654 |
+
}
|
655 |
+
}
|
656 |
+
parameters: {
|
657 |
+
key: "participant_ids"
|
658 |
+
value: {
|
659 |
+
string_value: "${participant_ids}"
|
660 |
+
}
|
661 |
+
}
|
662 |
+
parameters: {
|
663 |
+
key: "lora_cache_optimal_adapter_size"
|
664 |
+
value: {
|
665 |
+
string_value: "${lora_cache_optimal_adapter_size}"
|
666 |
+
}
|
667 |
+
}
|
668 |
+
parameters: {
|
669 |
+
key: "lora_cache_max_adapter_size"
|
670 |
+
value: {
|
671 |
+
string_value: "${lora_cache_max_adapter_size}"
|
672 |
+
}
|
673 |
+
}
|
674 |
+
parameters: {
|
675 |
+
key: "lora_cache_gpu_memory_fraction"
|
676 |
+
value: {
|
677 |
+
string_value: "${lora_cache_gpu_memory_fraction}"
|
678 |
+
}
|
679 |
+
}
|
680 |
+
parameters: {
|
681 |
+
key: "lora_cache_host_memory_bytes"
|
682 |
+
value: {
|
683 |
+
string_value: "${lora_cache_host_memory_bytes}"
|
684 |
+
}
|
685 |
+
}
|
686 |
+
parameters: {
|
687 |
+
key: "decoding_mode"
|
688 |
+
value: {
|
689 |
+
string_value: "${decoding_mode}"
|
690 |
+
}
|
691 |
+
}
|
692 |
+
parameters: {
|
693 |
+
key: "executor_worker_path"
|
694 |
+
value: {
|
695 |
+
string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
|
696 |
+
}
|
697 |
+
}
|
698 |
+
parameters: {
|
699 |
+
key: "medusa_choices"
|
700 |
+
value: {
|
701 |
+
string_value: "${medusa_choices}"
|
702 |
+
}
|
703 |
+
}
|
704 |
+
parameters: {
|
705 |
+
key: "eagle_choices"
|
706 |
+
value: {
|
707 |
+
string_value: "${eagle_choices}"
|
708 |
+
}
|
709 |
+
}
|
710 |
+
parameters: {
|
711 |
+
key: "gpu_weights_percent"
|
712 |
+
value: {
|
713 |
+
string_value: "${gpu_weights_percent}"
|
714 |
+
}
|
715 |
+
}
|
716 |
+
parameters: {
|
717 |
+
key: "enable_context_fmha_fp32_acc"
|
718 |
+
value: {
|
719 |
+
string_value: "${enable_context_fmha_fp32_acc}"
|
720 |
+
}
|
721 |
+
}
|
722 |
+
parameters: {
|
723 |
+
key: "multi_block_mode"
|
724 |
+
value: {
|
725 |
+
string_value: "${multi_block_mode}"
|
726 |
+
}
|
727 |
+
}
|
728 |
+
parameters: {
|
729 |
+
key: "cuda_graph_mode"
|
730 |
+
value: {
|
731 |
+
string_value: "${cuda_graph_mode}"
|
732 |
+
}
|
733 |
+
}
|
734 |
+
parameters: {
|
735 |
+
key: "cuda_graph_cache_size"
|
736 |
+
value: {
|
737 |
+
string_value: "${cuda_graph_cache_size}"
|
738 |
+
}
|
739 |
+
}
|
740 |
+
parameters: {
|
741 |
+
key: "speculative_decoding_fast_logits"
|
742 |
+
value: {
|
743 |
+
string_value: "${speculative_decoding_fast_logits}"
|
744 |
+
}
|
745 |
+
}
|
746 |
+
parameters: {
|
747 |
+
key: "tokenizer_dir"
|
748 |
+
value: {
|
749 |
+
string_value: "${tokenizer_dir}"
|
750 |
+
}
|
751 |
+
}
|
752 |
+
parameters: {
|
753 |
+
key: "guided_decoding_backend"
|
754 |
+
value: {
|
755 |
+
string_value: "${guided_decoding_backend}"
|
756 |
+
}
|
757 |
+
}
|
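The config above exposes the raw, engine-level interface of the tensorrt_llm model: it is decoupled (responses are streamed back), and a minimal request only needs input_ids, input_lengths and request_output_len; every other tensor is optional. The snippet below is an illustrative client-side sketch against that interface and is not part of the uploaded repository; the server address, token ids and output length are placeholder assumptions, and in practice requests normally go through the ensemble or tensorrt_llm_bls models, which handle tokenization and detokenization.

import numpy as np
import tritonclient.grpc as grpcclient

def make_input(name, data, dtype):
    t = grpcclient.InferInput(name, list(data.shape), dtype)
    t.set_data_from_numpy(data)
    return t

# Placeholder token ids; normally produced by the preprocessing model.
input_ids = np.array([[1, 15043, 3186]], dtype=np.int32)
inputs = [
    make_input("input_ids", input_ids, "INT32"),
    make_input("input_lengths", np.array([[input_ids.shape[1]]], dtype=np.int32), "INT32"),
    make_input("request_output_len", np.array([[64]], dtype=np.int32), "INT32"),
    make_input("streaming", np.array([[True]]), "BOOL"),
]

# Because model_transaction_policy.decoupled is True, responses arrive on a gRPC stream.
results = []
client = grpcclient.InferenceServerClient("localhost:8001")  # assumed server address
client.start_stream(callback=lambda result, error: results.append((result, error)))
client.async_stream_infer(model_name="tensorrt_llm", inputs=inputs)
client.stop_stream()  # blocks until all responses for this stream have arrived

for result, error in results:
    if error is None:
        print(result.as_numpy("output_ids"), result.as_numpy("sequence_length"))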
tensorrt_llm_bls/1/__pycache__/model.cpython-312.pyc
ADDED
Binary file (5.53 kB).
tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-312.pyc
ADDED
Binary file (21.7 kB).
tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-312.pyc
ADDED
Binary file (19.4 kB).
tensorrt_llm_bls/1/lib/decode.py
ADDED
@@ -0,0 +1,428 @@
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
from collections.abc import Generator
|
28 |
+
from dataclasses import dataclass, field
|
29 |
+
from typing import Optional
|
30 |
+
|
31 |
+
import numpy as np
|
32 |
+
import torch
|
33 |
+
|
34 |
+
|
35 |
+
class RequestValidationError(Exception):
|
36 |
+
pass
|
37 |
+
|
38 |
+
|
39 |
+
def _validate_that(condition: bool, msg: str):
|
40 |
+
if not condition:
|
41 |
+
raise RequestValidationError(msg)
|
42 |
+
|
43 |
+
|
44 |
+
def _validate_non_empty(data, msg: str):
|
45 |
+
if isinstance(data, torch.Tensor):
|
46 |
+
_validate_that(data is not None and data.numel() > 0, msg)
|
47 |
+
else:
|
48 |
+
_validate_that(data is not None and data.size > 0, msg)
|
49 |
+
|
50 |
+
|
51 |
+
def _validate_single_gt_0(data, msg: str):
|
52 |
+
_validate_non_empty(data, msg)
|
53 |
+
_validate_that(data.flatten()[0] > 0, msg)
|
54 |
+
|
55 |
+
|
56 |
+
def _single_value(data: Optional[np.ndarray]):
|
57 |
+
if data is None:
|
58 |
+
return None
|
59 |
+
return data.flatten()[0]
|
60 |
+
|
61 |
+
|
62 |
+
@dataclass
|
63 |
+
class Request:
|
64 |
+
text_input: np.ndarray = field(default_factory=lambda: np.array([]))
|
65 |
+
decoder_text_input: np.ndarray = None
|
66 |
+
image_input: Optional[np.ndarray] = None
|
67 |
+
image_bytes_input: Optional[np.ndarray] = None
|
68 |
+
image_url_input: Optional[np.ndarray] = None
|
69 |
+
video_bytes_input: Optional[np.ndarray] = None
|
70 |
+
max_tokens: Optional[np.ndarray] = None
|
71 |
+
bad_words: Optional[np.ndarray] = None
|
72 |
+
stop_words: Optional[np.ndarray] = None
|
73 |
+
end_id: Optional[np.ndarray] = None
|
74 |
+
pad_id: Optional[np.ndarray] = None
|
75 |
+
top_k: Optional[np.ndarray] = None
|
76 |
+
top_p: Optional[np.ndarray] = None
|
77 |
+
temperature: Optional[np.ndarray] = None
|
78 |
+
length_penalty: Optional[np.ndarray] = None
|
79 |
+
repetition_penalty: Optional[np.ndarray] = None
|
80 |
+
min_length: Optional[np.ndarray] = None
|
81 |
+
return_log_probs: Optional[np.ndarray] = None
|
82 |
+
prompt_embedding_table: Optional[np.ndarray] = None
|
83 |
+
prompt_vocab_size: Optional[np.ndarray] = None
|
84 |
+
prompt_table_extra_id: Optional[np.ndarray] = None
|
85 |
+
embedding_bias_words: Optional[np.ndarray] = None
|
86 |
+
embedding_bias_weights: Optional[np.ndarray] = None
|
87 |
+
num_draft_tokens: Optional[np.ndarray] = None
|
88 |
+
use_draft_logits: Optional[np.ndarray] = None
|
89 |
+
stream: Optional[np.ndarray] = None
|
90 |
+
beam_width: Optional[np.ndarray] = None
|
91 |
+
return_context_logits: Optional[np.ndarray] = None
|
92 |
+
return_generation_logits: Optional[np.ndarray] = None
|
93 |
+
random_seed: Optional[np.ndarray] = None
|
94 |
+
presence_penalty: Optional[np.ndarray] = None
|
95 |
+
frequency_penalty: Optional[np.ndarray] = None
|
96 |
+
lora_task_id: Optional[np.ndarray] = None
|
97 |
+
lora_weights: Optional[np.ndarray] = None
|
98 |
+
lora_config: Optional[np.ndarray] = None
|
99 |
+
exclude_input_in_output: Optional[np.ndarray] = None
|
100 |
+
return_kv_cache_reuse_stats: Optional[np.ndarray] = None
|
101 |
+
guided_decoding_guide_type: Optional[np.ndarray] = None
|
102 |
+
guided_decoding_guide: Optional[np.ndarray] = None
|
103 |
+
|
104 |
+
def validate(self):
|
105 |
+
_validate_non_empty(self.text_input, "text_input is required")
|
106 |
+
_validate_single_gt_0(self.max_tokens,
|
107 |
+
"max_tokens must be a single value > 0")
|
108 |
+
|
109 |
+
num_draft_tokens = _single_value(self.num_draft_tokens)
|
110 |
+
_single_value(self.return_generation_logits)
|
111 |
+
context_logits = _single_value(self.return_context_logits)
|
112 |
+
|
113 |
+
if num_draft_tokens:
|
114 |
+
_validate_that(
|
115 |
+
not self.stream.any(),
|
116 |
+
"streaming is not supported with speculative decoding")
|
117 |
+
_validate_that(
|
118 |
+
not context_logits,
|
119 |
+
"context logits are not supported with speculative decoding")
|
120 |
+
|
121 |
+
|
122 |
+
@dataclass
|
123 |
+
class DraftRequest:
|
124 |
+
draft_input_ids: Optional[np.ndarray] = None
|
125 |
+
draft_logits: Optional[np.ndarray] = None
|
126 |
+
|
127 |
+
|
128 |
+
@dataclass
|
129 |
+
class PreprocResponse:
|
130 |
+
input_ids: np.ndarray = field(default_factory=lambda: np.array([]))
|
131 |
+
decoder_input_ids: np.ndarray = None
|
132 |
+
input_lengths: np.ndarray = field(default_factory=lambda: np.array([]))
|
133 |
+
decoder_input_lengths: np.ndarray = None
|
134 |
+
bad_words_list: Optional[np.ndarray] = None
|
135 |
+
stop_words_list: Optional[np.ndarray] = None
|
136 |
+
embedding_bias: Optional[np.ndarray] = None
|
137 |
+
end_id: Optional[np.ndarray] = None
|
138 |
+
pad_id: Optional[np.ndarray] = None
|
139 |
+
prompt_table_extra_ids: Optional[np.ndarray] = None
|
140 |
+
pixel_values: Optional[np.ndarray] = None
|
141 |
+
image_sizes: Optional[np.ndarray] = None
|
142 |
+
is_video_input: Optional[np.ndarray] = None
|
143 |
+
|
144 |
+
@classmethod
|
145 |
+
def with_new_inputs(cls,
|
146 |
+
other,
|
147 |
+
input_ids: Optional[np.ndarray] = None,
|
148 |
+
input_lengths: Optional[np.ndarray] = None):
|
149 |
+
return cls(input_ids=(input_ids
|
150 |
+
if input_ids is not None else other.input_ids),
|
151 |
+
input_lengths=(input_lengths if input_lengths is not None
|
152 |
+
else other.input_lengths),
|
153 |
+
decoder_input_ids=other.decoder_input_ids,
|
154 |
+
decoder_input_lengths=other.decoder_input_lengths,
|
155 |
+
bad_words_list=other.bad_words_list,
|
156 |
+
stop_words_list=other.stop_words_list,
|
157 |
+
end_id=other.end_id,
|
158 |
+
pad_id=other.pad_id,
|
159 |
+
prompt_table_extra_ids=other.prompt_table_extra_ids)
|
160 |
+
|
161 |
+
|
162 |
+
@dataclass
|
163 |
+
class MultimodalEncResponse:
|
164 |
+
prompt_embedding_table: Optional[torch.Tensor] = None
|
165 |
+
prompt_vocab_size: Optional[np.ndarray] = None
|
166 |
+
|
167 |
+
|
168 |
+
@dataclass
|
169 |
+
class GenerationResponse:
|
170 |
+
output_ids: np.ndarray = field(default_factory=lambda: np.array([]))
|
171 |
+
sequence_length: np.ndarray = field(default_factory=lambda: np.array([]))
|
172 |
+
cum_log_probs: Optional[np.ndarray] = None
|
173 |
+
output_log_probs: Optional[np.ndarray] = None
|
174 |
+
context_logits: Optional[np.ndarray] = None
|
175 |
+
generation_logits: Optional[np.ndarray] = None
|
176 |
+
batch_index: Optional[np.ndarray] = None
|
177 |
+
sequence_index: Optional[np.ndarray] = None
|
178 |
+
kv_cache_alloc_new_blocks: Optional[np.ndarray] = None
|
179 |
+
kv_cache_reused_blocks: Optional[np.ndarray] = None
|
180 |
+
kv_cache_alloc_total_blocks: Optional[np.ndarray] = None
|
181 |
+
|
182 |
+
|
183 |
+
@dataclass
|
184 |
+
class Response:
|
185 |
+
text_output: np.ndarray = field(default_factory=lambda: np.array([]))
|
186 |
+
cum_log_probs: Optional[np.ndarray] = None
|
187 |
+
output_log_probs: Optional[np.ndarray] = None
|
188 |
+
context_logits: Optional[np.ndarray] = None
|
189 |
+
generation_logits: Optional[np.ndarray] = None
|
190 |
+
batch_index: Optional[np.ndarray] = None
|
191 |
+
sequence_index: Optional[np.ndarray] = None
|
192 |
+
kv_cache_alloc_new_blocks: Optional[np.ndarray] = None
|
193 |
+
kv_cache_reused_blocks: Optional[np.ndarray] = None
|
194 |
+
kv_cache_alloc_total_blocks: Optional[np.ndarray] = None
|
195 |
+
|
196 |
+
def __eq__(self, o) -> bool:
|
197 |
+
"""Just for testing"""
|
198 |
+
if not isinstance(o, Response):
|
199 |
+
return False
|
200 |
+
return (np.array_equal(self.text_output, o.text_output)
|
201 |
+
and np.array_equal(self.cum_log_probs, o.cum_log_probs)
|
202 |
+
and np.array_equal(self.output_log_probs, o.output_log_probs)
|
203 |
+
and np.array_equal(self.context_logits, o.context_logits)
|
204 |
+
and np.array_equal(self.generation_logits, o.generation_logits)
|
205 |
+
and np.array_equal(self.batch_index, o.batch_index)
|
206 |
+
and np.array_equal(self.sequence_index, o.sequence_index)
|
207 |
+
and np.array_equal(self.kv_cache_alloc_new_blocks,
|
209 |
+
o.kv_cache_alloc_new_blocks)
|
210 |
+
and np.array_equal(self.kv_cache_reused_blocks,
|
211 |
+
o.kv_cache_reused_blocks)
|
212 |
+
and np.array_equal(self.kv_cache_alloc_total_blocks,
|
213 |
+
o.kv_cache_alloc_total_blocks))
|
214 |
+
|
215 |
+
|
216 |
+
class Decoder:
|
217 |
+
|
218 |
+
def __init__(self, streaming=False, accumulate=False):
|
219 |
+
self._streaming = streaming
|
220 |
+
self._accumulate = accumulate
|
221 |
+
|
222 |
+
self._accumulated_tokens = []
|
223 |
+
|
224 |
+
def decode(self,
|
225 |
+
request: Request,
|
226 |
+
speculative_decoding=False,
|
227 |
+
is_multimodal=False) -> Generator[Response, None, None]:
|
228 |
+
|
229 |
+
batch_size = request.text_input.shape[0]
|
230 |
+
self._accumulated_tokens = [None] * batch_size
|
231 |
+
preproc_response = self.preprocess(request)
|
232 |
+
|
233 |
+
multimodal_enc_response = None
|
234 |
+
if is_multimodal:
|
235 |
+
multimodal_enc_response = self._multimodal_enc_generate(
|
236 |
+
request, preproc_response)
|
237 |
+
|
238 |
+
if speculative_decoding:
|
239 |
+
if batch_size > 1:
|
240 |
+
raise Exception(
|
241 |
+
"speculative decoding is not supported with batch size > 1"
|
242 |
+
)
|
243 |
+
for gen_response in self._spec_generate(preproc_response, request):
|
244 |
+
yield self.postprocess(gen_response, batch_size)
|
245 |
+
else:
|
246 |
+
if not self._streaming and batch_size == 1:
|
247 |
+
gen_response = self._generate_non_streaming(
|
248 |
+
preproc_response,
|
249 |
+
request,
|
250 |
+
multimodal_enc_response=multimodal_enc_response)
|
251 |
+
yield self.postprocess(gen_response, batch_size)
|
252 |
+
else:
|
253 |
+
for gen_response in self._generate(
|
254 |
+
preproc_response,
|
255 |
+
request,
|
256 |
+
multimodal_enc_response=multimodal_enc_response):
|
257 |
+
yield self.postprocess(gen_response, batch_size)
|
258 |
+
|
259 |
+
def encountered_stop_words(self, input_ids, stop_words_ids):
|
260 |
+
for stop_word_ids in stop_words_ids:
|
261 |
+
if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids):
|
262 |
+
return True
|
263 |
+
return False
|
264 |
+
|
265 |
+
def _spec_generate(
|
266 |
+
self, preproc: PreprocResponse,
|
267 |
+
request: Request) -> Generator[GenerationResponse, None, None]:
|
268 |
+
|
269 |
+
if preproc.input_ids.shape[0] > 1:
|
270 |
+
raise Exception(
|
271 |
+
"Speculative decoding does not support batch size > 1.")
|
272 |
+
|
273 |
+
prompt_input_ids: np.ndarray = preproc.input_ids[0]
|
274 |
+
input_ids: np.ndarray = prompt_input_ids
|
275 |
+
output_len: int = request.max_tokens[0][0]
|
276 |
+
last_input_ids: np.ndarray = None
|
277 |
+
draft_output_ids: np.ndarray = None
|
278 |
+
draft_logits: np.ndarray = None
|
279 |
+
|
280 |
+
target_response: GenerationResponse = None
|
281 |
+
|
282 |
+
cur_preproc = preproc
|
283 |
+
|
284 |
+
counter = 0
|
285 |
+
while True:
|
286 |
+
counter += 1
|
287 |
+
num_draft_tokens = min(
|
288 |
+
request.num_draft_tokens[0][0],
|
289 |
+
len(prompt_input_ids) + output_len - len(input_ids) - 1)
|
290 |
+
|
291 |
+
draft_request = None
|
292 |
+
if num_draft_tokens > 0:
|
293 |
+
request.min_length = np.array([num_draft_tokens],
|
294 |
+
dtype=np.int32)
|
295 |
+
draft_response: GenerationResponse = self._draft_generate_non_streaming(
|
296 |
+
cur_preproc, request, num_draft_tokens)
|
297 |
+
seq_len: int = draft_response.sequence_length[0][0]
|
298 |
+
# [1, beamWidth, outputLength] -> [outputLen]
|
299 |
+
draft_output_ids = draft_response.output_ids[0][0]
|
300 |
+
# [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded]
|
301 |
+
if request.use_draft_logits is not None and request.use_draft_logits[
|
302 |
+
0]:
|
303 |
+
if draft_response.generation_logits is not None:
|
304 |
+
draft_logits = draft_response.generation_logits[0][0]
|
305 |
+
|
306 |
+
input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
|
307 |
+
if len(input_draft_tokens) > 0:
|
308 |
+
draft_request = DraftRequest(
|
309 |
+
draft_input_ids=np.expand_dims(input_draft_tokens, 0))
|
310 |
+
if request.use_draft_logits is not None and request.use_draft_logits[
|
311 |
+
0]:
|
312 |
+
draft_request.draft_logits = np.expand_dims(
|
313 |
+
draft_logits[-len(input_draft_tokens):], 0)
|
314 |
+
else:
|
315 |
+
draft_request = DraftRequest()
|
316 |
+
request.min_length = None
|
317 |
+
else:
|
318 |
+
draft_request = DraftRequest()
|
319 |
+
target_response = self._generate_non_streaming(
|
320 |
+
cur_preproc, request, draft_request)
|
321 |
+
last_input_ids = input_ids
|
322 |
+
input_ids = target_response.output_ids[0][0]
|
323 |
+
cur_preproc = PreprocResponse.with_new_inputs(
|
324 |
+
cur_preproc, np.expand_dims(input_ids, 0),
|
325 |
+
np.array([[len(input_ids)]], dtype=np.int32))
|
326 |
+
|
327 |
+
# Evaluate criteria to stop generation loop.
|
328 |
+
# If we've hit or exceeded the max output length, should stop
|
329 |
+
length_stop = (len(input_ids)
|
330 |
+
>= len(prompt_input_ids) + output_len)
|
331 |
+
if length_stop:
|
332 |
+
break
|
333 |
+
# If draft and target have same outputs, should stop. Normally target should return 1 more token.
|
334 |
+
# If they are the same length, they should differ at the last token
|
335 |
+
target_draft_equal = draft_output_ids is not None and np.array_equal(
|
336 |
+
draft_output_ids, input_ids)
|
337 |
+
if target_draft_equal:
|
338 |
+
break
|
339 |
+
# If tokens no longer change, should stop, means we have hit early stopping
|
340 |
+
last_current_equal = np.array_equal(last_input_ids, input_ids)
|
341 |
+
if last_current_equal:
|
342 |
+
break
|
343 |
+
# Need to check if stop words were encountered
|
344 |
+
hit_stop_words = self.encountered_stop_words(
|
345 |
+
input_ids, preproc.stop_words_list[0])
|
346 |
+
if hit_stop_words:
|
347 |
+
break
|
348 |
+
|
349 |
+
yield target_response
|
350 |
+
|
351 |
+
def _draft_generate_non_streaming(
|
352 |
+
self, preproc: PreprocResponse, request: Request,
|
353 |
+
num_draft_tokens: int) -> GenerationResponse:
|
354 |
+
raise NotImplementedError()
|
355 |
+
|
356 |
+
def _multimodal_enc_generate(
|
357 |
+
self,
|
358 |
+
request: Request,
|
359 |
+
) -> MultimodalEncResponse:
|
360 |
+
raise NotImplementedError()
|
361 |
+
|
362 |
+
def _generate(
|
363 |
+
self,
|
364 |
+
preproc: PreprocResponse,
|
365 |
+
request: Request,
|
366 |
+
draft_request: Optional[DraftRequest] = None,
|
367 |
+
multimodal_enc_response: Optional[MultimodalEncResponse] = None,
|
368 |
+
) -> Generator[GenerationResponse, None, None]:
|
369 |
+
raise NotImplementedError()
|
370 |
+
|
371 |
+
def _generate_non_streaming(
|
372 |
+
self,
|
373 |
+
preproc: PreprocResponse,
|
374 |
+
request: Request,
|
375 |
+
draft_request: Optional[DraftRequest] = None,
|
376 |
+
multimodal_enc_response: Optional[MultimodalEncResponse] = None,
|
377 |
+
) -> GenerationResponse:
|
378 |
+
raise NotImplementedError()
|
379 |
+
|
380 |
+
def postprocess(self, gen_response: GenerationResponse,
|
381 |
+
batch_size) -> Response:
|
382 |
+
if self._accumulate and self._streaming:
|
383 |
+
new_tokens: np.ndarray = gen_response.output_ids
|
384 |
+
if new_tokens.ndim != 3:
|
385 |
+
raise Exception("Expected output_ids tensor to have 3 dims.")
|
386 |
+
if new_tokens.shape[0] != 1:
|
387 |
+
raise Exception("Expected batch size of 1")
|
388 |
+
if new_tokens.shape[1] != 1:
|
389 |
+
raise Exception(
|
390 |
+
"Accumulation of tokens is only implemented for beam width = 1"
|
391 |
+
)
|
392 |
+
|
393 |
+
batch_index = gen_response.batch_index
|
394 |
+
if batch_index is not None:
|
395 |
+
if batch_index.ndim != 2:
|
396 |
+
raise Exception(
|
397 |
+
"Expected batch_index tensor to have 2 dims.")
|
398 |
+
if batch_index.shape[0] != 1:
|
399 |
+
raise Exception("Expected batch size of 1")
|
400 |
+
if batch_index.shape[1] != 1:
|
401 |
+
raise Exception("Expected only one batch_index")
|
402 |
+
|
403 |
+
batch_index = batch_index[0][0] if batch_index is not None else 0
|
404 |
+
|
405 |
+
self._accumulated_tokens[batch_index] = new_tokens if (
|
406 |
+
self._accumulated_tokens[batch_index]
|
407 |
+
is None) else np.concatenate(
|
408 |
+
(self._accumulated_tokens[batch_index], new_tokens),
|
409 |
+
axis=2)
|
410 |
+
sequence_lengths = np.array(
|
411 |
+
[[self._accumulated_tokens[batch_index].shape[2]]],
|
412 |
+
dtype=np.int32)
|
413 |
+
return self._postprocess(self._accumulated_tokens[batch_index],
|
414 |
+
sequence_lengths, gen_response)
|
415 |
+
else:
|
416 |
+
return self._postprocess(gen_response.output_ids, None,
|
417 |
+
gen_response)
|
418 |
+
|
419 |
+
def _postprocess(self, tokens: np.ndarray,
|
420 |
+
sequence_lengths: Optional[np.ndarray],
|
421 |
+
gen_response: GenerationResponse) -> Response:
|
422 |
+
raise NotImplementedError()
|
423 |
+
|
424 |
+
def preprocess(self, request: Request) -> PreprocResponse:
|
425 |
+
raise NotImplementedError()
|
426 |
+
|
427 |
+
def reset_decoder(self):
|
428 |
+
self._accumulated_tokens = []
|
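Decoder.postprocess above rebuilds the full token sequence when the decoder runs with streaming=True and accumulate=True: every streamed chunk must be a [batch=1, beam=1, new_tokens] tensor, and chunks are concatenated along the token axis. A minimal standalone sketch of that accumulation, illustrative only and using made-up token ids:

import numpy as np

accumulated = None
for chunk in (np.array([[[11, 12]]]), np.array([[[13]]]), np.array([[[14, 15]]])):
    # Same rule as Decoder.postprocess: keep the first chunk as-is, then
    # concatenate later chunks along axis=2 (the token dimension).
    accumulated = chunk if accumulated is None else np.concatenate(
        (accumulated, chunk), axis=2)

print(accumulated)           # [[[11 12 13 14 15]]]
print(accumulated.shape[2])  # 5 -> reported back as the sequence length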
tensorrt_llm_bls/1/lib/triton_decoder.py
ADDED
@@ -0,0 +1,542 @@
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
from collections.abc import Callable
|
28 |
+
from typing import Dict, Optional
|
29 |
+
|
30 |
+
import numpy as np
|
31 |
+
import triton_python_backend_utils as pb_utils
|
32 |
+
from lib.decode import *
|
33 |
+
from torch.utils.dlpack import from_dlpack, to_dlpack
|
34 |
+
from typing_extensions import override
|
35 |
+
|
36 |
+
|
37 |
+
class TritonDecoder(Decoder):
|
38 |
+
|
39 |
+
def __init__(self,
|
40 |
+
streaming=False,
|
41 |
+
accumulate=False,
|
42 |
+
preproc_model_name="preprocessing",
|
43 |
+
postproc_model_name="postprocessing",
|
44 |
+
llm_model_name="tensorrt_llm",
|
45 |
+
draft_llm_model_name: Optional[str] = None,
|
46 |
+
multimodal_encoders_name: Optional[str] = None):
|
47 |
+
super().__init__(streaming=streaming, accumulate=accumulate)
|
48 |
+
self.preproc_model_name = preproc_model_name
|
49 |
+
self.postproc_model_name = postproc_model_name
|
50 |
+
self.llm_model_name = llm_model_name
|
51 |
+
self.draft_llm_model_name = draft_llm_model_name
|
52 |
+
self.multimodal_encoders_name = multimodal_encoders_name
|
53 |
+
|
54 |
+
self._preproc_outputs = [
|
55 |
+
"INPUT_ID",
|
56 |
+
"DECODER_INPUT_ID",
|
57 |
+
"REQUEST_INPUT_LEN",
|
58 |
+
"REQUEST_DECODER_INPUT_LEN",
|
59 |
+
"BAD_WORDS_IDS",
|
60 |
+
"STOP_WORDS_IDS",
|
61 |
+
"EMBEDDING_BIAS",
|
62 |
+
"OUT_PAD_ID",
|
63 |
+
"OUT_END_ID",
|
64 |
+
"OUT_PROMPT_TABLE_EXTRA_IDS",
|
65 |
+
"PIXEL_VALUES",
|
66 |
+
"IMAGE_SIZES",
|
67 |
+
"IS_VIDEO_INPUT",
|
68 |
+
]
|
69 |
+
|
70 |
+
self._multimodal_enc_outputs = [
|
71 |
+
"OUT_PROMPT_EMBEDDING_TABLE", "OUT_PROMPT_VOCAB_SIZE"
|
72 |
+
]
|
73 |
+
|
74 |
+
self._llm_outputs = [
|
75 |
+
"output_ids", "sequence_length", "cum_log_probs",
|
76 |
+
"output_log_probs", "context_logits", "generation_logits",
|
77 |
+
"batch_index", "sequence_index", "kv_cache_alloc_new_blocks",
|
78 |
+
"kv_cache_reused_blocks", "kv_cache_alloc_total_blocks"
|
79 |
+
]
|
80 |
+
|
81 |
+
self._postproc_outputs = [
|
82 |
+
"OUTPUT",
|
83 |
+
]
|
84 |
+
|
85 |
+
self.input_names = [
|
86 |
+
"text_input", "decoder_text_input", "image_input",
|
87 |
+
"image_bytes_input", "image_url_input", "video_bytes_input",
|
88 |
+
"max_tokens", "bad_words", "stop_words", "end_id", "pad_id",
|
89 |
+
"top_k", "top_p", "temperature", "length_penalty",
|
90 |
+
"repetition_penalty", "min_length", "presence_penalty",
|
91 |
+
"frequency_penalty", "random_seed", "return_log_probs",
|
92 |
+
"return_context_logits", "return_generation_logits", "beam_width",
|
93 |
+
"stream", "prompt_embedding_table", "prompt_vocab_size",
|
94 |
+
"prompt_table_extra_id", "embedding_bias_words",
|
95 |
+
"embedding_bias_weights", "num_draft_tokens", "use_draft_logits",
|
96 |
+
"lora_task_id", "lora_weights", "lora_config",
|
97 |
+
"exclude_input_in_output", "return_kv_cache_reuse_stats",
|
98 |
+
"guided_decoding_guide_type", "guided_decoding_guide"
|
99 |
+
]
|
100 |
+
|
101 |
+
self.__undo_reshape_whitelist = {
|
102 |
+
"max_tokens", "end_id", "pad_id", "top_k", "top_p", "temperature",
|
103 |
+
"length_penalty", "repetition_penalty", "min_length",
|
104 |
+
"presence_penalty", "frequency_penalty", "random_seed",
|
105 |
+
"return_log_probs", "return_context_logits",
|
106 |
+
"return_generation_logits", "beam_width", "stream",
|
107 |
+
"prompt_vocab_size", "num_draft_tokens", "use_draft_logits",
|
108 |
+
"exclude_input_in_output", "return_kv_cache_reuse_stats",
|
109 |
+
"lora_weights", "lora_config", "lora_task_id"
|
110 |
+
}
|
111 |
+
|
112 |
+
def _exec_triton_request(self, request):
|
113 |
+
responses = request.exec(decoupled=True)
|
114 |
+
for r in responses:
|
115 |
+
if r.has_error():
|
116 |
+
raise pb_utils.TritonModelException(r.error().message())
|
117 |
+
yield r
|
118 |
+
|
119 |
+
def _exec_triton_request_single(self, request):
|
120 |
+
responses = request.exec(decoupled=False)
|
121 |
+
if responses.has_error():
|
122 |
+
raise pb_utils.TritonModelException(responses.error().message())
|
123 |
+
return responses
|
124 |
+
|
125 |
+
def create_triton_response(self, response: Response):
|
126 |
+
name_map = {
|
127 |
+
"text_output": "text_output",
|
128 |
+
"cum_log_probs": "cum_log_probs",
|
129 |
+
"output_log_probs": "output_log_probs",
|
130 |
+
"context_logits": "context_logits",
|
131 |
+
"generation_logits": "generation_logits",
|
132 |
+
"batch_index": "batch_index",
|
133 |
+
"sequence_index": "sequence_index",
|
134 |
+
"kv_cache_alloc_new_blocks": "kv_cache_alloc_new_blocks",
|
135 |
+
"kv_cache_reused_blocks": "kv_cache_reused_blocks",
|
136 |
+
"kv_cache_alloc_total_blocks": "kv_cache_alloc_total_blocks"
|
137 |
+
}
|
138 |
+
tensors = self.create_triton_tensors(response, name_map)
|
139 |
+
return pb_utils.InferenceResponse(output_tensors=tensors)
|
140 |
+
|
141 |
+
def convert_triton_request(self, triton_request) -> Request:
|
142 |
+
request = Request()
|
143 |
+
for triton_name in self.input_names:
|
144 |
+
tensor = pb_utils.get_input_tensor_by_name(triton_request,
|
145 |
+
triton_name)
|
146 |
+
target_name = triton_name
|
147 |
+
if tensor is None:
|
148 |
+
continue
|
149 |
+
if not hasattr(request, target_name):
|
150 |
+
raise AttributeError(
|
151 |
+
f"Request has no attribute '{target_name}'")
|
152 |
+
setattr(request, target_name, tensor.as_numpy())
|
153 |
+
return request
|
154 |
+
|
155 |
+
def convert_triton_response(self,
|
156 |
+
triton_response,
|
157 |
+
response_factory: Callable,
|
158 |
+
name_map=None):
|
159 |
+
response = response_factory()
|
160 |
+
for tensor in triton_response.output_tensors():
|
161 |
+
if tensor is None:
|
162 |
+
continue
|
163 |
+
triton_name = tensor.name()
|
164 |
+
if tensor.is_cpu():
|
165 |
+
value = tensor.as_numpy()
|
166 |
+
else:
|
167 |
+
# If the tensor is in GPU memory make it torch.Tensor type
|
168 |
+
value = from_dlpack(tensor.to_dlpack())
|
169 |
+
target_name = triton_name
|
170 |
+
if name_map and triton_name in name_map:
|
171 |
+
target_name = name_map[triton_name]
|
172 |
+
if name_map and triton_name not in name_map:
|
173 |
+
continue
|
174 |
+
if target_name is None:
|
175 |
+
# explicitly ignore this triton input
|
176 |
+
continue
|
177 |
+
if not hasattr(response, target_name):
|
178 |
+
raise AttributeError(
|
179 |
+
f"response object has not attribute '{target_name}'")
|
180 |
+
setattr(response, target_name, value)
|
181 |
+
return response
|
182 |
+
|
183 |
+
def __undo_reshape(self, x, name):
|
184 |
+
if name in self.__undo_reshape_whitelist and len(x.shape) == 1:
|
185 |
+
# handle reshapes
|
186 |
+
return np.expand_dims(x, 0)
|
187 |
+
else:
|
188 |
+
return x
|
189 |
+
|
190 |
+
def create_triton_tensors(self, obj, name_map: dict):
|
191 |
+
tensors = []
|
192 |
+
for name, triton_name in name_map.items():
|
193 |
+
if triton_name is None:
|
194 |
+
continue
|
195 |
+
value = getattr(obj, name)
|
196 |
+
if value is None:
|
197 |
+
continue
|
198 |
+
if isinstance(value, np.ndarray):
|
199 |
+
t = pb_utils.Tensor(triton_name,
|
200 |
+
self.__undo_reshape(value, name))
|
201 |
+
elif isinstance(value, torch.Tensor):
|
202 |
+
t = pb_utils.Tensor.from_dlpack(
|
203 |
+
triton_name, to_dlpack(self.__undo_reshape(value, name)))
|
204 |
+
tensors.append(t)
|
205 |
+
return tensors
|
206 |
+
|
207 |
+
@override
|
208 |
+
def preprocess(self, request: Request) -> PreprocResponse:
|
209 |
+
input_tensors = self._get_preproc_tensors(request)
|
210 |
+
triton_req = pb_utils.InferenceRequest(
|
211 |
+
model_name=self.preproc_model_name,
|
212 |
+
inputs=input_tensors,
|
213 |
+
requested_output_names=self._preproc_outputs)
|
214 |
+
triton_output = self._exec_triton_request_single(triton_req)
|
215 |
+
return self._get_preproc_response(triton_output)
|
216 |
+
|
217 |
+
def _get_preproc_tensors(self, request: Request):
|
218 |
+
name_map = {
|
219 |
+
"text_input": "QUERY",
|
220 |
+
"image_bytes_input": "IMAGE_BYTES",
|
221 |
+
"image_url_input": "IMAGE_URL",
|
222 |
+
"video_bytes_input": "VIDEO_BYTES",
|
223 |
+
"decoder_text_input": "DECODER_QUERY",
|
224 |
+
"max_tokens": "REQUEST_OUTPUT_LEN",
|
225 |
+
"bad_words": "BAD_WORDS_DICT",
|
226 |
+
"stop_words": "STOP_WORDS_DICT",
|
227 |
+
"embedding_bias_words": "EMBEDDING_BIAS_WORDS",
|
228 |
+
"embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS",
|
229 |
+
"pad_id": "PAD_ID",
|
230 |
+
"end_id": "END_ID",
|
231 |
+
"prompt_table_extra_id": "PROMPT_TABLE_EXTRA_ID",
|
232 |
+
}
|
233 |
+
return self.create_triton_tensors(request, name_map)
|
234 |
+
|
235 |
+
def _get_preproc_response(self, triton_output):
|
236 |
+
name_map = {
|
237 |
+
"INPUT_ID": "input_ids",
|
238 |
+
"DECODER_INPUT_ID": "decoder_input_ids",
|
239 |
+
"REQUEST_INPUT_LEN": "input_lengths",
|
240 |
+
"REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
|
241 |
+
"BAD_WORDS_IDS": "bad_words_list",
|
242 |
+
"STOP_WORDS_IDS": "stop_words_list",
|
243 |
+
"EMBEDDING_BIAS": "embedding_bias",
|
244 |
+
"OUT_PAD_ID": "pad_id",
|
245 |
+
"OUT_END_ID": "end_id",
|
246 |
+
"OUT_PROMPT_TABLE_EXTRA_IDS": "prompt_table_extra_ids",
|
247 |
+
"PIXEL_VALUES": "pixel_values",
|
248 |
+
"IMAGE_SIZES": "image_sizes",
|
249 |
+
"IS_VIDEO_INPUT": "is_video_input",
|
250 |
+
}
|
251 |
+
return self.convert_triton_response(triton_output, PreprocResponse,
|
252 |
+
name_map)
|
253 |
+
|
254 |
+
@override
|
255 |
+
def _multimodal_enc_generate(
|
256 |
+
self,
|
257 |
+
request: Request,
|
258 |
+
preproc: PreprocResponse,
|
259 |
+
) -> MultimodalEncResponse:
|
260 |
+
input_tensors = self._get_multimodal_enc_tensors(request, preproc)
|
261 |
+
triton_req = pb_utils.InferenceRequest(
|
262 |
+
model_name=self.multimodal_encoders_name,
|
263 |
+
inputs=input_tensors,
|
264 |
+
requested_output_names=self._multimodal_enc_outputs)
|
265 |
+
triton_output = self._exec_triton_request_single(triton_req)
|
266 |
+
return self._get_multimodal_enc_response(triton_output)
|
267 |
+
|
268 |
+
def _get_multimodal_enc_tensors(self, request: Request,
|
269 |
+
preproc: PreprocResponse):
|
270 |
+
name_map_request = {
|
271 |
+
"image_input": "IMAGE",
|
272 |
+
}
|
273 |
+
name_map_preproc = {
|
274 |
+
"pixel_values": "pixel_values",
|
275 |
+
"image_sizes": "image_sizes",
|
276 |
+
"is_video_input": "is_video_input"
|
277 |
+
}
|
278 |
+
tensors = []
|
279 |
+
tensors.extend(self.create_triton_tensors(request, name_map_request))
|
280 |
+
tensors.extend(self.create_triton_tensors(preproc, name_map_preproc))
|
281 |
+
return tensors
|
282 |
+
|
283 |
+
def _get_multimodal_enc_response(self, triton_output):
|
284 |
+
name_map = {
|
285 |
+
"OUT_PROMPT_EMBEDDING_TABLE": "prompt_embedding_table",
|
286 |
+
"OUT_PROMPT_VOCAB_SIZE": "prompt_vocab_size",
|
287 |
+
}
|
288 |
+
return self.convert_triton_response(triton_output,
|
289 |
+
MultimodalEncResponse, name_map)
|
290 |
+
|
291 |
+
@override
|
292 |
+
def _draft_generate_non_streaming(
|
293 |
+
self, preproc: PreprocResponse, request: Request,
|
294 |
+
num_draft_tokens: int) -> GenerationResponse:
|
295 |
+
input_tensors = self._get_llm_tensors(preproc, request,
|
296 |
+
num_draft_tokens, None, True)
|
297 |
+
triton_req = pb_utils.InferenceRequest(
|
            model_name=self.draft_llm_model_name,
            inputs=input_tensors,
            requested_output_names=self._llm_outputs)
        triton_response = self._exec_triton_request_single(triton_req)
        llm_response = self._get_llm_response(triton_response)
        return llm_response

    @override
    def _generate(
            self,
            preproc: PreprocResponse,
            request: Request,
            draft_request: Optional[DraftRequest] = None,
            multimodal_enc_response: Optional[MultimodalEncResponse] = None
    ) -> Generator[GenerationResponse, None, None]:
        input_tensors = self._get_llm_tensors(
            preproc,
            request,
            None,
            draft_request,
            multimodal_enc_response=multimodal_enc_response)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.llm_model_name,
            inputs=input_tensors,
            requested_output_names=self._llm_outputs)
        for r in self._exec_triton_request(triton_req):
            yield self._get_llm_response(r)

    @override
    def _generate_non_streaming(
            self,
            preproc: PreprocResponse,
            request: Request,
            draft_request: Optional[DraftRequest] = None,
            multimodal_enc_response: Optional[MultimodalEncResponse] = None
    ) -> GenerationResponse:
        input_tensors = self._get_llm_tensors(
            preproc,
            request,
            None,
            draft_request,
            multimodal_enc_response=multimodal_enc_response)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.llm_model_name,
            inputs=input_tensors,
            requested_output_names=self._llm_outputs)
        r = self._exec_triton_request_single(triton_req)
        return self._get_llm_response(r)

    def _get_llm_tensors(
            self,
            preproc: PreprocResponse,
            request: Request,
            num_output_tokens: Optional[int] = None,
            draft_request: Optional[DraftRequest] = None,
            is_draft_model_request: bool = False,
            multimodal_enc_response: MultimodalEncResponse = None):
        tensors = []
        tensors.extend(self._get_tensors_from_preproc(preproc))
        if multimodal_enc_response is not None:
            tensors.extend(
                self._get_tensors_from_multimodal_enc(multimodal_enc_response))
        tensors.extend(
            self._get_llm_tensors_from_request(request, num_output_tokens,
                                               draft_request,
                                               is_draft_model_request))
        return tensors

    def _get_tensors_from_preproc(self, preproc: PreprocResponse):
        name_map = {
            "input_ids": "input_ids",
            "decoder_input_ids": "decoder_input_ids",
            "input_lengths": "input_lengths",
            "bad_words_list": "bad_words_list",
            "stop_words_list": "stop_words_list",
            "embedding_bias": "embedding_bias",
            "pad_id": "pad_id",
            "end_id": "end_id",
            "prompt_table_extra_ids": "prompt_table_extra_ids",
        }
        return self.create_triton_tensors(preproc, name_map)

    def _get_tensors_from_multimodal_enc(
            self, multimodal_enc_response: MultimodalEncResponse):
        name_map = {
            "prompt_embedding_table": "prompt_embedding_table",
            "prompt_vocab_size": "prompt_vocab_size",
        }
        return self.create_triton_tensors(multimodal_enc_response, name_map)

    def _get_llm_tensors_from_request(
            self,
            request: Request,
            num_output_tokens: Optional[int] = None,
            draft_request: Optional[DraftRequest] = None,
            is_draft_model_request: bool = False):
        name_map: Dict[str, Optional[str]] = {
            "beam_width": "beam_width",
            "top_k": "runtime_top_k",
            "top_p": "runtime_top_p",
            "temperature": "temperature",
            "length_penalty": "len_penalty",
            "repetition_penalty": "repetition_penalty",
            "min_length": "min_length",
            "presence_penalty": "presence_penalty",
            "frequency_penalty": "frequency_penalty",
            "random_seed": "random_seed",
            "return_log_probs": "return_log_probs",
            "stream": "streaming",
            "prompt_embedding_table": "prompt_embedding_table",
            "prompt_vocab_size": "prompt_vocab_size",
            "lora_task_id": "lora_task_id",
            "lora_weights": "lora_weights",
            "lora_config": "lora_config",
            "exclude_input_in_output": "exclude_input_in_output",
            "return_kv_cache_reuse_stats": "return_kv_cache_reuse_stats",
            "guided_decoding_guide_type": "guided_decoding_guide_type",
            "guided_decoding_guide": "guided_decoding_guide"
        }
        batch_size = request.text_input.shape[0]
        tensors = self.create_triton_tensors(request, name_map)
        out_len_tensor = None
        if request.max_tokens is not None:
            out_len_tensor = request.max_tokens

        out_len = None
        if num_output_tokens is not None:
            out_len = num_output_tokens
        elif draft_request:
            out_len = len(
                draft_request.draft_input_ids[0]
            ) + 1 if draft_request.draft_input_ids is not None else 1

        if out_len is not None:
            out_len_tensor = [[out_len]] * batch_size

        if out_len_tensor is None:
            raise Exception("Could not determine request_output_len")
        else:
            tensors.append(
                pb_utils.Tensor("request_output_len",
                                np.array(out_len_tensor, dtype=np.int32)))

        if draft_request:
            if draft_request.draft_input_ids is not None:
                tensors.append(
                    pb_utils.Tensor("draft_input_ids",
                                    draft_request.draft_input_ids))
            if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[
                    0]:
                tensors.append(
                    pb_utils.Tensor("draft_logits",
                                    draft_request.draft_logits))

        return_context_logits_data = [False]
        return_generation_logits_data = [False]
        if draft_request is None:
            if is_draft_model_request:
                return_generation_logits_data = request.use_draft_logits if request.use_draft_logits is not None else [
                    False
                ]
            else:
                return_context_logits_data = request.return_context_logits if request.return_context_logits is not None else [
                    False
                ]
                return_generation_logits_data = request.return_generation_logits if request.return_generation_logits is not None else [
                    False
                ]
        return_context_logits = np.array([return_context_logits_data] *
                                         batch_size,
                                         dtype=bool)
        return_generation_logits = np.array([return_generation_logits_data] *
                                            batch_size,
                                            dtype=bool)

        assert len(return_context_logits.shape) == 2
        assert len(return_generation_logits.shape) == 2

        tensors.append(
            pb_utils.Tensor("return_context_logits", return_context_logits))
        tensors.append(
            pb_utils.Tensor("return_generation_logits",
                            return_generation_logits))
        return tensors

    def _get_llm_response(self, triton_output):
        name_map = {
            "output_ids": "output_ids",
            "sequence_length": "sequence_length",
            "cum_log_probs": "cum_log_probs",
            "output_log_probs": "output_log_probs",
            "context_logits": "context_logits",
            "generation_logits": "generation_logits",
            "batch_index": "batch_index",
            "sequence_index": "sequence_index",
            "kv_cache_alloc_new_blocks": "kv_cache_alloc_new_blocks",
            "kv_cache_reused_blocks": "kv_cache_reused_blocks",
            "kv_cache_alloc_total_blocks": "kv_cache_alloc_total_blocks"
        }
        return self.convert_triton_response(triton_output, GenerationResponse,
                                            name_map)

    def _postprocess(self, tokens: np.ndarray,
                     sequence_lengths: Optional[np.ndarray],
                     gen_response: GenerationResponse) -> Response:
        input_tensors = self._get_postproc_tensors(tokens, sequence_lengths,
                                                   gen_response)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.postproc_model_name,
            inputs=input_tensors,
            requested_output_names=self._postproc_outputs)
        r = self._exec_triton_request_single(triton_req)
        response = self._get_response(r, gen_response)
        return response

    def _get_postproc_tensors(self, tokens: np.ndarray,
                              sequence_lengths: Optional[np.ndarray],
                              gen_response: GenerationResponse):
        tensors = [
            pb_utils.Tensor("TOKENS_BATCH", tokens),
            pb_utils.Tensor(
                "SEQUENCE_LENGTH", sequence_lengths
                if sequence_lengths else gen_response.sequence_length)
        ]
        return tensors

    def _get_response(self, triton_output, gen_res: GenerationResponse):
        tensors = triton_output.output_tensors()
        t_map = {}
        for named_t in tensors:
            name = named_t.name()
            t = named_t.as_numpy()
            t_map[name] = t
        response = Response(
            text_output=t_map["OUTPUT"],
            cum_log_probs=gen_res.cum_log_probs,
            output_log_probs=gen_res.output_log_probs,
            context_logits=gen_res.context_logits,
            generation_logits=gen_res.generation_logits,
            batch_index=gen_res.batch_index,
            sequence_index=gen_res.sequence_index,
            kv_cache_alloc_new_blocks=gen_res.kv_cache_alloc_new_blocks,
            kv_cache_reused_blocks=gen_res.kv_cache_reused_blocks,
            kv_cache_alloc_total_blocks=gen_res.kv_cache_alloc_total_blocks)
        return response
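
The tensor builder above encodes a simple rule for the `request_output_len` tensor: a regular request uses the client-supplied `max_tokens`, an explicit `num_output_tokens` (used when calling the draft model) overrides it, and a target-model call that carries draft tokens asks for `len(draft_input_ids[0]) + 1` tokens. The sketch below restates that rule outside of Triton; it is an illustration only, `pick_request_output_len` is a hypothetical helper, and plain Python lists stand in for the numpy tensors the real code receives.

    # Minimal sketch (not part of the uploaded files): how request_output_len
    # is chosen in _get_llm_tensors_from_request, with plain Python values.
    from typing import List, Optional

    def pick_request_output_len(batch_size: int,
                                max_tokens: Optional[List[List[int]]] = None,
                                num_output_tokens: Optional[int] = None,
                                draft_input_ids: Optional[List[List[int]]] = None):
        # Start from the client-supplied max_tokens tensor, if any.
        out_len_tensor = max_tokens
        # An explicit num_output_tokens (draft-model call) takes precedence;
        # otherwise a draft-carrying request asks for len(draft tokens) + 1.
        out_len = None
        if num_output_tokens is not None:
            out_len = num_output_tokens
        elif draft_input_ids is not None:
            out_len = len(draft_input_ids[0]) + 1
        if out_len is not None:
            out_len_tensor = [[out_len]] * batch_size
        if out_len_tensor is None:
            raise ValueError("Could not determine request_output_len")
        return out_len_tensor

    # Example: a request carrying 4 draft tokens with batch_size=2 yields [[5], [5]].
    print(pick_request_output_len(2, draft_input_ids=[[11, 12, 13, 14]]))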
tensorrt_llm_bls/1/model.py
ADDED
@@ -0,0 +1,146 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import traceback

import triton_python_backend_utils as pb_utils
from lib.triton_decoder import TritonDecoder


def get_valid_param_value(param, default_value=''):
    value = param.get('string_value', '')
    return default_value if value.startswith('${') or value == '' else value


class TritonPythonModel:

    def initialize(self, args):

        # Parse model configs
        model_config = json.loads(args['model_config'])

        params = model_config['parameters']

        accumulate_tokens_str = get_valid_param_value(
            params.get('accumulate_tokens', {}))
        self.accumulate_tokens = accumulate_tokens_str.lower() in [
            'true', 'yes', '1', 't'
        ]

        self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
            model_config)

        self.logger = pb_utils.Logger

        default_tensorrt_llm_model_name = 'tensorrt_llm'
        self.llm_model_name = get_valid_param_value(
            params.get('tensorrt_llm_model_name', {}),
            default_tensorrt_llm_model_name)

        self.draft_llm_model_name = get_valid_param_value(
            params.get('tensorrt_llm_draft_model_name', {}), None)

        self.multimodal_encoders_name = get_valid_param_value(
            params.get('multimodal_encoders_name', {}), None)

        self.decoder = TritonDecoder(
            streaming=self.decoupled,
            accumulate=self.accumulate_tokens,
            preproc_model_name="preprocessing",
            postproc_model_name="postprocessing",
            llm_model_name=self.llm_model_name,
            draft_llm_model_name=self.draft_llm_model_name,
            multimodal_encoders_name=self.multimodal_encoders_name)

    def execute(self, requests):

        responses = []

        for request in requests:
            if self.decoupled:
                response_sender = request.get_response_sender()
            try:

                req = self.decoder.convert_triton_request(request)
                req.validate()
                speculative_decode = (req.num_draft_tokens is not None
                                      and req.num_draft_tokens[0][0] > 0)
                if speculative_decode and (self.draft_llm_model_name is None
                                           or self.draft_llm_model_name == ""):
                    raise Exception(
                        "cannot perform speculative decoding without draft model"
                    )
                is_multimodal = req.image_input is not None or req.image_bytes_input is not None or req.image_url_input is not None or req.video_bytes_input is not None

                if speculative_decode and is_multimodal:
                    raise Exception(
                        "Multimodal and speculative decoding is not currently supported"
                    )
                res_gen = self.decoder.decode(
                    req,
                    speculative_decoding=speculative_decode,
                    is_multimodal=is_multimodal)

                for res in res_gen:
                    triton_response = self.decoder.create_triton_response(res)
                    if self.decoupled:
                        response_sender.send(triton_response)
                    else:
                        responses.append(triton_response)

                if self.decoupled:
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

            except Exception:
                self.logger.log_error(traceback.format_exc())
                # If encountering an error, send a response with err msg
                error_response = pb_utils.InferenceResponse(
                    output_tensors=[],
                    error=pb_utils.TritonError(traceback.format_exc()))

                if self.decoupled:
                    response_sender.send(error_response)
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
                else:
                    responses.append(error_response)

            self.decoder.reset_decoder()

        if self.decoupled:
            return None
        else:
            assert len(responses) == len(requests)
            return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
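
A note on the parameter handling above: `get_valid_param_value` treats an unfilled `${...}` template placeholder or an empty string as "not set" and falls back to the supplied default. With the config.pbtxt added below, `tensorrt_llm_model_name` resolves to "tensorrt_llm", while the draft-model and multimodal-encoder names stay unset, so `execute` rejects any request that asks for speculative decoding. A small illustration, reusing the function defined in this file with hypothetical parameter values:

    # Illustration only (not part of the uploaded files).
    def get_valid_param_value(param, default_value=''):
        value = param.get('string_value', '')
        return default_value if value.startswith('${') or value == '' else value

    params = {
        'tensorrt_llm_model_name': {'string_value': 'tensorrt_llm'},
        'tensorrt_llm_draft_model_name': {'string_value': '${tensorrt_llm_draft_model_name}'},
    }
    # Filled value passes through unchanged.
    print(get_valid_param_value(params.get('tensorrt_llm_model_name', {}), 'tensorrt_llm'))  # tensorrt_llm
    # Unfilled "${...}" placeholder falls back to the default (None here).
    print(get_valid_param_value(params.get('tensorrt_llm_draft_model_name', {}), None))      # None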
tensorrt_llm_bls/config.pbtxt
ADDED
@@ -0,0 +1,388 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "tensorrt_llm_bls"
backend: "python"
max_batch_size: 32

model_transaction_policy {
  decoupled: True
}

input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
  },
  {
    name: "decoder_text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
    optional: true
  },
  {
    name: "image_input"
    data_type: TYPE_FP16
    dims: [ -1, 3, -1, -1 ]
    optional: true
  },
  {
    name: "image_bytes_input"
    data_type: TYPE_UINT8
    dims: [ -1, -1, -1, -1 ]
    optional: true
  },
  {
    name: "image_url_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
    optional: true
  },
  {
    name: "video_bytes_input"
    data_type: TYPE_UINT8
    dims: [ -1, -1, -1, -1 ]
    optional: true
  },
  {
    name: "max_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "bad_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "stop_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "exclude_input_in_output"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "frequency_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_context_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_generation_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "num_return_sequences"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "prompt_embedding_table"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "prompt_vocab_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "prompt_table_extra_id"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "embedding_bias_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "embedding_bias_weights"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "num_draft_tokens",
    data_type: TYPE_INT32,
    dims: [ 1 ]
    optional: true
  },
  {
    name: "use_draft_logits",
    data_type: TYPE_BOOL,
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  # The unique task ID for the given LoRA.
  # To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights` and `lora_config` must all be given.
  # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
  # If the cache is full, the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
  {
    name: "lora_task_id"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  # Weights for a LoRA adapter, shape [ num_lora_modules_layers, D x Hi + Ho x D ]
  # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer.
  # Each of the in / out tensors is first flattened and then concatenated together in the format above.
  # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
  {
    name: "lora_weights"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
    allow_ragged_batch: true
  },
  # Module identifier (same size as the first dimension of lora_weights).
  # See LoraModule::ModuleType for the module id mapping:
  #
  #   "attn_qkv": 0     # combined qkv adapter
  #   "attn_q": 1       # q adapter
  #   "attn_k": 2       # k adapter
  #   "attn_v": 3       # v adapter
  #   "attn_dense": 4   # adapter for the dense layer in attention
  #   "mlp_h_to_4h": 5  # for llama2, adapter for the gated mlp layer after attention / RMSNorm: up projection
  #   "mlp_4h_to_h": 6  # for llama2, adapter for the gated mlp layer after attention / RMSNorm: down projection
  #   "mlp_gate": 7     # for llama2, adapter for the gated mlp layer after attention / RMSNorm: gate
  #
  # The last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
  {
    name: "lora_config"
    data_type: TYPE_INT32
    dims: [ -1, 3 ]
    optional: true
    allow_ragged_batch: true
  },
  {
    name: "return_kv_cache_reuse_stats"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "guided_decoding_guide_type"
    data_type: TYPE_STRING
    dims: [ 1 ]
    optional: true
  },
  {
    name: "guided_decoding_guide"
    data_type: TYPE_STRING
    dims: [ 1 ]
    optional: true
  }
]
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_logits"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
  },
  {
    name: "generation_logits"
    data_type: TYPE_FP16
    dims: [ -1, -1, -1 ]
  },
  {
    name: "batch_index"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "sequence_index"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "kv_cache_alloc_new_blocks"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "kv_cache_reused_blocks"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "kv_cache_alloc_total_blocks"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
]

parameters: {
  key: "accumulate_tokens"
  value: {
    string_value: "${accumulate_tokens}"
  }
}
parameters: {
  key: "tensorrt_llm_model_name"
  value: {
    string_value: "tensorrt_llm"
  }
}
parameters: {
  key: "tensorrt_llm_draft_model_name"
  value: {
    string_value: "${tensorrt_llm_draft_model_name}"
  }
}
parameters: {
  key: "multimodal_encoders_name"
  value: {
    string_value: "${multimodal_encoders_name}"
  }
}

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
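
The config above declares `text_input`, `max_tokens`, and the optional sampling controls as the BLS model's inputs, and `text_output` as its main output, with a decoupled transaction policy. A minimal client sketch, assuming a Triton server running this repository and exposing the standard HTTP generate extension on localhost:8000 (the endpoint path is part of Triton, not of this upload):

    # Minimal client sketch (not part of the uploaded files).
    import json

    import requests

    payload = {
        "text_input": "What is machine learning?",
        "max_tokens": 64,      # becomes request_output_len inside the BLS
        "stream": False,       # request a single consolidated response
        "temperature": 0.5,
    }
    resp = requests.post(
        "http://localhost:8000/v2/models/tensorrt_llm_bls/generate",
        data=json.dumps(payload))
    resp.raise_for_status()
    print(resp.json()["text_output"])

For token-by-token streaming, the same inputs can instead be sent with "stream": true over the gRPC streaming API, which is what the decoupled transaction policy is there to support.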