Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- ensemble/1/.tmp +0 -0
- ensemble/config.pbtxt +487 -0
- postprocessing/1/__pycache__/model.cpython-310.pyc +0 -0
- postprocessing/1/model.py +250 -0
- postprocessing/config.pbtxt +124 -0
- preprocessing/1/__pycache__/model.cpython-310.pyc +0 -0
- preprocessing/1/model.py +439 -0
- preprocessing/config.pbtxt +170 -0
- tensorrt_llm/1/.gitkeep +0 -0
- tensorrt_llm/1/config.json +170 -0
- tensorrt_llm/1/model.py +947 -0
- tensorrt_llm/1/rank0.engine +3 -0
- tensorrt_llm/1/rank1.engine +3 -0
- tensorrt_llm/config.pbtxt +556 -0
- tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/decode.py +386 -0
- tensorrt_llm_bls/1/lib/triton_decoder.py +523 -0
- tensorrt_llm_bls/1/model.py +145 -0
- tensorrt_llm_bls/config.pbtxt +270 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tensorrt_llm/1/rank0.engine filter=lfs diff=lfs merge=lfs -text
|
37 |
+
tensorrt_llm/1/rank1.engine filter=lfs diff=lfs merge=lfs -text
|
ensemble/1/.tmp
ADDED
File without changes
|
ensemble/config.pbtxt
ADDED
@@ -0,0 +1,487 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "ensemble"
|
28 |
+
platform: "ensemble"
|
29 |
+
max_batch_size: 32
|
30 |
+
input [
|
31 |
+
{
|
32 |
+
name: "text_input"
|
33 |
+
data_type: TYPE_STRING
|
34 |
+
dims: [ 1 ]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
name: "decoder_text_input"
|
38 |
+
data_type: TYPE_STRING
|
39 |
+
dims: [ 1 ]
|
40 |
+
optional: true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
name: "max_tokens"
|
44 |
+
data_type: TYPE_INT32
|
45 |
+
dims: [ 1 ]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
name: "bad_words"
|
49 |
+
data_type: TYPE_STRING
|
50 |
+
dims: [ -1 ]
|
51 |
+
optional: true
|
52 |
+
},
|
53 |
+
{
|
54 |
+
name: "stop_words"
|
55 |
+
data_type: TYPE_STRING
|
56 |
+
dims: [ -1 ]
|
57 |
+
optional: true
|
58 |
+
},
|
59 |
+
{
|
60 |
+
name: "end_id"
|
61 |
+
data_type: TYPE_INT32
|
62 |
+
dims: [ 1 ]
|
63 |
+
optional: true
|
64 |
+
},
|
65 |
+
{
|
66 |
+
name: "pad_id"
|
67 |
+
data_type: TYPE_INT32
|
68 |
+
dims: [ 1 ]
|
69 |
+
optional: true
|
70 |
+
},
|
71 |
+
{
|
72 |
+
name: "top_k"
|
73 |
+
data_type: TYPE_INT32
|
74 |
+
dims: [ 1 ]
|
75 |
+
optional: true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
name: "top_p"
|
79 |
+
data_type: TYPE_FP32
|
80 |
+
dims: [ 1 ]
|
81 |
+
optional: true
|
82 |
+
},
|
83 |
+
{
|
84 |
+
name: "temperature"
|
85 |
+
data_type: TYPE_FP32
|
86 |
+
dims: [ 1 ]
|
87 |
+
optional: true
|
88 |
+
},
|
89 |
+
{
|
90 |
+
name: "length_penalty"
|
91 |
+
data_type: TYPE_FP32
|
92 |
+
dims: [ 1 ]
|
93 |
+
optional: true
|
94 |
+
},
|
95 |
+
{
|
96 |
+
name: "repetition_penalty"
|
97 |
+
data_type: TYPE_FP32
|
98 |
+
dims: [ 1 ]
|
99 |
+
optional: true
|
100 |
+
},
|
101 |
+
{
|
102 |
+
name: "min_length"
|
103 |
+
data_type: TYPE_INT32
|
104 |
+
dims: [ 1 ]
|
105 |
+
optional: true
|
106 |
+
},
|
107 |
+
{
|
108 |
+
name: "presence_penalty"
|
109 |
+
data_type: TYPE_FP32
|
110 |
+
dims: [ 1 ]
|
111 |
+
optional: true
|
112 |
+
},
|
113 |
+
{
|
114 |
+
name: "frequency_penalty"
|
115 |
+
data_type: TYPE_FP32
|
116 |
+
dims: [ 1 ]
|
117 |
+
optional: true
|
118 |
+
},
|
119 |
+
{
|
120 |
+
name: "random_seed"
|
121 |
+
data_type: TYPE_UINT64
|
122 |
+
dims: [ 1 ]
|
123 |
+
optional: true
|
124 |
+
},
|
125 |
+
{
|
126 |
+
name: "return_log_probs"
|
127 |
+
data_type: TYPE_BOOL
|
128 |
+
dims: [ 1 ]
|
129 |
+
optional: true
|
130 |
+
},
|
131 |
+
{
|
132 |
+
name: "return_context_logits"
|
133 |
+
data_type: TYPE_BOOL
|
134 |
+
dims: [ 1 ]
|
135 |
+
optional: true
|
136 |
+
},
|
137 |
+
{
|
138 |
+
name: "return_generation_logits"
|
139 |
+
data_type: TYPE_BOOL
|
140 |
+
dims: [ 1 ]
|
141 |
+
optional: true
|
142 |
+
},
|
143 |
+
{
|
144 |
+
name: "beam_width"
|
145 |
+
data_type: TYPE_INT32
|
146 |
+
dims: [ 1 ]
|
147 |
+
optional: true
|
148 |
+
},
|
149 |
+
{
|
150 |
+
name: "stream"
|
151 |
+
data_type: TYPE_BOOL
|
152 |
+
dims: [ 1 ]
|
153 |
+
optional: true
|
154 |
+
},
|
155 |
+
{
|
156 |
+
name: "prompt_embedding_table"
|
157 |
+
data_type: TYPE_FP16
|
158 |
+
dims: [ -1, -1 ]
|
159 |
+
optional: true
|
160 |
+
},
|
161 |
+
{
|
162 |
+
name: "prompt_vocab_size"
|
163 |
+
data_type: TYPE_INT32
|
164 |
+
dims: [ 1 ]
|
165 |
+
optional: true
|
166 |
+
},
|
167 |
+
{
|
168 |
+
name: "embedding_bias_words"
|
169 |
+
data_type: TYPE_STRING
|
170 |
+
dims: [ -1 ]
|
171 |
+
optional: true
|
172 |
+
},
|
173 |
+
{
|
174 |
+
name: "embedding_bias_weights"
|
175 |
+
data_type: TYPE_FP32
|
176 |
+
dims: [ -1 ]
|
177 |
+
optional: true
|
178 |
+
}
|
179 |
+
]
|
180 |
+
output [
|
181 |
+
{
|
182 |
+
name: "text_output"
|
183 |
+
data_type: TYPE_STRING
|
184 |
+
dims: [ -1 ]
|
185 |
+
},
|
186 |
+
{
|
187 |
+
name: "cum_log_probs"
|
188 |
+
data_type: TYPE_FP32
|
189 |
+
dims: [ -1 ]
|
190 |
+
},
|
191 |
+
{
|
192 |
+
name: "output_log_probs"
|
193 |
+
data_type: TYPE_FP32
|
194 |
+
dims: [ -1, -1 ]
|
195 |
+
},
|
196 |
+
{
|
197 |
+
name: "context_logits"
|
198 |
+
data_type: TYPE_FP32
|
199 |
+
dims: [ -1, -1 ]
|
200 |
+
},
|
201 |
+
{
|
202 |
+
name: "generation_logits"
|
203 |
+
data_type: TYPE_FP32
|
204 |
+
dims: [ -1, -1, -1 ]
|
205 |
+
},
|
206 |
+
{
|
207 |
+
name: "batch_index"
|
208 |
+
data_type: TYPE_INT32
|
209 |
+
dims: [ 1 ]
|
210 |
+
}
|
211 |
+
]
|
212 |
+
ensemble_scheduling {
|
213 |
+
step [
|
214 |
+
{
|
215 |
+
model_name: "preprocessing"
|
216 |
+
model_version: -1
|
217 |
+
input_map {
|
218 |
+
key: "QUERY"
|
219 |
+
value: "text_input"
|
220 |
+
}
|
221 |
+
input_map {
|
222 |
+
key: "DECODER_QUERY"
|
223 |
+
value: "decoder_text_input"
|
224 |
+
}
|
225 |
+
input_map {
|
226 |
+
key: "REQUEST_OUTPUT_LEN"
|
227 |
+
value: "max_tokens"
|
228 |
+
}
|
229 |
+
input_map {
|
230 |
+
key: "BAD_WORDS_DICT"
|
231 |
+
value: "bad_words"
|
232 |
+
}
|
233 |
+
input_map {
|
234 |
+
key: "STOP_WORDS_DICT"
|
235 |
+
value: "stop_words"
|
236 |
+
}
|
237 |
+
input_map {
|
238 |
+
key: "EMBEDDING_BIAS_WORDS"
|
239 |
+
value: "embedding_bias_words"
|
240 |
+
}
|
241 |
+
input_map {
|
242 |
+
key: "EMBEDDING_BIAS_WEIGHTS"
|
243 |
+
value: "embedding_bias_weights"
|
244 |
+
}
|
245 |
+
input_map {
|
246 |
+
key: "END_ID"
|
247 |
+
value: "end_id"
|
248 |
+
}
|
249 |
+
input_map {
|
250 |
+
key: "PAD_ID"
|
251 |
+
value: "pad_id"
|
252 |
+
}
|
253 |
+
output_map {
|
254 |
+
key: "REQUEST_INPUT_LEN"
|
255 |
+
value: "_REQUEST_INPUT_LEN"
|
256 |
+
}
|
257 |
+
output_map {
|
258 |
+
key: "INPUT_ID"
|
259 |
+
value: "_INPUT_ID"
|
260 |
+
}
|
261 |
+
output_map {
|
262 |
+
key: "REQUEST_DECODER_INPUT_LEN"
|
263 |
+
value: "_REQUEST_DECODER_INPUT_LEN"
|
264 |
+
}
|
265 |
+
output_map {
|
266 |
+
key: "DECODER_INPUT_ID"
|
267 |
+
value: "_DECODER_INPUT_ID"
|
268 |
+
}
|
269 |
+
output_map {
|
270 |
+
key: "REQUEST_OUTPUT_LEN"
|
271 |
+
value: "_REQUEST_OUTPUT_LEN"
|
272 |
+
}
|
273 |
+
output_map {
|
274 |
+
key: "STOP_WORDS_IDS"
|
275 |
+
value: "_STOP_WORDS_IDS"
|
276 |
+
}
|
277 |
+
output_map {
|
278 |
+
key: "BAD_WORDS_IDS"
|
279 |
+
value: "_BAD_WORDS_IDS"
|
280 |
+
}
|
281 |
+
output_map {
|
282 |
+
key: "EMBEDDING_BIAS"
|
283 |
+
value: "_EMBEDDING_BIAS"
|
284 |
+
}
|
285 |
+
output_map {
|
286 |
+
key: "OUT_END_ID"
|
287 |
+
value: "_PREPROCESSOR_END_ID"
|
288 |
+
}
|
289 |
+
output_map {
|
290 |
+
key: "OUT_PAD_ID"
|
291 |
+
value: "_PREPROCESSOR_PAD_ID"
|
292 |
+
}
|
293 |
+
},
|
294 |
+
{
|
295 |
+
model_name: "tensorrt_llm"
|
296 |
+
model_version: -1
|
297 |
+
input_map {
|
298 |
+
key: "input_ids"
|
299 |
+
value: "_INPUT_ID"
|
300 |
+
}
|
301 |
+
input_map {
|
302 |
+
key: "decoder_input_ids"
|
303 |
+
value: "_DECODER_INPUT_ID"
|
304 |
+
}
|
305 |
+
input_map {
|
306 |
+
key: "input_lengths"
|
307 |
+
value: "_REQUEST_INPUT_LEN"
|
308 |
+
}
|
309 |
+
input_map {
|
310 |
+
key: "decoder_input_lengths"
|
311 |
+
value: "_REQUEST_DECODER_INPUT_LEN"
|
312 |
+
}
|
313 |
+
input_map {
|
314 |
+
key: "request_output_len"
|
315 |
+
value: "_REQUEST_OUTPUT_LEN"
|
316 |
+
}
|
317 |
+
input_map {
|
318 |
+
key: "end_id"
|
319 |
+
value: "_PREPROCESSOR_END_ID"
|
320 |
+
}
|
321 |
+
input_map {
|
322 |
+
key: "pad_id"
|
323 |
+
value: "_PREPROCESSOR_PAD_ID"
|
324 |
+
}
|
325 |
+
input_map {
|
326 |
+
key: "embedding_bias"
|
327 |
+
value: "_EMBEDDING_BIAS"
|
328 |
+
}
|
329 |
+
input_map {
|
330 |
+
key: "runtime_top_k"
|
331 |
+
value: "top_k"
|
332 |
+
}
|
333 |
+
input_map {
|
334 |
+
key: "runtime_top_p"
|
335 |
+
value: "top_p"
|
336 |
+
}
|
337 |
+
input_map {
|
338 |
+
key: "temperature"
|
339 |
+
value: "temperature"
|
340 |
+
}
|
341 |
+
input_map {
|
342 |
+
key: "len_penalty"
|
343 |
+
value: "length_penalty"
|
344 |
+
}
|
345 |
+
input_map {
|
346 |
+
key: "repetition_penalty"
|
347 |
+
value: "repetition_penalty"
|
348 |
+
}
|
349 |
+
input_map {
|
350 |
+
key: "min_length"
|
351 |
+
value: "min_length"
|
352 |
+
}
|
353 |
+
input_map {
|
354 |
+
key: "presence_penalty"
|
355 |
+
value: "presence_penalty"
|
356 |
+
}
|
357 |
+
input_map {
|
358 |
+
key: "frequency_penalty"
|
359 |
+
value: "frequency_penalty"
|
360 |
+
}
|
361 |
+
input_map {
|
362 |
+
key: "random_seed"
|
363 |
+
value: "random_seed"
|
364 |
+
}
|
365 |
+
input_map {
|
366 |
+
key: "return_log_probs"
|
367 |
+
value: "return_log_probs"
|
368 |
+
}
|
369 |
+
input_map {
|
370 |
+
key: "return_context_logits"
|
371 |
+
value: "return_context_logits"
|
372 |
+
}
|
373 |
+
input_map {
|
374 |
+
key: "return_generation_logits"
|
375 |
+
value: "return_generation_logits"
|
376 |
+
}
|
377 |
+
input_map {
|
378 |
+
key: "beam_width"
|
379 |
+
value: "beam_width"
|
380 |
+
}
|
381 |
+
input_map {
|
382 |
+
key: "streaming"
|
383 |
+
value: "stream"
|
384 |
+
}
|
385 |
+
input_map {
|
386 |
+
key: "prompt_embedding_table"
|
387 |
+
value: "prompt_embedding_table"
|
388 |
+
}
|
389 |
+
input_map {
|
390 |
+
key: "prompt_vocab_size"
|
391 |
+
value: "prompt_vocab_size"
|
392 |
+
}
|
393 |
+
input_map {
|
394 |
+
key: "stop_words_list"
|
395 |
+
value: "_STOP_WORDS_IDS"
|
396 |
+
}
|
397 |
+
input_map {
|
398 |
+
key: "bad_words_list"
|
399 |
+
value: "_BAD_WORDS_IDS"
|
400 |
+
}
|
401 |
+
output_map {
|
402 |
+
key: "output_ids"
|
403 |
+
value: "_TOKENS_BATCH"
|
404 |
+
}
|
405 |
+
output_map {
|
406 |
+
key: "sequence_length"
|
407 |
+
value: "_SEQUENCE_LENGTH"
|
408 |
+
},
|
409 |
+
output_map {
|
410 |
+
key: "cum_log_probs"
|
411 |
+
value: "_CUM_LOG_PROBS"
|
412 |
+
}
|
413 |
+
output_map {
|
414 |
+
key: "output_log_probs"
|
415 |
+
value: "_OUTPUT_LOG_PROBS"
|
416 |
+
},
|
417 |
+
output_map {
|
418 |
+
key: "context_logits"
|
419 |
+
value: "_CONTEXT_LOGITS"
|
420 |
+
},
|
421 |
+
output_map {
|
422 |
+
key: "generation_logits"
|
423 |
+
value: "_GENERATION_LOGITS"
|
424 |
+
},
|
425 |
+
output_map {
|
426 |
+
key: "batch_index"
|
427 |
+
value: "_BATCH_INDEX"
|
428 |
+
}
|
429 |
+
},
|
430 |
+
{
|
431 |
+
model_name: "postprocessing"
|
432 |
+
model_version: -1
|
433 |
+
input_map {
|
434 |
+
key: "TOKENS_BATCH"
|
435 |
+
value: "_TOKENS_BATCH"
|
436 |
+
}
|
437 |
+
input_map {
|
438 |
+
key: "CUM_LOG_PROBS"
|
439 |
+
value: "_CUM_LOG_PROBS"
|
440 |
+
}
|
441 |
+
input_map {
|
442 |
+
key: "OUTPUT_LOG_PROBS"
|
443 |
+
value: "_OUTPUT_LOG_PROBS"
|
444 |
+
}
|
445 |
+
input_map {
|
446 |
+
key: "CONTEXT_LOGITS"
|
447 |
+
value: "_CONTEXT_LOGITS"
|
448 |
+
}
|
449 |
+
input_map {
|
450 |
+
key: "GENERATION_LOGITS"
|
451 |
+
value: "_GENERATION_LOGITS"
|
452 |
+
}
|
453 |
+
input_map {
|
454 |
+
key: "SEQUENCE_LENGTH"
|
455 |
+
value: "_SEQUENCE_LENGTH"
|
456 |
+
}
|
457 |
+
input_map {
|
458 |
+
key: "BATCH_INDEX"
|
459 |
+
value: "_BATCH_INDEX"
|
460 |
+
}
|
461 |
+
output_map {
|
462 |
+
key: "OUTPUT"
|
463 |
+
value: "text_output"
|
464 |
+
}
|
465 |
+
output_map {
|
466 |
+
key: "OUT_OUTPUT_LOG_PROBS"
|
467 |
+
value: "output_log_probs"
|
468 |
+
}
|
469 |
+
output_map {
|
470 |
+
key: "OUT_CUM_LOG_PROBS"
|
471 |
+
value: "cum_log_probs"
|
472 |
+
}
|
473 |
+
output_map {
|
474 |
+
key: "OUT_CONTEXT_LOGITS"
|
475 |
+
value: "context_logits"
|
476 |
+
}
|
477 |
+
output_map {
|
478 |
+
key: "OUT_GENERATION_LOGITS"
|
479 |
+
value: "generation_logits"
|
480 |
+
}
|
481 |
+
output_map {
|
482 |
+
key: "OUT_BATCH_INDEX"
|
483 |
+
value: "batch_index"
|
484 |
+
}
|
485 |
+
}
|
486 |
+
]
|
487 |
+
}
|
postprocessing/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (5.61 kB). View file
|
|
postprocessing/1/model.py
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
import json
|
28 |
+
|
29 |
+
import numpy as np
|
30 |
+
import triton_python_backend_utils as pb_utils
|
31 |
+
from transformers import AutoTokenizer
|
32 |
+
|
33 |
+
|
34 |
+
class TritonPythonModel:
|
35 |
+
"""Your Python model must use the same class name. Every Python model
|
36 |
+
that is created must have "TritonPythonModel" as the class name.
|
37 |
+
"""
|
38 |
+
|
39 |
+
def initialize(self, args):
|
40 |
+
"""`initialize` is called only once when the model is being loaded.
|
41 |
+
Implementing `initialize` function is optional. This function allows
|
42 |
+
the model to initialize any state associated with this model.
|
43 |
+
Parameters
|
44 |
+
----------
|
45 |
+
args : dict
|
46 |
+
Both keys and values are strings. The dictionary keys and values are:
|
47 |
+
* model_config: A JSON string containing the model configuration
|
48 |
+
* model_instance_kind: A string containing model instance kind
|
49 |
+
* model_instance_device_id: A string containing model instance device ID
|
50 |
+
* model_repository: Model repository path
|
51 |
+
* model_version: Model version
|
52 |
+
* model_name: Model name
|
53 |
+
"""
|
54 |
+
# Parse model configs
|
55 |
+
model_config = json.loads(args['model_config'])
|
56 |
+
tokenizer_dir = model_config['parameters']['tokenizer_dir'][
|
57 |
+
'string_value']
|
58 |
+
|
59 |
+
skip_special_tokens = model_config['parameters'].get(
|
60 |
+
'skip_special_tokens')
|
61 |
+
if skip_special_tokens is not None:
|
62 |
+
skip_special_tokens_str = skip_special_tokens[
|
63 |
+
'string_value'].lower()
|
64 |
+
if skip_special_tokens_str in [
|
65 |
+
'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
|
66 |
+
]:
|
67 |
+
self.skip_special_tokens = skip_special_tokens_str in [
|
68 |
+
'true', '1', 't', 'y', 'yes'
|
69 |
+
]
|
70 |
+
else:
|
71 |
+
print(
|
72 |
+
f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default."
|
73 |
+
)
|
74 |
+
self.skip_special_tokens = True
|
75 |
+
else:
|
76 |
+
print(
|
77 |
+
f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
|
78 |
+
)
|
79 |
+
self.skip_special_tokens = True
|
80 |
+
|
81 |
+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
|
82 |
+
legacy=False,
|
83 |
+
padding_side='left',
|
84 |
+
trust_remote_code=True)
|
85 |
+
if not self.tokenizer.pad_token:
|
86 |
+
self.tokenizer.pad_token = self.tokenizer.eos_token
|
87 |
+
|
88 |
+
# Parse model output configs
|
89 |
+
output_config = pb_utils.get_output_config_by_name(
|
90 |
+
model_config, "OUTPUT")
|
91 |
+
|
92 |
+
# Convert Triton types to numpy types
|
93 |
+
self.output_dtype = pb_utils.triton_string_to_numpy(
|
94 |
+
output_config['data_type'])
|
95 |
+
|
96 |
+
def execute(self, requests):
|
97 |
+
"""`execute` must be implemented in every Python model. `execute`
|
98 |
+
function receives a list of pb_utils.InferenceRequest as the only
|
99 |
+
argument. This function is called when an inference is requested
|
100 |
+
for this model. Depending on the batching configuration (e.g. Dynamic
|
101 |
+
Batching) used, `requests` may contain multiple requests. Every
|
102 |
+
Python model, must create one pb_utils.InferenceResponse for every
|
103 |
+
pb_utils.InferenceRequest in `requests`. If there is an error, you can
|
104 |
+
set the error argument when creating a pb_utils.InferenceResponse.
|
105 |
+
Parameters
|
106 |
+
----------
|
107 |
+
requests : list
|
108 |
+
A list of pb_utils.InferenceRequest
|
109 |
+
Returns
|
110 |
+
-------
|
111 |
+
list
|
112 |
+
A list of pb_utils.InferenceResponse. The length of this list must
|
113 |
+
be the same as `requests`
|
114 |
+
"""
|
115 |
+
|
116 |
+
responses = []
|
117 |
+
|
118 |
+
# Every Python backend must iterate over everyone of the requests
|
119 |
+
# and create a pb_utils.InferenceResponse for each of them.
|
120 |
+
for idx, request in enumerate(requests):
|
121 |
+
# Get input tensors
|
122 |
+
tokens_batch = pb_utils.get_input_tensor_by_name(
|
123 |
+
request, 'TOKENS_BATCH').as_numpy()
|
124 |
+
|
125 |
+
# Get sequence length
|
126 |
+
sequence_lengths = pb_utils.get_input_tensor_by_name(
|
127 |
+
request, 'SEQUENCE_LENGTH').as_numpy()
|
128 |
+
|
129 |
+
# Get cum log probs
|
130 |
+
cum_log_probs = pb_utils.get_input_tensor_by_name(
|
131 |
+
request, 'CUM_LOG_PROBS')
|
132 |
+
|
133 |
+
# Get sequence length
|
134 |
+
output_log_probs = pb_utils.get_input_tensor_by_name(
|
135 |
+
request, 'OUTPUT_LOG_PROBS')
|
136 |
+
|
137 |
+
# Get context logits
|
138 |
+
context_logits = pb_utils.get_input_tensor_by_name(
|
139 |
+
request, 'CONTEXT_LOGITS')
|
140 |
+
|
141 |
+
# Get generation logits
|
142 |
+
generation_logits = pb_utils.get_input_tensor_by_name(
|
143 |
+
request, 'GENERATION_LOGITS')
|
144 |
+
|
145 |
+
# Get the batch index
|
146 |
+
batch_index = pb_utils.get_input_tensor_by_name(
|
147 |
+
request, 'BATCH_INDEX')
|
148 |
+
|
149 |
+
# Reshape Input
|
150 |
+
# tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
|
151 |
+
# tokens_batch = tokens_batch.T
|
152 |
+
|
153 |
+
# Postprocessing output data.
|
154 |
+
outputs = self._postprocessing(tokens_batch, sequence_lengths)
|
155 |
+
|
156 |
+
# Create output tensors. You need pb_utils.Tensor
|
157 |
+
# objects to create pb_utils.InferenceResponse.
|
158 |
+
output_tensor = pb_utils.Tensor(
|
159 |
+
'OUTPUT',
|
160 |
+
np.array(outputs).astype(self.output_dtype))
|
161 |
+
|
162 |
+
outputs = []
|
163 |
+
outputs.append(output_tensor)
|
164 |
+
|
165 |
+
if cum_log_probs:
|
166 |
+
out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
|
167 |
+
cum_log_probs.as_numpy())
|
168 |
+
outputs.append(out_cum_log_probs)
|
169 |
+
else:
|
170 |
+
out_cum_log_probs = pb_utils.Tensor(
|
171 |
+
'OUT_CUM_LOG_PROBS', np.array([[0.0]], dtype=np.float32))
|
172 |
+
outputs.append(out_cum_log_probs)
|
173 |
+
|
174 |
+
if output_log_probs:
|
175 |
+
out_output_log_probs = pb_utils.Tensor(
|
176 |
+
'OUT_OUTPUT_LOG_PROBS', output_log_probs.as_numpy())
|
177 |
+
outputs.append(out_output_log_probs)
|
178 |
+
else:
|
179 |
+
out_output_log_probs = pb_utils.Tensor(
|
180 |
+
'OUT_OUTPUT_LOG_PROBS',
|
181 |
+
np.array([[[0.0]]], dtype=np.float32))
|
182 |
+
outputs.append(out_output_log_probs)
|
183 |
+
|
184 |
+
if context_logits:
|
185 |
+
out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
|
186 |
+
context_logits.as_numpy())
|
187 |
+
outputs.append(out_context_logits)
|
188 |
+
else:
|
189 |
+
out_context_logits = pb_utils.Tensor(
|
190 |
+
'OUT_CONTEXT_LOGITS', np.array([[[0.0]]],
|
191 |
+
dtype=np.float32))
|
192 |
+
outputs.append(out_context_logits)
|
193 |
+
|
194 |
+
if generation_logits:
|
195 |
+
out_generation_logits = pb_utils.Tensor(
|
196 |
+
'OUT_GENERATION_LOGITS', generation_logits.as_numpy())
|
197 |
+
outputs.append(out_generation_logits)
|
198 |
+
else:
|
199 |
+
out_generation_logits = pb_utils.Tensor(
|
200 |
+
'OUT_GENERATION_LOGITS',
|
201 |
+
np.array([[[[0.0]]]], dtype=np.float32))
|
202 |
+
outputs.append(out_generation_logits)
|
203 |
+
|
204 |
+
if batch_index:
|
205 |
+
out_batch_index = pb_utils.Tensor('OUT_BATCH_INDEX',
|
206 |
+
batch_index.as_numpy())
|
207 |
+
outputs.append(out_batch_index)
|
208 |
+
else:
|
209 |
+
out_batch_index = pb_utils.Tensor(
|
210 |
+
'OUT_BATCH_INDEX', np.array([[0]], dtype=np.int32))
|
211 |
+
outputs.append(out_batch_index)
|
212 |
+
|
213 |
+
# Create InferenceResponse. You can set an error here in case
|
214 |
+
# there was a problem with handling this inference request.
|
215 |
+
# Below is an example of how you can set errors in inference
|
216 |
+
# response:
|
217 |
+
#
|
218 |
+
# pb_utils.InferenceResponse(
|
219 |
+
# output_tensors=..., TritonError("An error occurred"))
|
220 |
+
inference_response = pb_utils.InferenceResponse(
|
221 |
+
output_tensors=outputs)
|
222 |
+
responses.append(inference_response)
|
223 |
+
|
224 |
+
# You should return a list of pb_utils.InferenceResponse. Length
|
225 |
+
# of this list must match the length of `requests` list.
|
226 |
+
return responses
|
227 |
+
|
228 |
+
def finalize(self):
|
229 |
+
"""`finalize` is called only once when the model is being unloaded.
|
230 |
+
Implementing `finalize` function is optional. This function allows
|
231 |
+
the model to perform any necessary clean ups before exit.
|
232 |
+
"""
|
233 |
+
print('Cleaning up...')
|
234 |
+
|
235 |
+
def _postprocessing(self, tokens_batch, sequence_lengths):
|
236 |
+
outputs = []
|
237 |
+
for batch_idx, beam_tokens in enumerate(tokens_batch):
|
238 |
+
for beam_idx, tokens in enumerate(beam_tokens):
|
239 |
+
seq_len = sequence_lengths[batch_idx][beam_idx]
|
240 |
+
# Exclude fake ids in multimodal models
|
241 |
+
fake_id_len = 0
|
242 |
+
for i in range(seq_len):
|
243 |
+
if tokens[i] < self.tokenizer.vocab_size:
|
244 |
+
fake_id_len = i
|
245 |
+
break
|
246 |
+
output = self.tokenizer.decode(
|
247 |
+
tokens[fake_id_len:seq_len],
|
248 |
+
skip_special_tokens=self.skip_special_tokens)
|
249 |
+
outputs.append(output.encode('utf8'))
|
250 |
+
return outputs
|
postprocessing/config.pbtxt
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "postprocessing"
|
28 |
+
backend: "python"
|
29 |
+
max_batch_size: 32
|
30 |
+
input [
|
31 |
+
{
|
32 |
+
name: "TOKENS_BATCH"
|
33 |
+
data_type: TYPE_INT32
|
34 |
+
dims: [ -1, -1 ]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
name: "SEQUENCE_LENGTH"
|
38 |
+
data_type: TYPE_INT32
|
39 |
+
dims: [ -1 ]
|
40 |
+
},
|
41 |
+
{
|
42 |
+
name: "CUM_LOG_PROBS"
|
43 |
+
data_type: TYPE_FP32
|
44 |
+
dims: [ -1 ]
|
45 |
+
optional: true
|
46 |
+
},
|
47 |
+
{
|
48 |
+
name: "OUTPUT_LOG_PROBS"
|
49 |
+
data_type: TYPE_FP32
|
50 |
+
dims: [ -1, -1 ]
|
51 |
+
optional: true
|
52 |
+
},
|
53 |
+
{
|
54 |
+
name: "CONTEXT_LOGITS"
|
55 |
+
data_type: TYPE_FP32
|
56 |
+
dims: [ -1, -1 ]
|
57 |
+
optional: true
|
58 |
+
},
|
59 |
+
{
|
60 |
+
name: "GENERATION_LOGITS"
|
61 |
+
data_type: TYPE_FP32
|
62 |
+
dims: [ -1, -1, -1 ]
|
63 |
+
optional: true
|
64 |
+
},
|
65 |
+
{
|
66 |
+
name: "BATCH_INDEX"
|
67 |
+
data_type: TYPE_INT32
|
68 |
+
dims: [ 1 ]
|
69 |
+
optional: true
|
70 |
+
}
|
71 |
+
]
|
72 |
+
output [
|
73 |
+
{
|
74 |
+
name: "OUTPUT"
|
75 |
+
data_type: TYPE_STRING
|
76 |
+
dims: [ -1 ]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
name: "OUT_CUM_LOG_PROBS"
|
80 |
+
data_type: TYPE_FP32
|
81 |
+
dims: [ -1 ]
|
82 |
+
},
|
83 |
+
{
|
84 |
+
name: "OUT_OUTPUT_LOG_PROBS"
|
85 |
+
data_type: TYPE_FP32
|
86 |
+
dims: [ -1, -1 ]
|
87 |
+
},
|
88 |
+
{
|
89 |
+
name: "OUT_CONTEXT_LOGITS"
|
90 |
+
data_type: TYPE_FP32
|
91 |
+
dims: [ -1, -1 ]
|
92 |
+
},
|
93 |
+
{
|
94 |
+
name: "OUT_GENERATION_LOGITS"
|
95 |
+
data_type: TYPE_FP32
|
96 |
+
dims: [ -1, -1, -1 ]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
name: "OUT_BATCH_INDEX"
|
100 |
+
data_type: TYPE_INT32
|
101 |
+
dims: [ 1 ]
|
102 |
+
}
|
103 |
+
]
|
104 |
+
|
105 |
+
parameters {
|
106 |
+
key: "tokenizer_dir"
|
107 |
+
value: {
|
108 |
+
string_value: "mlabonne/Llama-3.1-70B-Instruct-lorablated"
|
109 |
+
}
|
110 |
+
}
|
111 |
+
|
112 |
+
parameters {
|
113 |
+
key: "skip_special_tokens"
|
114 |
+
value: {
|
115 |
+
string_value: "True"
|
116 |
+
}
|
117 |
+
}
|
118 |
+
|
119 |
+
instance_group [
|
120 |
+
{
|
121 |
+
count: 1
|
122 |
+
kind: KIND_CPU
|
123 |
+
}
|
124 |
+
]
|
preprocessing/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (11.1 kB). View file
|
|
preprocessing/1/model.py
ADDED
@@ -0,0 +1,439 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
import json
|
28 |
+
import os
|
29 |
+
from typing import List
|
30 |
+
|
31 |
+
import numpy as np
|
32 |
+
import triton_python_backend_utils as pb_utils
|
33 |
+
from transformers import AutoTokenizer, T5Tokenizer
|
34 |
+
|
35 |
+
|
36 |
+
class TritonPythonModel:
|
37 |
+
"""Your Python model must use the same class name. Every Python model
|
38 |
+
that is created must have "TritonPythonModel" as the class name.
|
39 |
+
"""
|
40 |
+
|
41 |
+
def initialize(self, args):
|
42 |
+
"""`initialize` is called only once when the model is being loaded.
|
43 |
+
Implementing `initialize` function is optional. This function allows
|
44 |
+
the model to initialize any state associated with this model.
|
45 |
+
Parameters
|
46 |
+
----------
|
47 |
+
args : dict
|
48 |
+
Both keys and values are strings. The dictionary keys and values are:
|
49 |
+
* model_config: A JSON string containing the model configuration
|
50 |
+
* model_instance_kind: A string containing model instance kind
|
51 |
+
* model_instance_device_id: A string containing model instance device ID
|
52 |
+
* model_repository: Model repository path
|
53 |
+
* model_version: Model version
|
54 |
+
* model_name: Model name
|
55 |
+
"""
|
56 |
+
# Parse model configs
|
57 |
+
model_config = json.loads(args['model_config'])
|
58 |
+
tokenizer_dir = model_config['parameters']['tokenizer_dir'][
|
59 |
+
'string_value']
|
60 |
+
|
61 |
+
add_special_tokens = model_config['parameters'].get(
|
62 |
+
'add_special_tokens')
|
63 |
+
visual_model_path = model_config['parameters']['visual_model_path'][
|
64 |
+
'string_value']
|
65 |
+
if visual_model_path == "${visual_model_path}" or visual_model_path == "":
|
66 |
+
visual_model_path = None
|
67 |
+
|
68 |
+
if add_special_tokens is not None:
|
69 |
+
add_special_tokens_str = add_special_tokens['string_value'].lower()
|
70 |
+
if add_special_tokens_str in [
|
71 |
+
'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
|
72 |
+
]:
|
73 |
+
self.add_special_tokens = add_special_tokens_str in [
|
74 |
+
'true', '1', 't', 'y', 'yes'
|
75 |
+
]
|
76 |
+
else:
|
77 |
+
print(
|
78 |
+
f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default."
|
79 |
+
)
|
80 |
+
self.add_special_tokens = True
|
81 |
+
else:
|
82 |
+
print(
|
83 |
+
f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default."
|
84 |
+
)
|
85 |
+
self.add_special_tokens = True
|
86 |
+
|
87 |
+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
|
88 |
+
legacy=False,
|
89 |
+
padding_side='left',
|
90 |
+
trust_remote_code=True)
|
91 |
+
if isinstance(self.tokenizer, T5Tokenizer):
|
92 |
+
self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
|
93 |
+
|
94 |
+
if not self.tokenizer.pad_token:
|
95 |
+
self.tokenizer.pad_token = self.tokenizer.eos_token
|
96 |
+
|
97 |
+
self.tokenizer_end_id = self.tokenizer.encode(
|
98 |
+
self.tokenizer.eos_token, add_special_tokens=False)[0]
|
99 |
+
self.tokenizer_pad_id = self.tokenizer.encode(
|
100 |
+
self.tokenizer.pad_token, add_special_tokens=False)[0]
|
101 |
+
|
102 |
+
self.is_multimodal = False
|
103 |
+
if visual_model_path is not None:
|
104 |
+
self.is_multimodal = True
|
105 |
+
visual_model_path = os.path.join(visual_model_path, 'config.json')
|
106 |
+
with open(visual_model_path, 'r') as f:
|
107 |
+
visual_model_config = json.load(f)
|
108 |
+
self.model_type = visual_model_config['builder_config'][
|
109 |
+
'model_type']
|
110 |
+
|
111 |
+
assert self.model_type in [
|
112 |
+
'llava', 'blip2-opt'
|
113 |
+
], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava and blip2-opt"
|
114 |
+
|
115 |
+
llm_model_path = model_config['parameters']['gpt_model_path'][
|
116 |
+
'string_value']
|
117 |
+
llm_model_path = os.path.join(llm_model_path, 'config.json')
|
118 |
+
with open(llm_model_path, 'r') as f:
|
119 |
+
llm_model_config = json.load(f)
|
120 |
+
self.vocab_size = int(
|
121 |
+
llm_model_config["pretrained_config"]["vocab_size"])
|
122 |
+
self._setup_ptable_shape(llm_model_config)
|
123 |
+
|
124 |
+
# Parse model output configs and convert Triton types to numpy types
|
125 |
+
output_names = [
|
126 |
+
"INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
|
127 |
+
"REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
|
128 |
+
"OUT_END_ID", "OUT_PAD_ID"
|
129 |
+
]
|
130 |
+
input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
|
131 |
+
for input_name in input_names:
|
132 |
+
setattr(
|
133 |
+
self,
|
134 |
+
input_name.lower() + "_dtype",
|
135 |
+
pb_utils.triton_string_to_numpy(
|
136 |
+
pb_utils.get_input_config_by_name(
|
137 |
+
model_config, input_name)['data_type']))
|
138 |
+
|
139 |
+
for output_name in output_names:
|
140 |
+
setattr(
|
141 |
+
self,
|
142 |
+
output_name.lower() + "_dtype",
|
143 |
+
pb_utils.triton_string_to_numpy(
|
144 |
+
pb_utils.get_output_config_by_name(
|
145 |
+
model_config, output_name)['data_type']))
|
146 |
+
|
147 |
+
def _setup_ptable_shape(self, llm_model_config):
|
148 |
+
max_prompt_embedding_table_size = llm_model_config['build_config'][
|
149 |
+
'max_prompt_embedding_table_size']
|
150 |
+
max_batch_size = llm_model_config['build_config']['max_batch_size']
|
151 |
+
|
152 |
+
num_visual_features = max_prompt_embedding_table_size // max_batch_size
|
153 |
+
hidden_size = llm_model_config['pretrained_config']['hidden_size']
|
154 |
+
|
155 |
+
self.ptable_shape = (-1, num_visual_features, hidden_size)
|
156 |
+
|
157 |
+
def execute(self, requests):
|
158 |
+
"""`execute` must be implemented in every Python model. `execute`
|
159 |
+
function receives a list of pb_utils.InferenceRequest as the only
|
160 |
+
argument. This function is called when an inference is requested
|
161 |
+
for this model. Depending on the batching configuration (e.g. Dynamic
|
162 |
+
Batching) used, `requests` may contain multiple requests. Every
|
163 |
+
Python model, must create one pb_utils.InferenceResponse for every
|
164 |
+
pb_utils.InferenceRequest in `requests`. If there is an error, you can
|
165 |
+
set the error argument when creating a pb_utils.InferenceResponse.
|
166 |
+
Parameters
|
167 |
+
----------
|
168 |
+
requests : list
|
169 |
+
A list of pb_utils.InferenceRequest
|
170 |
+
Returns
|
171 |
+
-------
|
172 |
+
list
|
173 |
+
A list of pb_utils.InferenceResponse. The length of this list must
|
174 |
+
be the same as `requests`
|
175 |
+
"""
|
176 |
+
|
177 |
+
responses = []
|
178 |
+
|
179 |
+
# Every Python backend must iterate over everyone of the requests
|
180 |
+
# and create a pb_utils.InferenceResponse for each of them.
|
181 |
+
for idx, request in enumerate(requests):
|
182 |
+
# Get input tensors
|
183 |
+
query = pb_utils.get_input_tensor_by_name(request,
|
184 |
+
'QUERY').as_numpy()
|
185 |
+
batch_size = query.shape[0]
|
186 |
+
|
187 |
+
decoder_query = pb_utils.get_input_tensor_by_name(
|
188 |
+
request, 'DECODER_QUERY')
|
189 |
+
if decoder_query is not None:
|
190 |
+
decoder_query = decoder_query.as_numpy()
|
191 |
+
|
192 |
+
request_output_len = pb_utils.get_input_tensor_by_name(
|
193 |
+
request, 'REQUEST_OUTPUT_LEN').as_numpy()
|
194 |
+
|
195 |
+
bad_words_dict = pb_utils.get_input_tensor_by_name(
|
196 |
+
request, 'BAD_WORDS_DICT')
|
197 |
+
if bad_words_dict is not None:
|
198 |
+
bad_words_dict = bad_words_dict.as_numpy()
|
199 |
+
|
200 |
+
stop_words_dict = pb_utils.get_input_tensor_by_name(
|
201 |
+
request, 'STOP_WORDS_DICT')
|
202 |
+
if stop_words_dict is not None:
|
203 |
+
stop_words_dict = stop_words_dict.as_numpy()
|
204 |
+
|
205 |
+
embedding_bias_words = pb_utils.get_input_tensor_by_name(
|
206 |
+
request, 'EMBEDDING_BIAS_WORDS')
|
207 |
+
if embedding_bias_words is not None:
|
208 |
+
embedding_bias_words = embedding_bias_words.as_numpy()
|
209 |
+
|
210 |
+
embedding_bias_weights = pb_utils.get_input_tensor_by_name(
|
211 |
+
request, 'EMBEDDING_BIAS_WEIGHTS')
|
212 |
+
if embedding_bias_weights is not None:
|
213 |
+
embedding_bias_weights = embedding_bias_weights.as_numpy()
|
214 |
+
|
215 |
+
# Take the end_id from the input tensors
|
216 |
+
# If not specified, use tokenizer to get end_id
|
217 |
+
end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID')
|
218 |
+
if end_id is not None:
|
219 |
+
end_id = end_id.as_numpy()
|
220 |
+
else:
|
221 |
+
end_id = [[self.tokenizer_end_id]] * batch_size
|
222 |
+
|
223 |
+
# Take the pad_id from the input tensors
|
224 |
+
# If not specified, use tokenizer to get pad_id
|
225 |
+
pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID')
|
226 |
+
if pad_id is not None:
|
227 |
+
pad_id = pad_id.as_numpy()
|
228 |
+
else:
|
229 |
+
pad_id = [[self.tokenizer_pad_id]] * batch_size
|
230 |
+
|
231 |
+
# Preprocessing input data.
|
232 |
+
input_id, request_input_len = self._create_request(query)
|
233 |
+
if decoder_query is not None:
|
234 |
+
decoder_input_id, request_decoder_input_len = self._create_request(
|
235 |
+
decoder_query)
|
236 |
+
else:
|
237 |
+
decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32)
|
238 |
+
request_decoder_input_len = 1 * np.ones(
|
239 |
+
(batch_size, 1), np.int32)
|
240 |
+
|
241 |
+
bad_words = self._to_word_list_format(bad_words_dict, batch_size)
|
242 |
+
stop_words = self._to_word_list_format(stop_words_dict, batch_size)
|
243 |
+
|
244 |
+
embedding_bias = self._get_embedding_bias(
|
245 |
+
embedding_bias_words, embedding_bias_weights,
|
246 |
+
self.embedding_bias_weights_dtype, batch_size)
|
247 |
+
|
248 |
+
# Create output tensors. You need pb_utils.Tensor
|
249 |
+
# objects to create pb_utils.InferenceResponse.
|
250 |
+
input_id_tensor = pb_utils.Tensor(
|
251 |
+
'INPUT_ID', input_id.astype(self.input_id_dtype))
|
252 |
+
request_input_len_tensor = pb_utils.Tensor(
|
253 |
+
'REQUEST_INPUT_LEN',
|
254 |
+
request_input_len.astype(self.request_input_len_dtype))
|
255 |
+
decoder_input_id_tensor = pb_utils.Tensor(
|
256 |
+
'DECODER_INPUT_ID',
|
257 |
+
decoder_input_id.astype(self.decoder_input_id_dtype))
|
258 |
+
request_decoder_input_len_tensor = pb_utils.Tensor(
|
259 |
+
'REQUEST_DECODER_INPUT_LEN',
|
260 |
+
request_decoder_input_len.astype(
|
261 |
+
self.request_decoder_input_len_dtype))
|
262 |
+
request_output_len_tensor = pb_utils.Tensor(
|
263 |
+
'REQUEST_OUTPUT_LEN', request_output_len)
|
264 |
+
bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
|
265 |
+
stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
|
266 |
+
stop_words)
|
267 |
+
embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
|
268 |
+
embedding_bias)
|
269 |
+
end_id_tensor = pb_utils.Tensor('OUT_END_ID',
|
270 |
+
np.array(end_id, dtype=np.int32))
|
271 |
+
pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
|
272 |
+
np.array(pad_id, dtype=np.int32))
|
273 |
+
|
274 |
+
inference_response = pb_utils.InferenceResponse(output_tensors=[
|
275 |
+
input_id_tensor, decoder_input_id_tensor, bad_words_ids_tensor,
|
276 |
+
stop_words_ids_tensor, request_input_len_tensor,
|
277 |
+
request_decoder_input_len_tensor, request_output_len_tensor,
|
278 |
+
embedding_bias_tensor, end_id_tensor, pad_id_tensor
|
279 |
+
])
|
280 |
+
responses.append(inference_response)
|
281 |
+
|
282 |
+
# You should return a list of pb_utils.InferenceResponse. Length
|
283 |
+
# of this list must match the length of `requests` list.
|
284 |
+
return responses
|
285 |
+
|
286 |
+
def finalize(self):
|
287 |
+
"""`finalize` is called only once when the model is being unloaded.
|
288 |
+
Implementing `finalize` function is optional. This function allows
|
289 |
+
the model to perform any necessary clean ups before exit.
|
290 |
+
"""
|
291 |
+
print('Cleaning up...')
|
292 |
+
|
293 |
+
def _create_request(self, query):
|
294 |
+
"""
|
295 |
+
query : batch string (2D numpy array)
|
296 |
+
"""
|
297 |
+
if isinstance(self.tokenizer, T5Tokenizer):
|
298 |
+
start_ids = [
|
299 |
+
np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
|
300 |
+
s[0].decode(), add_special_tokens=self.add_special_tokens)
|
301 |
+
).astype(int) for s in query
|
302 |
+
]
|
303 |
+
else:
|
304 |
+
start_ids = [
|
305 |
+
np.array(
|
306 |
+
self.tokenizer.encode(
|
307 |
+
s[0].decode(),
|
308 |
+
add_special_tokens=self.add_special_tokens)).astype(
|
309 |
+
int) for s in query
|
310 |
+
]
|
311 |
+
|
312 |
+
if self.is_multimodal:
|
313 |
+
if 'blip2' in self.model_type:
|
314 |
+
pre_prompt = None
|
315 |
+
post_prompt = None
|
316 |
+
elif 'llava' == self.model_type:
|
317 |
+
pre_prompt = "USER:\n"
|
318 |
+
post_prompt = " ASSISTANT:"
|
319 |
+
|
320 |
+
fake_prompt_id = np.arange(self.vocab_size,
|
321 |
+
self.vocab_size + self.ptable_shape[1])
|
322 |
+
|
323 |
+
if pre_prompt is not None:
|
324 |
+
pre_prompt_id = np.array(
|
325 |
+
self.tokenizer.encode(
|
326 |
+
pre_prompt,
|
327 |
+
add_special_tokens=self.add_special_tokens,
|
328 |
+
padding=True))
|
329 |
+
|
330 |
+
if post_prompt is not None:
|
331 |
+
post_prompt_id = np.array(
|
332 |
+
self.tokenizer.encode(
|
333 |
+
post_prompt,
|
334 |
+
add_special_tokens=self.add_special_tokens,
|
335 |
+
padding=True))
|
336 |
+
|
337 |
+
if post_prompt is None:
|
338 |
+
start_ids = [
|
339 |
+
np.concatenate((fake_prompt_id, ids), axis=0)
|
340 |
+
for ids in start_ids
|
341 |
+
]
|
342 |
+
else:
|
343 |
+
start_ids = [
|
344 |
+
np.concatenate(
|
345 |
+
(pre_prompt_id, fake_prompt_id, ids, post_prompt_id),
|
346 |
+
axis=0) for ids in start_ids
|
347 |
+
]
|
348 |
+
start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
|
349 |
+
|
350 |
+
max_len = 0
|
351 |
+
for seq in start_ids:
|
352 |
+
max_len = max(max_len, seq.shape[0])
|
353 |
+
start_ids = np.stack([
|
354 |
+
np.pad(seq, (0, max_len - seq.shape[0]),
|
355 |
+
'constant',
|
356 |
+
constant_values=(0, self.tokenizer_pad_id))
|
357 |
+
for seq in start_ids
|
358 |
+
])
|
359 |
+
|
360 |
+
return start_ids, start_lengths
|
361 |
+
|
362 |
+
def _to_word_list_format(self, word_lists: List[List[str | bytes]],
|
363 |
+
batch_size):
|
364 |
+
'''
|
365 |
+
word_lists format:
|
366 |
+
len(word_lists) == batch_size
|
367 |
+
word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
|
368 |
+
'''
|
369 |
+
assert self.tokenizer != None, "need to set tokenizer"
|
370 |
+
|
371 |
+
if word_lists is None:
|
372 |
+
# Return an empty array of shape (1,2,0)
|
373 |
+
return np.empty([batch_size, 2, 0], dtype="int32")
|
374 |
+
|
375 |
+
flat_ids = []
|
376 |
+
offsets = []
|
377 |
+
for word_list in word_lists:
|
378 |
+
item_flat_ids = []
|
379 |
+
item_offsets = []
|
380 |
+
|
381 |
+
for word in word_list:
|
382 |
+
if isinstance(word, bytes):
|
383 |
+
word = word.decode()
|
384 |
+
|
385 |
+
ids = self.tokenizer.encode(word, add_special_tokens=False)
|
386 |
+
if len(ids) == 0:
|
387 |
+
continue
|
388 |
+
|
389 |
+
item_flat_ids += ids
|
390 |
+
item_offsets.append(len(ids))
|
391 |
+
|
392 |
+
flat_ids.append(np.array(item_flat_ids))
|
393 |
+
offsets.append(np.cumsum(np.array(item_offsets)))
|
394 |
+
|
395 |
+
pad_to = max(1, max(len(ids) for ids in flat_ids))
|
396 |
+
|
397 |
+
for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
|
398 |
+
flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
|
399 |
+
constant_values=0)
|
400 |
+
offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
|
401 |
+
constant_values=-1)
|
402 |
+
|
403 |
+
return np.array([flat_ids, offsets], dtype="int32").transpose(
|
404 |
+
(1, 0, 2))
|
405 |
+
|
406 |
+
def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
|
407 |
+
bias_dtype, batch_size):
|
408 |
+
|
409 |
+
assert self.tokenizer != None, "need to set tokenizer"
|
410 |
+
|
411 |
+
if embedding_bias_words is None or embedding_bias_weights is None:
|
412 |
+
return np.empty([batch_size, 0],
|
413 |
+
dtype=self.embedding_bias_weights_dtype)
|
414 |
+
|
415 |
+
batch_embedding_bias = []
|
416 |
+
for words, weights in zip(embedding_bias_words,
|
417 |
+
embedding_bias_weights):
|
418 |
+
|
419 |
+
vocab_size = self.tokenizer.vocab_size
|
420 |
+
embedding_bias = [0.] * vocab_size
|
421 |
+
|
422 |
+
assert len(words) == len(
|
423 |
+
weights
|
424 |
+
), "Embedding bias words must have same dimension as embedding bias weights"
|
425 |
+
|
426 |
+
for word, weight in zip(words, weights):
|
427 |
+
if isinstance(word, bytes):
|
428 |
+
word = word.decode()
|
429 |
+
ids = self.tokenizer.encode(word)
|
430 |
+
|
431 |
+
if len(ids) == 0:
|
432 |
+
continue
|
433 |
+
|
434 |
+
for id in ids:
|
435 |
+
embedding_bias[id] += weight
|
436 |
+
|
437 |
+
batch_embedding_bias.append(np.array(embedding_bias))
|
438 |
+
|
439 |
+
return np.array(batch_embedding_bias, dtype=bias_dtype)
|
preprocessing/config.pbtxt
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "preprocessing"
backend: "python"
max_batch_size: 32
input [
    {
        name: "QUERY"
        data_type: TYPE_STRING
        dims: [ 1 ]
    },
    {
        name: "DECODER_QUERY"
        data_type: TYPE_STRING
        dims: [ 1 ]
        optional: true
    },
    {
        name: "REQUEST_OUTPUT_LEN"
        data_type: TYPE_INT32
        dims: [ 1 ]
    },
    {
        name: "BAD_WORDS_DICT"
        data_type: TYPE_STRING
        dims: [ -1 ]
        optional: true
    },
    {
        name: "STOP_WORDS_DICT"
        data_type: TYPE_STRING
        dims: [ -1 ]
        optional: true
    },
    {
        name: "EMBEDDING_BIAS_WORDS"
        data_type: TYPE_STRING
        dims: [ -1 ]
        optional: true
    },
    {
        name: "EMBEDDING_BIAS_WEIGHTS"
        data_type: TYPE_FP32
        dims: [ -1 ]
        optional: true
    },
    {
        name: "END_ID"
        data_type: TYPE_INT32
        dims: [ 1 ]
        optional: true
    },
    {
        name: "PAD_ID"
        data_type: TYPE_INT32
        dims: [ 1 ]
        optional: true
    }
]
output [
    {
        name: "INPUT_ID"
        data_type: TYPE_INT32
        dims: [ -1 ]
    },
    {
        name: "REQUEST_INPUT_LEN"
        data_type: TYPE_INT32
        dims: [ 1 ]
    },
    {
        name: "DECODER_INPUT_ID"
        data_type: TYPE_INT32
        dims: [ -1 ]
    },
    {
        name: "REQUEST_DECODER_INPUT_LEN"
        data_type: TYPE_INT32
        dims: [ 1 ]
    },
    {
        name: "BAD_WORDS_IDS"
        data_type: TYPE_INT32
        dims: [ 2, -1 ]
    },
    {
        name: "STOP_WORDS_IDS"
        data_type: TYPE_INT32
        dims: [ 2, -1 ]
    },
    {
        name: "EMBEDDING_BIAS"
        data_type: TYPE_FP32
        dims: [ -1 ]
    },
    {
        name: "REQUEST_OUTPUT_LEN"
        data_type: TYPE_INT32
        dims: [ -1 ]
    },
    {
        name: "OUT_END_ID"
        data_type: TYPE_INT32
        dims: [ 1 ]
    },
    {
        name: "OUT_PAD_ID"
        data_type: TYPE_INT32
        dims: [ 1 ]
    }
]

parameters {
  key: "tokenizer_dir"
  value: {
    string_value: "mlabonne/Llama-3.1-70B-Instruct-lorablated"
  }
}

parameters {
  key: "add_special_tokens"
  value: {
    string_value: "False"
  }
}

parameters {
  key: "visual_model_path"
  value: {
    string_value: "${visual_model_path}"
  }
}

parameters: {
  key: "gpt_model_path"
  value: {
    string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
  }
}

instance_group [
    {
        count: 1
        kind: KIND_CPU
    }
]
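For reference, a minimal client call against the "preprocessing" model defined above (a sketch only; it assumes a running Triton server on localhost:8000 and the tritonclient package, and sends just the two required inputs):

# Send QUERY and REQUEST_OUTPUT_LEN to the preprocessing model and read back
# the tokenized prompt. Endpoint and prompt text are illustrative.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")

query = np.array([["What is the capital of France?"]], dtype=object)
out_len = np.array([[64]], dtype=np.int32)

inputs = [
    httpclient.InferInput("QUERY", query.shape, "BYTES"),
    httpclient.InferInput("REQUEST_OUTPUT_LEN", out_len.shape, "INT32"),
]
inputs[0].set_data_from_numpy(query)
inputs[1].set_data_from_numpy(out_len)

result = client.infer("preprocessing", inputs)
print(result.as_numpy("INPUT_ID"))           # token ids for the prompt
print(result.as_numpy("REQUEST_INPUT_LEN"))  # number of prompt tokens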
tensorrt_llm/1/.gitkeep
ADDED
File without changes
|
tensorrt_llm/1/config.json
ADDED
@@ -0,0 +1,170 @@
{
    "version": "0.13.0.dev2024082000",
    "pretrained_config": {
        "mlp_bias": false,
        "attn_bias": false,
        "rotary_base": 500000.0,
        "rotary_scaling": {
            "factor": 32.0,
            "high_freq_factor": 4.0,
            "low_freq_factor": 1.0,
            "original_max_position_embeddings": 8192,
            "rope_type": "llama3"
        },
        "residual_mlp": false,
        "disable_weight_only_quant_plugin": false,
        "moe": {
            "num_experts": 0,
            "top_k": 0,
            "normalization_mode": 1,
            "tp_mode": 0
        },
        "remove_duplicated_kv_heads": false,
        "architecture": "LlamaForCausalLM",
        "dtype": "float16",
        "vocab_size": 128256,
        "hidden_size": 2048,
        "num_hidden_layers": 16,
        "num_attention_heads": 32,
        "hidden_act": "silu",
        "logits_dtype": "float16",
        "norm_epsilon": 1e-05,
        "position_embedding_type": "rope_gpt_neox",
        "max_position_embeddings": 131072,
        "num_key_value_heads": 8,
        "intermediate_size": 8192,
        "mapping": {
            "world_size": 2,
            "gpus_per_node": 8,
            "cp_size": 1,
            "tp_size": 2,
            "pp_size": 1,
            "moe_tp_size": 2,
            "moe_ep_size": 1
        },
        "quantization": {
            "quant_algo": "FP8",
            "kv_cache_quant_algo": "FP8",
            "group_size": 128,
            "smoothquant_val": 0.5,
            "clamp_val": null,
            "has_zero_point": false,
            "pre_quant_scale": false,
            "exclude_modules": null
        },
        "use_parallel_embedding": true,
        "embedding_sharding_dim": 0,
        "share_embedding_table": false,
        "head_size": 64,
        "qk_layernorm": false,
        "producer": {
            "name": "modelopt",
            "version": "0.15.1"
        },
        "bias": false,
        "rotary_pct": 1.0,
        "rank": 0,
        "decoder": "llama",
        "rmsnorm": true,
        "lm_head_bias": false
    },
    "build_config": {
        "max_input_len": 124000,
        "max_seq_len": 4194304,
        "opt_batch_size": null,
        "max_batch_size": 32,
        "max_beam_width": 1,
        "max_num_tokens": 128000,
        "opt_num_tokens": null,
        "max_prompt_embedding_table_size": 0,
        "kv_cache_type": "PAGED",
        "gather_context_logits": false,
        "gather_generation_logits": false,
        "strongly_typed": true,
        "builder_opt": null,
        "force_num_profiles": null,
        "profiling_verbosity": "layer_names_only",
        "enable_debug_output": false,
        "max_draft_len": 0,
        "speculative_decoding_mode": 1,
        "use_refit": false,
        "input_timing_cache": null,
        "output_timing_cache": "model.cache",
        "lora_config": {
            "lora_dir": [],
            "lora_ckpt_source": "hf",
            "max_lora_rank": 64,
            "lora_target_modules": [],
            "trtllm_modules_to_hf_modules": {}
        },
        "auto_parallel_config": {
            "world_size": 1,
            "gpus_per_node": 8,
            "cluster_key": "H100-PCIe",
            "cluster_info": null,
            "sharding_cost_model": "alpha_beta",
            "comm_cost_model": "alpha_beta",
            "enable_pipeline_parallelism": false,
            "enable_shard_unbalanced_shape": false,
            "enable_shard_dynamic_shape": false,
            "enable_reduce_scatter": true,
            "builder_flags": null,
            "debug_mode": false,
            "infer_shape": true,
            "validation_mode": false,
            "same_buffer_io": {
                "past_key_value_(\\d+)": "present_key_value_\\1"
            },
            "same_spec_io": {},
            "sharded_io_allowlist": [
                "past_key_value_\\d+",
                "present_key_value_\\d*"
            ],
            "fill_weights": false,
            "parallel_config_cache": null,
            "profile_cache": null,
            "dump_path": null,
            "debug_outputs": []
        },
        "weight_sparsity": false,
        "weight_streaming": false,
        "plugin_config": {
            "dtype": "float16",
            "bert_attention_plugin": "auto",
            "gpt_attention_plugin": "float16",
            "gemm_plugin": "fp8",
            "gemm_swiglu_plugin": null,
            "fp8_rowwise_gemm_plugin": null,
            "smooth_quant_gemm_plugin": null,
            "identity_plugin": null,
            "layernorm_quantization_plugin": null,
            "rmsnorm_quantization_plugin": null,
            "nccl_plugin": "float16",
            "lookup_plugin": null,
            "lora_plugin": null,
            "weight_only_groupwise_quant_matmul_plugin": null,
            "weight_only_quant_matmul_plugin": null,
            "quantize_per_token_plugin": false,
            "quantize_tensor_plugin": false,
            "moe_plugin": "auto",
            "mamba_conv1d_plugin": "auto",
            "context_fmha": true,
            "bert_context_fmha_fp32_acc": false,
            "paged_kv_cache": true,
            "remove_input_padding": true,
            "reduce_fusion": false,
            "enable_xqa": true,
            "tokens_per_block": 64,
            "use_paged_context_fmha": true,
            "use_fp8_context_fmha": true,
            "multiple_profiles": true,
            "paged_state": false,
            "streamingllm": false,
            "manage_weights": false,
            "use_fused_mlp": true
        },
        "use_strip_plan": false,
        "max_encoder_input_len": 1024,
        "use_fused_mlp": true
    }
}
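The mapping block above declares world_size 2 (tensor parallelism of 2), which matches the two engine files rank0.engine and rank1.engine further below. A quick pre-flight check along these lines can catch a GPU-count mismatch before Triton is launched (a sketch; the path is illustrative and should match the local model repository layout):

# Read the engine build config and verify there are enough GPUs for its
# parallel mapping. Requires PyTorch only for the device count query.
import json
import torch

with open("tensorrt_llm/1/config.json") as f:
    cfg = json.load(f)

world_size = cfg["pretrained_config"]["mapping"]["world_size"]
print(f"engine expects {world_size} ranks (one serialized engine per rank)")
assert torch.cuda.device_count() >= world_size, "not enough GPUs for this engine"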
tensorrt_llm/1/model.py
ADDED
@@ -0,0 +1,947 @@
1 |
+
import datetime
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import sys
|
5 |
+
import time
|
6 |
+
from random import randint
|
7 |
+
from threading import Lock, Thread
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import torch
|
11 |
+
import triton_python_backend_utils as pb_utils
|
12 |
+
from torch import from_numpy
|
13 |
+
from torch.utils.dlpack import from_dlpack
|
14 |
+
|
15 |
+
import tensorrt_llm.bindings.executor as trtllm
|
16 |
+
|
17 |
+
|
18 |
+
def get_input_tensor_by_name(request,
|
19 |
+
name,
|
20 |
+
expected_batch_size=None,
|
21 |
+
batch_index=None):
|
22 |
+
tensor = pb_utils.get_input_tensor_by_name(request, name)
|
23 |
+
if tensor is None:
|
24 |
+
return None
|
25 |
+
|
26 |
+
if tensor.is_cpu():
|
27 |
+
tensor = tensor.as_numpy()
|
28 |
+
else:
|
29 |
+
tensor = from_dlpack(tensor.to_dlpack())
|
30 |
+
|
31 |
+
if expected_batch_size is not None and tensor.shape[
|
32 |
+
0] != expected_batch_size:
|
33 |
+
raise pb_utils.TritonModelException(
|
34 |
+
f"Expected batch size doesn't match batch size for tensor {name}. Expected {expected_batch_size} got {tensor.shape[0]}"
|
35 |
+
)
|
36 |
+
|
37 |
+
if batch_index is not None and expected_batch_size is not None and batch_index >= expected_batch_size:
|
38 |
+
raise pb_utils.TritonModelException(
|
39 |
+
f"Invalid batch index in get_input_tensor_by_name for {name}")
|
40 |
+
|
41 |
+
if batch_index is not None:
|
42 |
+
# Add leading 1 batch dimension
|
43 |
+
if isinstance(tensor, np.ndarray):
|
44 |
+
return np.expand_dims(tensor[batch_index], axis=0)
|
45 |
+
elif isinstance(tensor, torch.Tensor):
|
46 |
+
return torch.unsqueeze(tensor[batch_index], dim=0)
|
47 |
+
else:
|
48 |
+
return tensor
|
49 |
+
|
50 |
+
|
51 |
+
def get_input_scalar_by_name(request,
|
52 |
+
name,
|
53 |
+
expected_batch_size=1,
|
54 |
+
batch_index=0):
|
55 |
+
tensor = pb_utils.get_input_tensor_by_name(request, name)
|
56 |
+
if tensor is None:
|
57 |
+
return None
|
58 |
+
tensor = tensor.as_numpy()
|
59 |
+
|
60 |
+
if tensor.size != expected_batch_size:
|
61 |
+
raise pb_utils.TritonModelException(
|
62 |
+
f"Expected a scalar tensor for tensor {name}")
|
63 |
+
|
64 |
+
return tensor.item(batch_index)
|
65 |
+
|
66 |
+
|
67 |
+
def read_parameter_as_type(value, name, pytype=str):
|
68 |
+
if value == "":
|
69 |
+
return None
|
70 |
+
if value.startswith("${") and value.endswith("}"):
|
71 |
+
return None
|
72 |
+
if pytype is bool:
|
73 |
+
return value.lower() in ["1", "true"]
|
74 |
+
try:
|
75 |
+
result = pytype(value)
|
76 |
+
return result
|
77 |
+
except:
|
78 |
+
pb_utils.Logger.log_warning(
|
79 |
+
f"Could not read parameter '{name}' with value '{value}', will use default."
|
80 |
+
)
|
81 |
+
return None
|
82 |
+
|
83 |
+
|
84 |
+
def get_parameter(model_config, name, pytype=str):
|
85 |
+
if name not in model_config['parameters']:
|
86 |
+
return None
|
87 |
+
return read_parameter_as_type(
|
88 |
+
model_config['parameters'][name]['string_value'], name, pytype)
|
89 |
+
|
90 |
+
|
91 |
+
def convert_word_list(word_list):
|
92 |
+
if word_list is None:
|
93 |
+
return None
|
94 |
+
word_list = word_list.tolist()
|
95 |
+
if len(word_list) == 0 or len(word_list[0]) != 2:
|
96 |
+
raise pb_utils.TritonModelException(f"Invalid format for word list.")
|
97 |
+
words, indices = word_list[0]
|
98 |
+
result = []
|
99 |
+
current_index = 0
|
100 |
+
for i in indices:
|
101 |
+
if i == -1:
|
102 |
+
continue
|
103 |
+
if i > len(words):
|
104 |
+
raise pb_utils.TritonModelException(
|
105 |
+
f"Invalid format for word list.")
|
106 |
+
current_word = []
|
107 |
+
while current_index < i:
|
108 |
+
current_word.append(words[current_index])
|
109 |
+
current_index += 1
|
110 |
+
result.append(current_word)
|
111 |
+
return result
|
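For context, the stop/bad word tensors consumed by convert_word_list() above use the flattened [2, -1] layout produced by the preprocessing model: row 0 holds the concatenated token ids of all words, row 1 holds the cumulative end offsets, padded with -1. A small worked example with illustrative values:

# Worked example of the flattened word-list layout (values are made up).
import numpy as np

flat = np.array([[[10, 11, 12, 13],
                  [2, 4, -1, -1]]], dtype=np.int32)
# convert_word_list(flat) reconstructs the nested form: [[10, 11], [12, 13]]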
112 |
+
|
113 |
+
|
114 |
+
def parse_medusa_choices(medusa_choices):
|
115 |
+
if medusa_choices is None:
|
116 |
+
return None
|
117 |
+
try:
|
118 |
+
result = json.loads(
|
119 |
+
"[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
|
120 |
+
assert isinstance(result, list) and len(result) > 0
|
121 |
+
assert all([isinstance(x, list) for x in result])
|
122 |
+
assert all([isinstance(y, int) for x in result for y in x])
|
123 |
+
except Exception:
|
124 |
+
raise pb_utils.TritonModelException(
|
125 |
+
"Invalid format for medusa_choices")
|
126 |
+
return result
|
127 |
+
|
128 |
+
|
129 |
+
def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
|
130 |
+
kwargs = {}
|
131 |
+
kwargs['beam_width'] = get_input_scalar_by_name(
|
132 |
+
request, 'beam_width', batch_size, batch_index) or 1
|
133 |
+
kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k',
|
134 |
+
batch_size, batch_index)
|
135 |
+
kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p',
|
136 |
+
batch_size, batch_index)
|
137 |
+
kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[
|
138 |
+
'top_p'] <= 0 else kwargs['top_p']
|
139 |
+
kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed',
|
140 |
+
batch_size, batch_index)
|
141 |
+
kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature',
|
142 |
+
batch_size, batch_index)
|
143 |
+
kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length',
|
144 |
+
batch_size, batch_index)
|
145 |
+
kwargs['repetition_penalty'] = get_input_scalar_by_name(
|
146 |
+
request, 'repetition_penalty', batch_size, batch_index)
|
147 |
+
kwargs['presence_penalty'] = get_input_scalar_by_name(
|
148 |
+
request, 'presence_penalty', batch_size, batch_index)
|
149 |
+
kwargs['frequency_penalty'] = get_input_scalar_by_name(
|
150 |
+
request, 'frequency_penalty', batch_size, batch_index)
|
151 |
+
kwargs['length_penalty'] = get_input_scalar_by_name(
|
152 |
+
request, 'len_penalty', batch_size, batch_index)
|
153 |
+
kwargs['top_p_min'] = get_input_scalar_by_name(request,
|
154 |
+
'runtime_top_p_min',
|
155 |
+
batch_size, batch_index)
|
156 |
+
kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
|
157 |
+
request, 'runtime_top_p_reset_ids', batch_size, batch_index)
|
158 |
+
kwargs['top_p_decay'] = get_input_scalar_by_name(request,
|
159 |
+
'runtime_top_p_decay',
|
160 |
+
batch_size, batch_index)
|
161 |
+
kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
|
162 |
+
request, 'beam_search_diversity_rate', batch_size, batch_index)
|
163 |
+
kwargs['early_stopping'] = get_input_scalar_by_name(
|
164 |
+
request, 'early_stopping', batch_size, batch_index)
|
165 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
166 |
+
return trtllm.SamplingConfig(**kwargs)
|
167 |
+
|
168 |
+
|
169 |
+
def get_output_config_from_request(request,
|
170 |
+
exclude_input_from_output,
|
171 |
+
batch_size=1,
|
172 |
+
batch_index=0):
|
173 |
+
kwargs = {}
|
174 |
+
kwargs["return_log_probs"] = get_input_scalar_by_name(
|
175 |
+
request, 'return_log_probs', batch_size, batch_index)
|
176 |
+
kwargs["return_context_logits"] = get_input_scalar_by_name(
|
177 |
+
request, 'return_context_logits', batch_size, batch_index)
|
178 |
+
kwargs["return_generation_logits"] = get_input_scalar_by_name(
|
179 |
+
request, 'return_generation_logits', batch_size, batch_index)
|
180 |
+
kwargs["exclude_input_from_output"] = exclude_input_from_output
|
181 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
182 |
+
return trtllm.OutputConfig(**kwargs)
|
183 |
+
|
184 |
+
|
185 |
+
def get_external_draft_tokens_config_from_request(request,
|
186 |
+
batch_size=1,
|
187 |
+
batch_index=0):
|
188 |
+
kwargs = {}
|
189 |
+
draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids',
|
190 |
+
batch_size, batch_index)
|
191 |
+
if draft_input_ids is not None:
|
192 |
+
kwargs['tokens'] = draft_input_ids[0].tolist()
|
193 |
+
draft_logits = get_input_tensor_by_name(request, 'draft_logits',
|
194 |
+
batch_size, batch_index)
|
195 |
+
if draft_logits is not None:
|
196 |
+
kwargs['logits'] = from_numpy(draft_logits).squeeze()
|
197 |
+
kwargs['acceptance_threshold'] = get_input_scalar_by_name(
|
198 |
+
request, 'draft_acceptance_threshold', batch_size, batch_index)
|
199 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
200 |
+
if len(kwargs) > 0:
|
201 |
+
return trtllm.ExternalDraftTokensConfig(**kwargs)
|
202 |
+
return None
|
203 |
+
|
204 |
+
|
205 |
+
def get_prompt_tuning_config_from_request(request,
|
206 |
+
batch_size=1,
|
207 |
+
batch_index=0):
|
208 |
+
# prompt_vocab_size is unused by executor.
|
209 |
+
kwargs = {}
|
210 |
+
prompt_embedding_table = get_input_tensor_by_name(
|
211 |
+
request, 'prompt_embedding_table', batch_size, batch_index)
|
212 |
+
if prompt_embedding_table is not None:
|
213 |
+
if isinstance(prompt_embedding_table, np.ndarray):
|
214 |
+
kwargs["embedding_table"] = from_numpy(
|
215 |
+
prompt_embedding_table).squeeze()
|
216 |
+
elif isinstance(prompt_embedding_table, torch.Tensor):
|
217 |
+
kwargs["embedding_table"] = from_dlpack(
|
218 |
+
prompt_embedding_table.to_dlpack()).squeeze(dim=0)
|
219 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
220 |
+
if len(kwargs) > 0:
|
221 |
+
return trtllm.PromptTuningConfig(**kwargs)
|
222 |
+
return None
|
223 |
+
|
224 |
+
|
225 |
+
def get_lora_config_from_request(request, batch_size=1, batch_index=0):
|
226 |
+
kwargs = {}
|
227 |
+
kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id',
|
228 |
+
batch_size, batch_index)
|
229 |
+
lora_weights = get_input_tensor_by_name(request, 'lora_weights',
|
230 |
+
batch_size, batch_index)
|
231 |
+
if lora_weights is not None:
|
232 |
+
kwargs["weights"] = from_numpy(lora_weights).squeeze()
|
233 |
+
lora_config = get_input_tensor_by_name(request, 'lora_config', batch_size,
|
234 |
+
batch_index)
|
235 |
+
if lora_config is not None:
|
236 |
+
kwargs["config"] = from_numpy(lora_config).squeeze()
|
237 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
238 |
+
if len(kwargs) > 0:
|
239 |
+
return trtllm.LoraConfig(**kwargs)
|
240 |
+
return None
|
241 |
+
|
242 |
+
|
243 |
+
def convert_request(request, exclude_input_from_output, decoupled):
|
244 |
+
inputs = {}
|
245 |
+
input_token_ids = get_input_tensor_by_name(request, 'input_ids')
|
246 |
+
if input_token_ids is None:
|
247 |
+
raise pb_utils.TritonModelException(
|
248 |
+
"A value is required for input_ids")
|
249 |
+
if len(input_token_ids.shape) != 2:
|
250 |
+
raise pb_utils.TritonModelException(f"Invalid format for input_ids")
|
251 |
+
batch_size = input_token_ids.shape[0]
|
252 |
+
requests = []
|
253 |
+
for batch_index in range(0, batch_size):
|
254 |
+
input_token_ids = get_input_tensor_by_name(request, 'input_ids',
|
255 |
+
batch_size, batch_index)[0]
|
256 |
+
if input_token_ids is None:
|
257 |
+
raise pb_utils.TritonModelException(
|
258 |
+
"A value is required for input_ids")
|
259 |
+
input_token_ids = input_token_ids.tolist()
|
260 |
+
if len(input_token_ids) == 0:
|
261 |
+
raise pb_utils.TritonModelException(
|
262 |
+
f"Invalid format for input_ids")
|
263 |
+
|
264 |
+
input_length = get_input_scalar_by_name(request, 'input_lengths',
|
265 |
+
batch_size, batch_index)
|
266 |
+
if input_length is None:
|
267 |
+
input_length = len(input_token_ids)
|
268 |
+
# Trim input token ids with input_lengths
|
269 |
+
inputs['input_token_ids'] = input_token_ids[0:input_length]
|
270 |
+
|
271 |
+
inputs['max_new_tokens'] = get_input_scalar_by_name(
|
272 |
+
request, 'request_output_len', batch_size, batch_index)
|
273 |
+
if inputs['max_new_tokens'] is None:
|
274 |
+
raise pb_utils.TritonModelException(
|
275 |
+
"A value is required for request_output_len")
|
276 |
+
inputs['streaming'] = get_input_scalar_by_name(request, 'streaming',
|
277 |
+
batch_size, batch_index)
|
278 |
+
if inputs['streaming'] and not decoupled:
|
279 |
+
raise pb_utils.TritonModelException(
|
280 |
+
"Streaming is only supported in decoupled mode.")
|
281 |
+
inputs['end_id'] = get_input_scalar_by_name(request, 'end_id',
|
282 |
+
batch_size, batch_index)
|
283 |
+
inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id',
|
284 |
+
batch_size, batch_index)
|
285 |
+
inputs['stop_words'] = convert_word_list(
|
286 |
+
get_input_tensor_by_name(request, 'stop_words_list', batch_size,
|
287 |
+
batch_index))
|
288 |
+
inputs['bad_words'] = convert_word_list(
|
289 |
+
get_input_tensor_by_name(request, 'bad_words_list', batch_size,
|
290 |
+
batch_index))
|
291 |
+
embedding_bias = get_input_tensor_by_name(request, 'embedding_bias',
|
292 |
+
batch_size, batch_index)
|
293 |
+
if embedding_bias is not None and embedding_bias.size != 0:
|
294 |
+
inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze()
|
295 |
+
|
296 |
+
sampling_config = get_sampling_config_from_request(
|
297 |
+
request, batch_size, batch_index)
|
298 |
+
output_config = get_output_config_from_request(
|
299 |
+
request, exclude_input_from_output, batch_size, batch_index)
|
300 |
+
external_draft_tokens_config = get_external_draft_tokens_config_from_request(
|
301 |
+
request, batch_size, batch_index)
|
302 |
+
prompt_tuning_config = get_prompt_tuning_config_from_request(
|
303 |
+
request, batch_size, batch_index)
|
304 |
+
lora_config = get_lora_config_from_request(request, batch_size,
|
305 |
+
batch_index)
|
306 |
+
|
307 |
+
requests.append(
|
308 |
+
trtllm.Request(
|
309 |
+
**inputs,
|
310 |
+
sampling_config=sampling_config,
|
311 |
+
output_config=output_config,
|
312 |
+
external_draft_tokens_config=external_draft_tokens_config,
|
313 |
+
prompt_tuning_config=prompt_tuning_config,
|
314 |
+
lora_config=lora_config,
|
315 |
+
))
|
316 |
+
return requests
|
317 |
+
|
318 |
+
|
319 |
+
def convert_response(response, batch_index):
|
320 |
+
if response.has_error():
|
321 |
+
return pb_utils.InferenceResponse(output_tensors=[],
|
322 |
+
error=pb_utils.TritonError(
|
323 |
+
response.error_msg)), True
|
324 |
+
result = response.result
|
325 |
+
beam_lengths = np.expand_dims(
|
326 |
+
np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
|
327 |
+
max_beam_length = max([len(beam) for beam in result.output_token_ids])
|
328 |
+
output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
|
329 |
+
-1, np.int32)
|
330 |
+
for idx, beam in enumerate(result.output_token_ids):
|
331 |
+
output_ids[0, idx, :len(beam)] = beam
|
332 |
+
output_tensors = [
|
333 |
+
pb_utils.Tensor("output_ids", output_ids),
|
334 |
+
pb_utils.Tensor("sequence_length", beam_lengths),
|
335 |
+
]
|
336 |
+
output_tensors.append(
|
337 |
+
pb_utils.Tensor(
|
338 |
+
"cum_log_probs",
|
339 |
+
np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)
|
340 |
+
if result.cum_log_probs is not None else np.zeros(
|
341 |
+
(1, 1), np.float32)))
|
342 |
+
output_tensors.append(
|
343 |
+
pb_utils.Tensor(
|
344 |
+
"output_log_probs",
|
345 |
+
np.expand_dims(np.array(result.log_probs, np.float32), 0) if
|
346 |
+
result.log_probs is not None else np.zeros((1, 1, 1), np.float32)))
|
347 |
+
output_tensors.append(
|
348 |
+
pb_utils.Tensor(
|
349 |
+
"context_logits",
|
350 |
+
np.expand_dims(np.array(result.context_logits, np.float32), 0)
|
351 |
+
if result.context_logits is not None else np.zeros(
|
352 |
+
(1, 1, 1), np.float32)))
|
353 |
+
output_tensors.append(
|
354 |
+
pb_utils.Tensor(
|
355 |
+
"generation_logits",
|
356 |
+
np.expand_dims(np.array(result.generation_logits, np.float32), 0)
|
357 |
+
if result.generation_logits is not None else np.zeros(
|
358 |
+
(1, 1, 1, 1), np.float32)))
|
359 |
+
output_tensors.append(
|
360 |
+
pb_utils.Tensor("batch_index",
|
361 |
+
np.expand_dims(np.array([batch_index], np.int32), 0)))
|
362 |
+
|
363 |
+
return pb_utils.InferenceResponse(output_tensors), result.is_final
|
364 |
+
|
365 |
+
|
366 |
+
def convert_scheduler_policy(batch_scheduler_policy: str):
|
367 |
+
if batch_scheduler_policy.lower() == "max_utilization":
|
368 |
+
return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
|
369 |
+
elif batch_scheduler_policy.lower() == "guaranteed_no_evict":
|
370 |
+
return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
|
371 |
+
raise pb_utils.TritonModelException(
|
372 |
+
f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
|
373 |
+
)
|
374 |
+
|
375 |
+
|
376 |
+
def convert_batching_type(gpt_model_type: str):
|
377 |
+
if gpt_model_type is None:
|
378 |
+
return None
|
379 |
+
if gpt_model_type.lower(
|
380 |
+
) == "inflight_fused_batching" or gpt_model_type.lower(
|
381 |
+
) == "inflight_batching":
|
382 |
+
return trtllm.BatchingType.INFLIGHT
|
383 |
+
elif gpt_model_type.lower() == "v1":
|
384 |
+
return trtllm.BatchingType.STATIC
|
385 |
+
raise pb_utils.TritonModelException(
|
386 |
+
f"gpt_model_type value of '{gpt_model_type}' is not supported.")
|
387 |
+
|
388 |
+
|
389 |
+
def convert_decoding_mode(decoding_mode: str):
|
390 |
+
if decoding_mode is None:
|
391 |
+
return None
|
392 |
+
elif decoding_mode == "auto":
|
393 |
+
return trtllm.DecodingMode.Auto()
|
394 |
+
elif decoding_mode == "top_k":
|
395 |
+
return trtllm.DecodingMode.TopK()
|
396 |
+
elif decoding_mode == "top_p":
|
397 |
+
return trtllm.DecodingMode.TopP()
|
398 |
+
elif decoding_mode == "top_k_top_p":
|
399 |
+
return trtllm.DecodingMode.TopKTopP()
|
400 |
+
elif decoding_mode == "beam_search":
|
401 |
+
return trtllm.DecodingMode.BeamSearch()
|
402 |
+
elif decoding_mode == "medusa":
|
403 |
+
return trtllm.DecodingMode.Medusa()
|
404 |
+
raise pb_utils.TritonModelException(
|
405 |
+
f"decoding_mode value of '{decoding_mode}' is not supported.")
|
406 |
+
|
407 |
+
|
408 |
+
def convert_timestamp_to_seconds(timestamp: str):
|
409 |
+
return int(
|
410 |
+
datetime.datetime.strptime(timestamp,
|
411 |
+
"%m-%d-%Y %H:%M:%S.%f").timestamp())
|
412 |
+
|
413 |
+
|
414 |
+
class TritonPythonModel:
|
415 |
+
"""Your Python model must use the same class name. Every Python model
|
416 |
+
that is created must have "TritonPythonModel" as the class name.
|
417 |
+
"""
|
418 |
+
|
419 |
+
def get_scheduler_config(self, model_config):
|
420 |
+
batch_scheduler_policy = get_parameter(model_config,
|
421 |
+
"batch_scheduler_policy")
|
422 |
+
if batch_scheduler_policy is None:
|
423 |
+
return trtllm.SchedulerConfig()
|
424 |
+
return trtllm.SchedulerConfig(
|
425 |
+
convert_scheduler_policy(batch_scheduler_policy))
|
426 |
+
|
427 |
+
def get_kv_cache_config(self, model_config):
|
428 |
+
kwargs = {
|
429 |
+
"enable_block_reuse":
|
430 |
+
get_parameter(model_config, "enable_kv_cache_reuse", bool),
|
431 |
+
"max_tokens":
|
432 |
+
get_parameter(model_config, "max_tokens_in_paged_kv_cache", int),
|
433 |
+
"sink_token_length":
|
434 |
+
get_parameter(model_config, "sink_token_length", int),
|
435 |
+
"free_gpu_memory_fraction":
|
436 |
+
get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
|
437 |
+
float),
|
438 |
+
"host_cache_size":
|
439 |
+
get_parameter(model_config, "kv_cache_host_memory_bytes", int),
|
440 |
+
"onboard_blocks":
|
441 |
+
get_parameter(model_config, "kv_cache_onboard_blocks", bool),
|
442 |
+
}
|
443 |
+
max_attention_window_size = get_parameter(model_config,
|
444 |
+
"max_attention_window_size")
|
445 |
+
if max_attention_window_size:
|
446 |
+
kwargs["max_attention_window"] = [
|
447 |
+
int(x) for x in max_attention_window_size.split(",")
|
448 |
+
]
|
449 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
450 |
+
return trtllm.KvCacheConfig(**kwargs)
|
451 |
+
|
452 |
+
def get_parallel_config(self, model_config):
|
453 |
+
kwargs = {}
|
454 |
+
gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
|
455 |
+
if gpu_device_ids:
|
456 |
+
kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
|
457 |
+
self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
|
458 |
+
"0") == "1"
|
459 |
+
if self.use_orchestrator_mode:
|
460 |
+
kwargs[
|
461 |
+
"communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
|
462 |
+
worker_path = get_parameter(model_config, "worker_path")
|
463 |
+
if worker_path is not None:
|
464 |
+
raise pb_utils.TritonModelException(
|
465 |
+
"worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
|
466 |
+
)
|
467 |
+
executor_worker_path = get_parameter(model_config,
|
468 |
+
"executor_worker_path")
|
469 |
+
kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
|
470 |
+
True, executor_worker_path)
|
471 |
+
if len(kwargs) > 0:
|
472 |
+
return trtllm.ParallelConfig(**kwargs)
|
473 |
+
return None
|
474 |
+
|
475 |
+
def get_peft_cache_config(self, model_config):
|
476 |
+
kwargs = {
|
477 |
+
"optimal_adapter_size":
|
478 |
+
get_parameter(model_config, "lora_cache_optimal_adapter_size",
|
479 |
+
int),
|
480 |
+
"max_adapter_size":
|
481 |
+
get_parameter(model_config, "lora_cache_max_adapter_size", int),
|
482 |
+
"device_cache_percent":
|
483 |
+
get_parameter(model_config, "lora_cache_gpu_memory_fraction",
|
484 |
+
float),
|
485 |
+
"host_cache_size":
|
486 |
+
get_parameter(model_config, "lora_cache_host_memory_bytes", int),
|
487 |
+
}
|
488 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
489 |
+
return trtllm.PeftCacheConfig(**kwargs)
|
490 |
+
|
491 |
+
def get_decoding_config(self, model_config):
|
492 |
+
kwargs = {
|
493 |
+
"medusa_choices":
|
494 |
+
parse_medusa_choices(get_parameter(model_config,
|
495 |
+
"medusa_choices")),
|
496 |
+
"decoding_mode":
|
497 |
+
convert_decoding_mode(get_parameter(model_config,
|
498 |
+
"decoding_mode")),
|
499 |
+
}
|
500 |
+
print(kwargs)
|
501 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
502 |
+
return trtllm.DecodingConfig(**kwargs)
|
503 |
+
|
504 |
+
def get_extended_runtime_perf_knob_config(self, model_config):
|
505 |
+
kwargs = {
|
506 |
+
"multi_block_mode":
|
507 |
+
get_parameter(model_config, "multi_block_mode", bool),
|
508 |
+
"enable_context_fmha_fp32_acc":
|
509 |
+
get_parameter(model_config, "enable_context_fmha_fp32_acc", bool)
|
510 |
+
}
|
511 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
512 |
+
return trtllm.ExtendedRuntimePerfKnobConfig(**kwargs)
|
513 |
+
|
514 |
+
def get_executor_config(self, model_config):
|
515 |
+
kwargs = {
|
516 |
+
"max_beam_width":
|
517 |
+
get_parameter(model_config, "max_beam_width", int),
|
518 |
+
"scheduler_config":
|
519 |
+
self.get_scheduler_config(model_config),
|
520 |
+
"kv_cache_config":
|
521 |
+
self.get_kv_cache_config(model_config),
|
522 |
+
"enable_chunked_context":
|
523 |
+
get_parameter(model_config, "enable_chunked_context", bool),
|
524 |
+
"normalize_log_probs":
|
525 |
+
get_parameter(model_config, "normalize_log_probs", bool),
|
526 |
+
"batching_type":
|
527 |
+
convert_batching_type(get_parameter(model_config,
|
528 |
+
"gpt_model_type")),
|
529 |
+
"parallel_config":
|
530 |
+
self.get_parallel_config(model_config),
|
531 |
+
"peft_cache_config":
|
532 |
+
self.get_peft_cache_config(model_config),
|
533 |
+
"decoding_config":
|
534 |
+
self.get_decoding_config(model_config),
|
535 |
+
"max_queue_size":
|
536 |
+
model_config.get(
|
537 |
+
"dynamic_batching",
|
538 |
+
{},
|
539 |
+
).get(
|
540 |
+
"default_queue_policy",
|
541 |
+
{},
|
542 |
+
).get("max_queue_size"),
|
543 |
+
"extended_runtime_perf_knob_config":
|
544 |
+
self.get_extended_runtime_perf_knob_config(model_config)
|
545 |
+
}
|
546 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
547 |
+
return trtllm.ExecutorConfig(**kwargs)
|
548 |
+
|
549 |
+
def create_metrics(self, model: str, version: str, is_v1_model: bool):
|
550 |
+
self.request_metric_family = pb_utils.MetricFamily(
|
551 |
+
name="nv_trt_llm_request_metrics",
|
552 |
+
description="TRT LLM request metrics",
|
553 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
554 |
+
)
|
555 |
+
self.runtime_memory_metric_family = pb_utils.MetricFamily(
|
556 |
+
name="nv_trt_llm_runtime_memory_metrics",
|
557 |
+
description="TRT LLM runtime memory metrics",
|
558 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
559 |
+
)
|
560 |
+
self.kv_cache_metric_family = pb_utils.MetricFamily(
|
561 |
+
name="nv_trt_llm_kv_cache_block_metrics",
|
562 |
+
description="TRT LLM KV cache block metrics",
|
563 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
564 |
+
)
|
565 |
+
model_type = "v1" if is_v1_model else "inflight_batcher"
|
566 |
+
self.model_type_metric_family = pb_utils.MetricFamily(
|
567 |
+
name=f"nv_trt_llm_{model_type}_metrics",
|
568 |
+
description=f"TRT LLM {model_type}-specific metrics",
|
569 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
570 |
+
)
|
571 |
+
self.general_metric_family = pb_utils.MetricFamily(
|
572 |
+
name="nv_trt_llm_general_metrics",
|
573 |
+
description="General TRT LLM metrics",
|
574 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
575 |
+
)
|
576 |
+
common_labels = {"model": model, "version": version}
|
577 |
+
self.all_metrics = {
|
578 |
+
# Request metrics
|
579 |
+
"num_active_requests":
|
580 |
+
self.request_metric_family.Metric(labels={
|
581 |
+
"request_type": "active",
|
582 |
+
**common_labels
|
583 |
+
}),
|
584 |
+
"max_num_active_requests":
|
585 |
+
self.request_metric_family.Metric(labels={
|
586 |
+
"request_type": "max",
|
587 |
+
**common_labels
|
588 |
+
}),
|
589 |
+
"num_scheduled_requests":
|
590 |
+
self.request_metric_family.Metric(labels={
|
591 |
+
"request_type": "scheduled",
|
592 |
+
**common_labels
|
593 |
+
}),
|
594 |
+
"num_context_requests":
|
595 |
+
self.request_metric_family.Metric(labels={
|
596 |
+
"request_type": "context",
|
597 |
+
**common_labels
|
598 |
+
}),
|
599 |
+
# Runtime metrics
|
600 |
+
"cpu_mem_usage":
|
601 |
+
self.runtime_memory_metric_family.Metric(labels={
|
602 |
+
"memory_type": "cpu",
|
603 |
+
**common_labels
|
604 |
+
}),
|
605 |
+
"gpu_mem_usage":
|
606 |
+
self.runtime_memory_metric_family.Metric(labels={
|
607 |
+
"memory_type": "gpu",
|
608 |
+
**common_labels
|
609 |
+
}),
|
610 |
+
"pinned_mem_usage":
|
611 |
+
self.runtime_memory_metric_family.Metric(labels={
|
612 |
+
"memory_type": "pinned",
|
613 |
+
**common_labels
|
614 |
+
}),
|
615 |
+
# KV cache metrics
|
616 |
+
"max_num_blocks":
|
617 |
+
self.kv_cache_metric_family.Metric(labels={
|
618 |
+
"kv_cache_block_type": "max",
|
619 |
+
**common_labels
|
620 |
+
}),
|
621 |
+
"free_num_blocks":
|
622 |
+
self.kv_cache_metric_family.Metric(labels={
|
623 |
+
"kv_cache_block_type": "free",
|
624 |
+
**common_labels
|
625 |
+
}),
|
626 |
+
"used_num_blocks":
|
627 |
+
self.kv_cache_metric_family.Metric(labels={
|
628 |
+
"kv_cache_block_type": "used",
|
629 |
+
**common_labels
|
630 |
+
}),
|
631 |
+
"tokens_per_block":
|
632 |
+
self.kv_cache_metric_family.Metric(labels={
|
633 |
+
"kv_cache_block_type": "tokens_per",
|
634 |
+
**common_labels
|
635 |
+
}),
|
636 |
+
# General metrics
|
637 |
+
"timestamp":
|
638 |
+
self.general_metric_family.Metric(labels={
|
639 |
+
"general_type": "timestamp",
|
640 |
+
**common_labels
|
641 |
+
}),
|
642 |
+
"iter":
|
643 |
+
self.general_metric_family.Metric(labels={
|
644 |
+
"general_type": "iteration_counter",
|
645 |
+
**common_labels
|
646 |
+
}),
|
647 |
+
}
|
648 |
+
if is_v1_model:
|
649 |
+
self.all_metrics.update({
|
650 |
+
"num_ctx_tokens":
|
651 |
+
self.model_type_metric_family.Metric(labels={
|
652 |
+
"v1_specific_metric": "total_context_tokens",
|
653 |
+
**common_labels
|
654 |
+
}),
|
655 |
+
"num_gen_tokens":
|
656 |
+
self.model_type_metric_family.Metric(
|
657 |
+
labels={
|
658 |
+
"v1_specific_metric": "total_generation_tokens",
|
659 |
+
**common_labels
|
660 |
+
}),
|
661 |
+
"empty_gen_slots":
|
662 |
+
self.model_type_metric_family.Metric(
|
663 |
+
labels={
|
664 |
+
"v1_specific_metric": "empty_generation_slots",
|
665 |
+
**common_labels
|
666 |
+
}),
|
667 |
+
})
|
668 |
+
else:
|
669 |
+
self.all_metrics.update({
|
670 |
+
"num_ctx_tokens":
|
671 |
+
self.model_type_metric_family.Metric(
|
672 |
+
labels={
|
673 |
+
"inflight_batcher_specific_metric":
|
674 |
+
"total_context_tokens",
|
675 |
+
**common_labels
|
676 |
+
}),
|
677 |
+
"num_gen_requests":
|
678 |
+
self.model_type_metric_family.Metric(
|
679 |
+
labels={
|
680 |
+
"inflight_batcher_specific_metric":
|
681 |
+
"generation_requests",
|
682 |
+
**common_labels
|
683 |
+
}),
|
684 |
+
"micro_batch_id":
|
685 |
+
self.model_type_metric_family.Metric(
|
686 |
+
labels={
|
687 |
+
"inflight_batcher_specific_metric": "micro_batch_id",
|
688 |
+
**common_labels
|
689 |
+
}),
|
690 |
+
"num_paused_requests":
|
691 |
+
self.model_type_metric_family.Metric(
|
692 |
+
labels={
|
693 |
+
"inflight_batcher_specific_metric": "paused_requests",
|
694 |
+
**common_labels
|
695 |
+
}),
|
696 |
+
})
|
697 |
+
|
698 |
+
def initialize(self, args):
|
699 |
+
"""`initialize` is called only once when the model is being loaded.
|
700 |
+
Implementing `initialize` function is optional. This function allows
|
701 |
+
the model to initialize any state associated with this model.
|
702 |
+
|
703 |
+
Parameters
|
704 |
+
----------
|
705 |
+
args : dict
|
706 |
+
Both keys and values are strings. The dictionary keys and values are:
|
707 |
+
* model_config: A JSON string containing the model configuration
|
708 |
+
* model_instance_kind: A string containing model instance kind
|
709 |
+
* model_instance_device_id: A string containing model instance device ID
|
710 |
+
* model_repository: Model repository path
|
711 |
+
* model_version: Model version
|
712 |
+
* model_name: Model name
|
713 |
+
"""
|
714 |
+
model_config = json.loads(args['model_config'])
|
715 |
+
gpt_model_path = get_parameter(model_config, "gpt_model_path")
|
716 |
+
if get_parameter(model_config, "enable_trt_overlap", bool):
|
717 |
+
raise pb_utils.TritonModelException(
|
718 |
+
f"enable_trt_overlap=true is not supported.")
|
719 |
+
self.exclude_input_from_output = get_parameter(
|
720 |
+
model_config, "exclude_input_in_output", bool)
|
721 |
+
executor_config = self.get_executor_config(model_config)
|
722 |
+
self.executor = trtllm.Executor(gpt_model_path,
|
723 |
+
trtllm.ModelType.DECODER_ONLY,
|
724 |
+
executor_config)
|
725 |
+
self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
|
726 |
+
model_config)
|
727 |
+
self.cancellation_check_period_ms = get_parameter(
|
728 |
+
model_config, "cancellation_check_period_ms", int) or 100
|
729 |
+
self.stats_check_period_ms = get_parameter(
|
730 |
+
model_config, "stats_check_period_ms", int) or 100
|
731 |
+
|
732 |
+
if not self.decoupled:
|
733 |
+
raise pb_utils.TritonModelException(
|
734 |
+
"Please enable decoupled transaction policy in the model configuration to serve this model"
|
735 |
+
)
|
736 |
+
|
737 |
+
self.create_metrics(args["model_name"],
|
738 |
+
args["model_version"],
|
739 |
+
is_v1_model=executor_config.batching_type ==
|
740 |
+
trtllm.BatchingType.STATIC)
|
741 |
+
self.triton_user_id_to_req_ids = {}
|
742 |
+
self.triton_req_id_to_req_ids = {}
|
743 |
+
self.req_id_to_request_data = {}
|
744 |
+
self.lock = Lock()
|
745 |
+
self.running = False
|
746 |
+
self.awaiter_thread = Thread(target=self.awaiter_loop)
|
747 |
+
self.cancellation_thread = Thread(target=self.cancellation_loop)
|
748 |
+
self.metrics_thread = Thread(target=self.metrics_loop)
|
749 |
+
if self.executor.can_enqueue_requests():
|
750 |
+
self.running = True
|
751 |
+
self.awaiter_thread.start()
|
752 |
+
self.cancellation_thread.start()
|
753 |
+
self.metrics_thread.start()
|
754 |
+
else:
|
755 |
+
# In leader mode, worker ranks will wait here until leader is done.
|
756 |
+
self.executor.shutdown()
|
757 |
+
|
758 |
+
def handle_stop_request(self, triton_user_id, response_sender):
|
759 |
+
if triton_user_id is None or triton_user_id == "":
|
760 |
+
response_sender.send(
|
761 |
+
pb_utils.InferenceResponse(error=pb_utils.TritonError(
|
762 |
+
"A request id must be provided for request cancellation")),
|
763 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
764 |
+
return
|
765 |
+
|
766 |
+
with self.lock:
|
767 |
+
if triton_user_id in self.triton_user_id_to_req_ids:
|
768 |
+
req_ids = self.triton_user_id_to_req_ids[triton_user_id]
|
769 |
+
for req_id in req_ids:
|
770 |
+
self.executor.cancel_request(req_id)
|
771 |
+
|
772 |
+
response_sender.send(
|
773 |
+
pb_utils.InferenceResponse(),
|
774 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
775 |
+
|
776 |
+
def execute(self, requests):
|
777 |
+
"""`execute` must be implemented in every Python model. `execute`
|
778 |
+
function receives a list of pb_utils.InferenceRequest as the only
|
779 |
+
argument. This function is called when an inference is requested
|
780 |
+
for this model.
|
781 |
+
|
782 |
+
Parameters
|
783 |
+
----------
|
784 |
+
requests : list
|
785 |
+
A list of pb_utils.InferenceRequest
|
786 |
+
|
787 |
+
Returns
|
788 |
+
-------
|
789 |
+
list
|
790 |
+
A list of pb_utils.InferenceResponse. The length of this list must
|
791 |
+
be the same as `requests`
|
792 |
+
"""
|
793 |
+
if not self.executor.can_enqueue_requests():
|
794 |
+
return
|
795 |
+
|
796 |
+
# Convert to executor requests.
|
797 |
+
|
798 |
+
triton_requests = []
|
799 |
+
executor_requests = []
|
800 |
+
batch_indices = []
|
801 |
+
triton_user_ids = []
|
802 |
+
triton_req_ids = []
|
803 |
+
|
804 |
+
for request in requests:
|
805 |
+
|
806 |
+
triton_user_id = request.request_id()
|
807 |
+
|
808 |
+
response_sender = request.get_response_sender()
|
809 |
+
stop = get_input_scalar_by_name(request, 'stop')
|
810 |
+
|
811 |
+
if stop:
|
812 |
+
self.handle_stop_request(triton_user_id, response_sender)
|
813 |
+
else:
|
814 |
+
#Unique request id used to identify each triton request
|
815 |
+
triton_req_id = str(randint(0, sys.maxsize))
|
816 |
+
self.triton_req_id_to_req_ids[triton_req_id] = set()
|
817 |
+
if triton_user_id is not None and triton_user_id != "":
|
818 |
+
self.triton_user_id_to_req_ids[triton_user_id] = set()
|
819 |
+
|
820 |
+
try:
|
821 |
+
converted_reqs = convert_request(
|
822 |
+
request, self.exclude_input_from_output,
|
823 |
+
self.decoupled)
|
824 |
+
except Exception as e:
|
825 |
+
response_sender.send(
|
826 |
+
pb_utils.InferenceResponse(error=pb_utils.TritonError(
|
827 |
+
f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
|
828 |
+
)),
|
829 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
830 |
+
else:
|
831 |
+
for batch_index, converted_req in enumerate(
|
832 |
+
converted_reqs):
|
833 |
+
triton_requests.append(request)
|
834 |
+
executor_requests.append(converted_req)
|
835 |
+
triton_user_ids.append(triton_user_id)
|
836 |
+
triton_req_ids.append(triton_req_id)
|
837 |
+
batch_indices.append(batch_index)
|
838 |
+
|
839 |
+
with self.lock:
|
840 |
+
request_ids = self.executor.enqueue_requests(executor_requests)
|
841 |
+
for req_id, triton_req_id, triton_user_id, triton_request, batch_index in zip(
|
842 |
+
request_ids, triton_req_ids, triton_user_ids,
|
843 |
+
triton_requests, batch_indices):
|
844 |
+
self.req_id_to_request_data[
|
845 |
+
req_id] = triton_req_id, triton_user_id, batch_index, triton_request.get_response_sender(
|
846 |
+
)
|
847 |
+
self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
|
848 |
+
if triton_user_id is not None and triton_user_id != "":
|
849 |
+
self.triton_user_id_to_req_ids[triton_user_id].add(req_id)
|
850 |
+
|
851 |
+
return None
|
852 |
+
|
853 |
+
def awaiter_loop(self):
|
854 |
+
"""Gets responses from executor and returns the results."""
|
855 |
+
while self.running:
|
856 |
+
for response in self.executor.await_responses(
|
857 |
+
timeout=datetime.timedelta(milliseconds=1)):
|
858 |
+
req_id = response.request_id
|
859 |
+
with self.lock:
|
860 |
+
if req_id not in self.req_id_to_request_data:
|
861 |
+
continue
|
862 |
+
triton_req_id, triton_user_id, batch_index, response_sender = self.req_id_to_request_data[
|
863 |
+
req_id]
|
864 |
+
|
865 |
+
triton_response, is_final = convert_response(
|
866 |
+
response, batch_index)
|
867 |
+
|
868 |
+
triton_request_final = False
|
869 |
+
if is_final:
|
870 |
+
with self.lock:
|
871 |
+
# Check if all executor requests part of that triton request are finished
|
872 |
+
self.triton_req_id_to_req_ids[triton_req_id].remove(
|
873 |
+
req_id)
|
874 |
+
if len(self.triton_req_id_to_req_ids[triton_req_id]
|
875 |
+
) == 0:
|
876 |
+
pb_utils.Logger.log_info(
|
877 |
+
f"DELETING Req id {req_id}, triton_req_id {triton_req_id} "
|
878 |
+
)
|
879 |
+
triton_request_final = True
|
880 |
+
del self.triton_req_id_to_req_ids[triton_req_id]
|
881 |
+
if triton_user_id is not None and triton_user_id != "":
|
882 |
+
del self.triton_user_id_to_req_ids[
|
883 |
+
triton_user_id]
|
884 |
+
del self.req_id_to_request_data[req_id]
|
885 |
+
|
886 |
+
response_sender.send(
|
887 |
+
triton_response,
|
888 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
|
889 |
+
if triton_request_final else 0)
|
890 |
+
|
891 |
+
# Remove local reference so response_sender can be cleaned properly.
|
892 |
+
del response_sender
|
893 |
+
|
894 |
+
def cancellation_loop(self):
|
895 |
+
"""Checks if any pending requests have been cancelled."""
|
896 |
+
while self.running:
|
897 |
+
time.sleep(self.cancellation_check_period_ms / 1000.0)
|
898 |
+
with self.lock:
|
899 |
+
for req_id, (triton_req_id, triton_user_id, batch_index,
|
900 |
+
response_sender
|
901 |
+
) in self.req_id_to_request_data.items():
|
902 |
+
if response_sender.is_cancelled():
|
903 |
+
self.executor.cancel_request(req_id)
|
904 |
+
# Remove local reference so response_sender can be cleaned properly.
|
905 |
+
del response_sender
|
906 |
+
|
907 |
+
def metrics_loop(self):
|
908 |
+
"""Updates triton metrics using stats from the executor."""
|
909 |
+
while self.running:
|
910 |
+
time.sleep(self.stats_check_period_ms / 1000.0)
|
911 |
+
for stat in self.executor.get_latest_iteration_stats():
|
912 |
+
try:
|
913 |
+
for key, metric in self.all_metrics.items():
|
914 |
+
value = None
|
915 |
+
if hasattr(stat, key):
|
916 |
+
value = getattr(stat, key)
|
917 |
+
elif stat.kv_cache_stats is not None and hasattr(
|
918 |
+
stat.kv_cache_stats, key):
|
919 |
+
value = getattr(stat.kv_cache_stats, key)
|
920 |
+
elif stat.static_batching_stats is not None and hasattr(
|
921 |
+
stat.static_batching_stats, key):
|
922 |
+
value = getattr(stat.static_batching_stats, key)
|
923 |
+
elif stat.inflight_batching_stats is not None and hasattr(
|
924 |
+
stat.inflight_batching_stats, key):
|
925 |
+
value = getattr(stat.inflight_batching_stats, key)
|
926 |
+
if value is not None:
|
927 |
+
if key == "timestamp":
|
928 |
+
value = convert_timestamp_to_seconds(value)
|
929 |
+
metric.set(value)
|
930 |
+
else:
|
931 |
+
pb_utils.Logger.log_warn(
|
932 |
+
f"Metric \"{key}\" not found.")
|
933 |
+
except Exception as e:
|
934 |
+
pb_utils.Logger.log_warn(
|
935 |
+
f"Error while processing metrics: {e}")
|
936 |
+
|
937 |
+
def finalize(self):
|
938 |
+
"""`finalize` is called only once when the model is being unloaded.
|
939 |
+
Implementing `finalize` function is optional. This function allows
|
940 |
+
the model to perform any necessary clean ups before exit.
|
941 |
+
"""
|
942 |
+
if self.executor.can_enqueue_requests():
|
943 |
+
self.running = False
|
944 |
+
self.awaiter_thread.join()
|
945 |
+
self.cancellation_thread.join()
|
946 |
+
self.metrics_thread.join()
|
947 |
+
self.executor.shutdown()
|
tensorrt_llm/1/rank0.engine
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9b8418460f6786395ac4ace17e6dafad6c2b60a021fb247da853718db2c4fd13
size 1065214420
tensorrt_llm/1/rank1.engine
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c40598916cbd21bcfa434ae02004e9e8b1d6f50445a3f5bdd4bb3971634072cf
size 1065215172
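Both engine files are stored as Git LFS pointers, so a clone without LFS content will only contain the three-line pointers above. After fetching, the downloaded files can be checked against the recorded size and sha256 (a sketch; paths are relative to the repository root):

# Verify a fetched engine against its LFS pointer (values copied from above).
import hashlib
from pathlib import Path

def sha256_of(path, chunk=1 << 20):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

engine = Path("tensorrt_llm/1/rank1.engine")
assert engine.stat().st_size == 1065215172
assert sha256_of(engine) == "c40598916cbd21bcfa434ae02004e9e8b1d6f50445a3f5bdd4bb3971634072cf"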
tensorrt_llm/config.pbtxt
ADDED
@@ -0,0 +1,556 @@
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "tensorrt_llm"
|
28 |
+
backend: "tensorrtllm"
|
29 |
+
max_batch_size: 32
|
30 |
+
|
31 |
+
model_transaction_policy {
|
32 |
+
decoupled: True
|
33 |
+
}
|
34 |
+
|
35 |
+
input [
|
36 |
+
{
|
37 |
+
name: "input_ids"
|
38 |
+
data_type: TYPE_INT32
|
39 |
+
dims: [ -1 ]
|
40 |
+
allow_ragged_batch: true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
name: "input_lengths"
|
44 |
+
data_type: TYPE_INT32
|
45 |
+
dims: [ 1 ]
|
46 |
+
reshape: { shape: [ ] }
|
47 |
+
},
|
48 |
+
{
|
49 |
+
name: "request_output_len"
|
50 |
+
data_type: TYPE_INT32
|
51 |
+
dims: [ 1 ]
|
52 |
+
reshape: { shape: [ ] }
|
53 |
+
},
|
54 |
+
{
|
55 |
+
name: "draft_input_ids"
|
56 |
+
data_type: TYPE_INT32
|
57 |
+
dims: [ -1 ]
|
58 |
+
optional: true
|
59 |
+
allow_ragged_batch: true
|
60 |
+
},
|
61 |
+
{
|
62 |
+
name: "decoder_input_ids"
|
63 |
+
data_type: TYPE_INT32
|
64 |
+
dims: [ -1 ]
|
65 |
+
optional: true
|
66 |
+
allow_ragged_batch: true
|
67 |
+
},
|
68 |
+
{
|
69 |
+
name: "decoder_input_lengths"
|
70 |
+
data_type: TYPE_INT32
|
71 |
+
dims: [ 1 ]
|
72 |
+
optional: true
|
73 |
+
reshape: { shape: [ ] }
|
74 |
+
},
|
75 |
+
{
|
76 |
+
name: "draft_logits"
|
77 |
+
data_type: TYPE_FP32
|
78 |
+
dims: [ -1, -1 ]
|
79 |
+
optional: true
|
80 |
+
allow_ragged_batch: true
|
81 |
+
},
|
82 |
+
{
|
83 |
+
name: "draft_acceptance_threshold"
|
84 |
+
data_type: TYPE_FP32
|
85 |
+
dims: [ 1 ]
|
86 |
+
reshape: { shape: [ ] }
|
87 |
+
optional: true
|
88 |
+
},
|
89 |
+
{
|
90 |
+
name: "end_id"
|
91 |
+
data_type: TYPE_INT32
|
92 |
+
dims: [ 1 ]
|
93 |
+
reshape: { shape: [ ] }
|
94 |
+
optional: true
|
95 |
+
},
|
96 |
+
{
|
97 |
+
name: "pad_id"
|
98 |
+
data_type: TYPE_INT32
|
99 |
+
dims: [ 1 ]
|
100 |
+
reshape: { shape: [ ] }
|
101 |
+
optional: true
|
102 |
+
},
|
103 |
+
{
|
104 |
+
name: "stop_words_list"
|
105 |
+
data_type: TYPE_INT32
|
106 |
+
dims: [ 2, -1 ]
|
107 |
+
optional: true
|
108 |
+
allow_ragged_batch: true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
name: "bad_words_list"
|
112 |
+
data_type: TYPE_INT32
|
113 |
+
dims: [ 2, -1 ]
|
114 |
+
optional: true
|
115 |
+
allow_ragged_batch: true
|
116 |
+
},
|
117 |
+
{
|
118 |
+
name: "embedding_bias"
|
119 |
+
data_type: TYPE_FP32
|
120 |
+
dims: [ -1 ]
|
121 |
+
optional: true
|
122 |
+
allow_ragged_batch: true
|
123 |
+
},
|
124 |
+
{
|
125 |
+
name: "beam_width"
|
126 |
+
data_type: TYPE_INT32
|
127 |
+
dims: [ 1 ]
|
128 |
+
reshape: { shape: [ ] }
|
129 |
+
optional: true
|
130 |
+
},
|
131 |
+
{
|
132 |
+
name: "temperature"
|
133 |
+
data_type: TYPE_FP32
|
134 |
+
dims: [ 1 ]
|
135 |
+
reshape: { shape: [ ] }
|
136 |
+
optional: true
|
137 |
+
},
|
138 |
+
{
|
139 |
+
name: "runtime_top_k"
|
140 |
+
data_type: TYPE_INT32
|
141 |
+
dims: [ 1 ]
|
142 |
+
reshape: { shape: [ ] }
|
143 |
+
optional: true
|
144 |
+
},
|
145 |
+
{
|
146 |
+
name: "runtime_top_p"
|
147 |
+
data_type: TYPE_FP32
|
148 |
+
dims: [ 1 ]
|
149 |
+
reshape: { shape: [ ] }
|
150 |
+
optional: true
|
151 |
+
},
|
152 |
+
{
|
153 |
+
name: "runtime_top_p_min"
|
154 |
+
data_type: TYPE_FP32
|
155 |
+
dims: [ 1 ]
|
156 |
+
reshape: { shape: [ ] }
|
157 |
+
optional: true
|
158 |
+
},
|
159 |
+
{
|
160 |
+
name: "runtime_top_p_decay"
|
161 |
+
data_type: TYPE_FP32
|
162 |
+
dims: [ 1 ]
|
163 |
+
reshape: { shape: [ ] }
|
164 |
+
optional: true
|
165 |
+
},
|
166 |
+
{
|
167 |
+
name: "runtime_top_p_reset_ids"
|
168 |
+
data_type: TYPE_INT32
|
169 |
+
dims: [ 1 ]
|
170 |
+
reshape: { shape: [ ] }
|
171 |
+
optional: true
|
172 |
+
},
|
173 |
+
{
|
174 |
+
name: "len_penalty"
|
175 |
+
data_type: TYPE_FP32
|
176 |
+
dims: [ 1 ]
|
177 |
+
reshape: { shape: [ ] }
|
178 |
+
optional: true
|
179 |
+
},
|
180 |
+
{
|
181 |
+
name: "early_stopping"
|
182 |
+
data_type: TYPE_BOOL
|
183 |
+
dims: [ 1 ]
|
184 |
+
reshape: { shape: [ ] }
|
185 |
+
optional: true
|
186 |
+
},
|
187 |
+
{
|
188 |
+
name: "repetition_penalty"
|
189 |
+
data_type: TYPE_FP32
|
190 |
+
dims: [ 1 ]
|
191 |
+
reshape: { shape: [ ] }
|
192 |
+
optional: true
|
193 |
+
},
|
194 |
+
{
|
195 |
+
name: "min_length"
|
196 |
+
data_type: TYPE_INT32
|
197 |
+
dims: [ 1 ]
|
198 |
+
reshape: { shape: [ ] }
|
199 |
+
optional: true
|
200 |
+
},
|
201 |
+
{
|
202 |
+
name: "beam_search_diversity_rate"
|
203 |
+
data_type: TYPE_FP32
|
204 |
+
dims: [ 1 ]
|
205 |
+
reshape: { shape: [ ] }
|
206 |
+
optional: true
|
207 |
+
},
|
208 |
+
{
|
209 |
+
name: "presence_penalty"
|
210 |
+
data_type: TYPE_FP32
|
211 |
+
dims: [ 1 ]
|
212 |
+
reshape: { shape: [ ] }
|
213 |
+
optional: true
|
214 |
+
},
|
215 |
+
{
|
216 |
+
name: "frequency_penalty"
|
217 |
+
data_type: TYPE_FP32
|
218 |
+
dims: [ 1 ]
|
219 |
+
reshape: { shape: [ ] }
|
220 |
+
optional: true
|
221 |
+
},
|
222 |
+
{
|
223 |
+
name: "random_seed"
|
224 |
+
data_type: TYPE_UINT64
|
225 |
+
dims: [ 1 ]
|
226 |
+
reshape: { shape: [ ] }
|
227 |
+
optional: true
|
228 |
+
},
|
229 |
+
{
|
230 |
+
name: "return_log_probs"
|
231 |
+
data_type: TYPE_BOOL
|
232 |
+
dims: [ 1 ]
|
233 |
+
reshape: { shape: [ ] }
|
234 |
+
optional: true
|
235 |
+
},
|
236 |
+
{
|
237 |
+
name: "return_context_logits"
|
238 |
+
data_type: TYPE_BOOL
|
239 |
+
dims: [ 1 ]
|
240 |
+
reshape: { shape: [ ] }
|
241 |
+
optional: true
|
242 |
+
},
|
243 |
+
{
|
244 |
+
name: "return_generation_logits"
|
245 |
+
data_type: TYPE_BOOL
|
246 |
+
dims: [ 1 ]
|
247 |
+
reshape: { shape: [ ] }
|
248 |
+
optional: true
|
249 |
+
},
|
250 |
+
{
|
251 |
+
name: "stop"
|
252 |
+
data_type: TYPE_BOOL
|
253 |
+
dims: [ 1 ]
|
254 |
+
reshape: { shape: [ ] }
|
255 |
+
optional: true
|
256 |
+
},
|
257 |
+
{
|
258 |
+
name: "streaming"
|
259 |
+
data_type: TYPE_BOOL
|
260 |
+
dims: [ 1 ]
|
261 |
+
reshape: { shape: [ ] }
|
262 |
+
optional: true
|
263 |
+
},
|
264 |
+
{
|
265 |
+
name: "prompt_embedding_table"
|
266 |
+
data_type: TYPE_FP16
|
267 |
+
dims: [ -1, -1 ]
|
268 |
+
optional: true
|
269 |
+
allow_ragged_batch: true
|
270 |
+
},
|
271 |
+
{
|
272 |
+
name: "prompt_vocab_size"
|
273 |
+
data_type: TYPE_INT32
|
274 |
+
dims: [ 1 ]
|
275 |
+
reshape: { shape: [ ] }
|
276 |
+
optional: true
|
277 |
+
},
|
278 |
+
# the unique task ID for the given LoRA.
|
279 |
+
# To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights` and `lora_config` must all be given.
|
280 |
+
# The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
|
281 |
+
# If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
|
282 |
+
{
|
283 |
+
name: "lora_task_id"
|
284 |
+
data_type: TYPE_UINT64
|
285 |
+
dims: [ 1 ]
|
286 |
+
reshape: { shape: [ ] }
|
287 |
+
optional: true
|
288 |
+
},
|
289 |
+
# weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
|
290 |
+
# where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
|
291 |
+
# each of the in / out tensors are first flattened and then concatenated together in the format above.
|
292 |
+
# D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
|
293 |
+
{
|
294 |
+
name: "lora_weights"
|
295 |
+
data_type: TYPE_FP16
|
296 |
+
dims: [ -1, -1 ]
|
297 |
+
optional: true
|
298 |
+
allow_ragged_batch: true
|
299 |
+
},
|
300 |
+
# module identifier (same size as the first dimension of lora_weights)
|
301 |
+
# See LoraModule::ModuleType for model id mapping
|
302 |
+
#
|
303 |
+
# "attn_qkv": 0 # compbined qkv adapter
|
304 |
+
# "attn_q": 1 # q adapter
|
305 |
+
# "attn_k": 2 # k adapter
|
306 |
+
# "attn_v": 3 # v adapter
|
307 |
+
# "attn_dense": 4 # adapter for the dense layer in attention
|
308 |
+
# "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
|
309 |
+
# "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
|
310 |
+
# "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
|
311 |
+
#
|
312 |
+
# last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
|
313 |
+
{
|
314 |
+
name: "lora_config"
|
315 |
+
data_type: TYPE_INT32
|
316 |
+
dims: [ -1, 3 ]
|
317 |
+
optional: true
|
318 |
+
allow_ragged_batch: true
|
319 |
+
}
|
320 |
+
]
|
321 |
+
output [
|
322 |
+
{
|
323 |
+
name: "output_ids"
|
324 |
+
data_type: TYPE_INT32
|
325 |
+
dims: [ -1, -1 ]
|
326 |
+
},
|
327 |
+
{
|
328 |
+
name: "sequence_length"
|
329 |
+
data_type: TYPE_INT32
|
330 |
+
dims: [ -1 ]
|
331 |
+
},
|
332 |
+
{
|
333 |
+
name: "cum_log_probs"
|
334 |
+
data_type: TYPE_FP32
|
335 |
+
dims: [ -1 ]
|
336 |
+
},
|
337 |
+
{
|
338 |
+
name: "output_log_probs"
|
339 |
+
data_type: TYPE_FP32
|
340 |
+
dims: [ -1, -1 ]
|
341 |
+
},
|
342 |
+
{
|
343 |
+
name: "context_logits"
|
344 |
+
data_type: TYPE_FP32
|
345 |
+
dims: [ -1, -1 ]
|
346 |
+
},
|
347 |
+
{
|
348 |
+
name: "generation_logits"
|
349 |
+
data_type: TYPE_FP32
|
350 |
+
dims: [ -1, -1, -1 ]
|
351 |
+
},
|
352 |
+
{
|
353 |
+
name: "batch_index"
|
354 |
+
data_type: TYPE_INT32
|
355 |
+
dims: [ 1 ]
|
356 |
+
}
|
357 |
+
]
|
358 |
+
instance_group [
|
359 |
+
{
|
360 |
+
count: 1
|
361 |
+
kind : KIND_CPU
|
362 |
+
}
|
363 |
+
]
|
364 |
+
parameters: {
|
365 |
+
key: "max_beam_width"
|
366 |
+
value: {
|
367 |
+
string_value: "1"
|
368 |
+
}
|
369 |
+
}
|
370 |
+
parameters: {
|
371 |
+
key: "FORCE_CPU_ONLY_INPUT_TENSORS"
|
372 |
+
value: {
|
373 |
+
string_value: "no"
|
374 |
+
}
|
375 |
+
}
|
376 |
+
parameters: {
|
377 |
+
key: "gpt_model_type"
|
378 |
+
value: {
|
379 |
+
string_value: "inflight_fused_batching"
|
380 |
+
}
|
381 |
+
}
|
382 |
+
parameters: {
|
383 |
+
key: "gpt_model_path"
|
384 |
+
value: {
|
385 |
+
string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
|
386 |
+
}
|
387 |
+
}
|
388 |
+
parameters: {
|
389 |
+
key: "encoder_model_path"
|
390 |
+
value: {
|
391 |
+
string_value: "${encoder_engine_dir}"
|
392 |
+
}
|
393 |
+
}
|
394 |
+
parameters: {
|
395 |
+
key: "max_tokens_in_paged_kv_cache"
|
396 |
+
value: {
|
397 |
+
string_value: "${max_tokens_in_paged_kv_cache}"
|
398 |
+
}
|
399 |
+
}
|
400 |
+
parameters: {
|
401 |
+
key: "max_attention_window_size"
|
402 |
+
value: {
|
403 |
+
string_value: "${max_attention_window_size}"
|
404 |
+
}
|
405 |
+
}
|
406 |
+
parameters: {
|
407 |
+
key: "sink_token_length"
|
408 |
+
value: {
|
409 |
+
string_value: "${sink_token_length}"
|
410 |
+
}
|
411 |
+
}
|
412 |
+
parameters: {
|
413 |
+
key: "batch_scheduler_policy"
|
414 |
+
value: {
|
415 |
+
string_value: "guaranteed_no_evict"
|
416 |
+
}
|
417 |
+
}
|
418 |
+
parameters: {
|
419 |
+
key: "kv_cache_free_gpu_mem_fraction"
|
420 |
+
value: {
|
421 |
+
string_value: "0.1"
|
422 |
+
}
|
423 |
+
}
|
424 |
+
parameters: {
|
425 |
+
key: "kv_cache_host_memory_bytes"
|
426 |
+
value: {
|
427 |
+
string_value: "${kv_cache_host_memory_bytes}"
|
428 |
+
}
|
429 |
+
}
|
430 |
+
parameters: {
|
431 |
+
key: "kv_cache_onboard_blocks"
|
432 |
+
value: {
|
433 |
+
string_value: "${kv_cache_onboard_blocks}"
|
434 |
+
}
|
435 |
+
}
|
436 |
+
# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
|
437 |
+
# parameters: {
|
438 |
+
# key: "enable_trt_overlap"
|
439 |
+
# value: {
|
440 |
+
# string_value: "${enable_trt_overlap}"
|
441 |
+
# }
|
442 |
+
# }
|
443 |
+
parameters: {
|
444 |
+
key: "exclude_input_in_output"
|
445 |
+
value: {
|
446 |
+
string_value: "True"
|
447 |
+
}
|
448 |
+
}
|
449 |
+
parameters: {
|
450 |
+
key: "cancellation_check_period_ms"
|
451 |
+
value: {
|
452 |
+
string_value: "${cancellation_check_period_ms}"
|
453 |
+
}
|
454 |
+
}
|
455 |
+
parameters: {
|
456 |
+
key: "stats_check_period_ms"
|
457 |
+
value: {
|
458 |
+
string_value: "${stats_check_period_ms}"
|
459 |
+
}
|
460 |
+
}
|
461 |
+
parameters: {
|
462 |
+
key: "iter_stats_max_iterations"
|
463 |
+
value: {
|
464 |
+
string_value: "${iter_stats_max_iterations}"
|
465 |
+
}
|
466 |
+
}
|
467 |
+
parameters: {
|
468 |
+
key: "request_stats_max_iterations"
|
469 |
+
value: {
|
470 |
+
string_value: "${request_stats_max_iterations}"
|
471 |
+
}
|
472 |
+
}
|
473 |
+
parameters: {
|
474 |
+
key: "enable_kv_cache_reuse"
|
475 |
+
value: {
|
476 |
+
string_value: "True"
|
477 |
+
}
|
478 |
+
}
|
479 |
+
parameters: {
|
480 |
+
key: "normalize_log_probs"
|
481 |
+
value: {
|
482 |
+
string_value: "${normalize_log_probs}"
|
483 |
+
}
|
484 |
+
}
|
485 |
+
parameters: {
|
486 |
+
key: "enable_chunked_context"
|
487 |
+
value: {
|
488 |
+
string_value: "${enable_chunked_context}"
|
489 |
+
}
|
490 |
+
}
|
491 |
+
parameters: {
|
492 |
+
key: "gpu_device_ids"
|
493 |
+
value: {
|
494 |
+
string_value: "0,1"
|
495 |
+
}
|
496 |
+
}
|
497 |
+
parameters: {
|
498 |
+
key: "lora_cache_optimal_adapter_size"
|
499 |
+
value: {
|
500 |
+
string_value: "${lora_cache_optimal_adapter_size}"
|
501 |
+
}
|
502 |
+
}
|
503 |
+
parameters: {
|
504 |
+
key: "lora_cache_max_adapter_size"
|
505 |
+
value: {
|
506 |
+
string_value: "${lora_cache_max_adapter_size}"
|
507 |
+
}
|
508 |
+
}
|
509 |
+
parameters: {
|
510 |
+
key: "lora_cache_gpu_memory_fraction"
|
511 |
+
value: {
|
512 |
+
string_value: "${lora_cache_gpu_memory_fraction}"
|
513 |
+
}
|
514 |
+
}
|
515 |
+
parameters: {
|
516 |
+
key: "lora_cache_host_memory_bytes"
|
517 |
+
value: {
|
518 |
+
string_value: "${lora_cache_host_memory_bytes}"
|
519 |
+
}
|
520 |
+
}
|
521 |
+
parameters: {
|
522 |
+
key: "decoding_mode"
|
523 |
+
value: {
|
524 |
+
string_value: "top_k_top_p"
|
525 |
+
}
|
526 |
+
}
|
527 |
+
parameters: {
|
528 |
+
key: "executor_worker_path"
|
529 |
+
value: {
|
530 |
+
string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
|
531 |
+
}
|
532 |
+
}
|
533 |
+
parameters: {
|
534 |
+
key: "medusa_choices"
|
535 |
+
value: {
|
536 |
+
string_value: "${medusa_choices}"
|
537 |
+
}
|
538 |
+
}
|
539 |
+
parameters: {
|
540 |
+
key: "gpu_weights_percent"
|
541 |
+
value: {
|
542 |
+
string_value: "${gpu_weights_percent}"
|
543 |
+
}
|
544 |
+
}
|
545 |
+
parameters: {
|
546 |
+
key: "enable_context_fmha_fp32_acc"
|
547 |
+
value: {
|
548 |
+
string_value: "${enable_context_fmha_fp32_acc}"
|
549 |
+
}
|
550 |
+
}
|
551 |
+
parameters: {
|
552 |
+
key: "multi_block_mode"
|
553 |
+
value: {
|
554 |
+
string_value: "${multi_block_mode}"
|
555 |
+
}
|
556 |
+
}
|
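Because this config sets `model_transaction_policy { decoupled: True }`, responses can arrive asynchronously, so clients typically use Triton's streaming gRPC API rather than a plain HTTP infer call. A minimal sketch of such a client for the `tensorrt_llm` model defined above follows; the server URL, the token ids and the requested output length are placeholder assumptions, not values taken from this repository.

import queue

import numpy as np
import tritonclient.grpc as grpcclient


def int32_input(name, values):
    # Helper: wrap a nested Python list as an INT32 input tensor.
    arr = np.array(values, dtype=np.int32)
    tensor = grpcclient.InferInput(name, arr.shape, "INT32")
    tensor.set_data_from_numpy(arr)
    return tensor


responses = queue.Queue()


def on_response(result, error):
    # The stream callback receives either a result or an error per response.
    responses.put(error if error is not None else result)


client = grpcclient.InferenceServerClient(url="localhost:8001")  # assumed default gRPC port
client.start_stream(callback=on_response)

client.async_stream_infer(
    model_name="tensorrt_llm",
    inputs=[
        int32_input("input_ids", [[1, 15043, 3186]]),   # placeholder token ids
        int32_input("input_lengths", [[3]]),
        int32_input("request_output_len", [[64]]),
    ],
)

first = responses.get()
client.stop_stream()
if not isinstance(first, Exception):
    print(first.as_numpy("output_ids"))

Since `exclude_input_in_output` is set to "True" above, `output_ids` holds only the newly generated tokens, not the echoed prompt.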
tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (3.15 kB).
|
|
tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc
ADDED
Binary file (10.3 kB).
|
|
tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc
ADDED
Binary file (11.5 kB).
|
|
tensorrt_llm_bls/1/lib/decode.py
ADDED
@@ -0,0 +1,386 @@
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
from collections.abc import Generator
|
28 |
+
from dataclasses import dataclass
|
29 |
+
from typing import Optional
|
30 |
+
|
31 |
+
import numpy as np
|
32 |
+
import torch
|
33 |
+
|
34 |
+
|
35 |
+
class RequestValidationError(Exception):
|
36 |
+
pass
|
37 |
+
|
38 |
+
|
39 |
+
def _validate_that(condition: bool, msg: str):
|
40 |
+
if not condition:
|
41 |
+
raise RequestValidationError(msg)
|
42 |
+
|
43 |
+
|
44 |
+
def _validate_non_empty(data, msg: str):
|
45 |
+
if isinstance(data, torch.Tensor):
|
46 |
+
_validate_that(data is not None and data.numel() > 0, msg)
|
47 |
+
else:
|
48 |
+
_validate_that(data is not None and data.size > 0, msg)
|
49 |
+
|
50 |
+
|
51 |
+
def _validate_single_gt_0(data, msg: str):
|
52 |
+
_validate_non_empty(data, msg)
|
53 |
+
_validate_that(data.flatten()[0] > 0, msg)
|
54 |
+
|
55 |
+
|
56 |
+
def _single_value(data: Optional[np.ndarray]):
|
57 |
+
if data is None:
|
58 |
+
return None
|
59 |
+
return data.flatten()[0]
|
60 |
+
|
61 |
+
|
62 |
+
@dataclass
|
63 |
+
class Request:
|
64 |
+
text_input: np.ndarray = np.array([])
|
65 |
+
decoder_text_input: np.ndarray = None
|
66 |
+
image_input: Optional[np.ndarray] = None
|
67 |
+
max_tokens: Optional[np.ndarray] = None
|
68 |
+
bad_words: Optional[np.ndarray] = None
|
69 |
+
stop_words: Optional[np.ndarray] = None
|
70 |
+
end_id: Optional[np.ndarray] = None
|
71 |
+
pad_id: Optional[np.ndarray] = None
|
72 |
+
top_k: Optional[np.ndarray] = None
|
73 |
+
top_p: Optional[np.ndarray] = None
|
74 |
+
temperature: Optional[np.ndarray] = None
|
75 |
+
length_penalty: Optional[np.ndarray] = None
|
76 |
+
repetition_penalty: Optional[np.ndarray] = None
|
77 |
+
min_length: Optional[np.ndarray] = None
|
78 |
+
return_log_probs: Optional[np.ndarray] = None
|
79 |
+
prompt_embedding_table: Optional[np.ndarray] = None
|
80 |
+
prompt_vocab_size: Optional[np.ndarray] = None
|
81 |
+
embedding_bias_words: Optional[np.ndarray] = None
|
82 |
+
embedding_bias_weights: Optional[np.ndarray] = None
|
83 |
+
num_draft_tokens: Optional[np.ndarray] = None
|
84 |
+
use_draft_logits: Optional[np.ndarray] = None
|
85 |
+
stream: Optional[np.ndarray] = None
|
86 |
+
beam_width: Optional[np.ndarray] = None
|
87 |
+
return_context_logits: Optional[np.ndarray] = None
|
88 |
+
return_generation_logits: Optional[np.ndarray] = None
|
89 |
+
random_seed: Optional[np.ndarray] = None
|
90 |
+
presence_penalty: Optional[np.ndarray] = None
|
91 |
+
frequency_penalty: Optional[np.ndarray] = None
|
92 |
+
|
93 |
+
def validate(self):
|
94 |
+
_validate_non_empty(self.text_input, "text_input is required")
|
95 |
+
_validate_single_gt_0(self.max_tokens,
|
96 |
+
"max_tokens must be a single value > 0")
|
97 |
+
|
98 |
+
num_draft_tokens = _single_value(self.num_draft_tokens)
|
99 |
+
_single_value(self.return_generation_logits)
|
100 |
+
context_logits = _single_value(self.return_context_logits)
|
101 |
+
|
102 |
+
if num_draft_tokens:
|
103 |
+
_validate_that(
|
104 |
+
not self.stream.any(),
|
105 |
+
"streaming is not supported with speculative decoding")
|
106 |
+
_validate_that(
|
107 |
+
not context_logits,
|
108 |
+
"context logits are not supported with speculative decoding")
|
109 |
+
|
110 |
+
|
111 |
+
@dataclass
|
112 |
+
class DraftRequest:
|
113 |
+
draft_input_ids: Optional[np.ndarray] = None
|
114 |
+
draft_logits: Optional[np.ndarray] = None
|
115 |
+
|
116 |
+
|
117 |
+
@dataclass
|
118 |
+
class PreprocResponse:
|
119 |
+
input_ids: np.ndarray = np.array([])
|
120 |
+
decoder_input_ids: np.ndarray = None
|
121 |
+
input_lengths: np.ndarray = np.array([])
|
122 |
+
decoder_input_lengths: np.ndarray = None
|
123 |
+
bad_words_list: Optional[np.ndarray] = None
|
124 |
+
stop_words_list: Optional[np.ndarray] = None
|
125 |
+
embedding_bias: Optional[np.ndarray] = None
|
126 |
+
end_id: Optional[np.ndarray] = None
|
127 |
+
pad_id: Optional[np.ndarray] = None
|
128 |
+
|
129 |
+
@classmethod
|
130 |
+
def with_new_inputs(cls,
|
131 |
+
other,
|
132 |
+
input_ids: Optional[np.ndarray] = None,
|
133 |
+
input_lengths: Optional[np.ndarray] = None):
|
134 |
+
return cls(input_ids=(input_ids
|
135 |
+
if input_ids is not None else other.input_ids),
|
136 |
+
input_lengths=(input_lengths if input_lengths is not None
|
137 |
+
else other.input_lengths),
|
138 |
+
decoder_input_ids=other.decoder_input_ids,
|
139 |
+
decoder_input_lengths=other.decoder_input_lengths,
|
140 |
+
bad_words_list=other.bad_words_list,
|
141 |
+
stop_words_list=other.stop_words_list,
|
142 |
+
end_id=other.end_id,
|
143 |
+
pad_id=other.pad_id)
|
144 |
+
|
145 |
+
|
146 |
+
@dataclass
|
147 |
+
class MultimodalEncResponse:
|
148 |
+
prompt_embedding_table: Optional[torch.Tensor] = None
|
149 |
+
prompt_vocab_size: Optional[np.ndarray] = None
|
150 |
+
|
151 |
+
|
152 |
+
@dataclass
|
153 |
+
class GenerationResponse:
|
154 |
+
output_ids: np.ndarray = np.array([])
|
155 |
+
sequence_length: np.ndarray = np.array([])
|
156 |
+
cum_log_probs: Optional[np.ndarray] = None
|
157 |
+
output_log_probs: Optional[np.ndarray] = None
|
158 |
+
context_logits: Optional[np.ndarray] = None
|
159 |
+
generation_logits: Optional[np.ndarray] = None
|
160 |
+
batch_index: Optional[np.ndarray] = None
|
161 |
+
|
162 |
+
|
163 |
+
@dataclass
|
164 |
+
class Response:
|
165 |
+
text_output: np.ndarray = np.array([])
|
166 |
+
cum_log_probs: Optional[np.ndarray] = None
|
167 |
+
output_log_probs: Optional[np.ndarray] = None
|
168 |
+
context_logits: Optional[np.ndarray] = None
|
169 |
+
generation_logits: Optional[np.ndarray] = None
|
170 |
+
batch_index: Optional[np.ndarray] = None
|
171 |
+
|
172 |
+
def __eq__(self, o) -> bool:
|
173 |
+
"""Just for testing"""
|
174 |
+
if not isinstance(o, Response):
|
175 |
+
return False
|
176 |
+
return (np.array_equal(self.text_output, o.text_output)
|
177 |
+
and np.array_equal(self.cum_log_probs, o.cum_log_probs)
|
178 |
+
and np.array_equal(self.output_log_probs, o.output_log_probs)
|
179 |
+
and np.array_equal(self.context_logits, o.context_logits)
|
180 |
+
and np.array_equal(self.generation_logits, o.generation_logits)
|
181 |
+
and np.array_equal(self.batch_index, o.batch_index))
|
182 |
+
|
183 |
+
|
184 |
+
class Decoder:
|
185 |
+
|
186 |
+
def __init__(self, streaming=False, accumulate=False):
|
187 |
+
self._streaming = streaming
|
188 |
+
self._accumulate = accumulate
|
189 |
+
|
190 |
+
self._accumulated_tokens = []
|
191 |
+
|
192 |
+
def decode(self,
|
193 |
+
request: Request,
|
194 |
+
speculative_decoding=False,
|
195 |
+
is_multimodal=False) -> Generator[Response, None, None]:
|
196 |
+
|
197 |
+
batch_size = request.text_input.shape[0]
|
198 |
+
self._accumulated_tokens = [None] * batch_size
|
199 |
+
preproc_response = self.preprocess(request)
|
200 |
+
|
201 |
+
multimodal_enc_response = None
|
202 |
+
if is_multimodal:
|
203 |
+
multimodal_enc_response = self._multimodal_enc_generate(request)
|
204 |
+
|
205 |
+
if speculative_decoding:
|
206 |
+
if batch_size > 1:
|
207 |
+
raise Exception(
|
208 |
+
"speculative decoding is not supported with batch size > 1"
|
209 |
+
)
|
210 |
+
for gen_response in self._spec_generate(preproc_response, request):
|
211 |
+
yield self.postprocess(gen_response, batch_size)
|
212 |
+
else:
|
213 |
+
if not self._streaming and batch_size == 1:
|
214 |
+
gen_response = self._generate_non_streaming(
|
215 |
+
preproc_response,
|
216 |
+
request,
|
217 |
+
multimodal_enc_response=multimodal_enc_response)
|
218 |
+
yield self.postprocess(gen_response, batch_size)
|
219 |
+
else:
|
220 |
+
for gen_response in self._generate(
|
221 |
+
preproc_response,
|
222 |
+
request,
|
223 |
+
multimodal_enc_response=multimodal_enc_response):
|
224 |
+
yield self.postprocess(gen_response, batch_size)
|
225 |
+
|
226 |
+
def encountered_stop_words(self, input_ids, stop_words_ids):
|
227 |
+
for stop_word_ids in stop_words_ids:
|
228 |
+
if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids):
|
229 |
+
return True
|
230 |
+
return False
|
231 |
+
|
232 |
+
def _spec_generate(
|
233 |
+
self, preproc: PreprocResponse,
|
234 |
+
request: Request) -> Generator[GenerationResponse, None, None]:
|
235 |
+
|
236 |
+
if preproc.input_ids.shape[0] > 1:
|
237 |
+
raise Exception(
|
238 |
+
"Speculative decoding does not support batch size > 1.")
|
239 |
+
|
240 |
+
prompt_input_ids: np.ndarray = preproc.input_ids[0]
|
241 |
+
input_ids: np.ndarray = prompt_input_ids
|
242 |
+
output_len: int = request.max_tokens[0][0]
|
243 |
+
last_input_ids: np.ndarray = None
|
244 |
+
draft_output_ids: np.ndarray = None
|
245 |
+
draft_logits: np.ndarray = None
|
246 |
+
|
247 |
+
target_response: GenerationResponse = None
|
248 |
+
|
249 |
+
cur_preproc = preproc
|
250 |
+
|
251 |
+
counter = 0
|
252 |
+
while True:
|
253 |
+
counter += 1
|
254 |
+
num_draft_tokens = min(
|
255 |
+
request.num_draft_tokens[0][0],
|
256 |
+
len(prompt_input_ids) + output_len - len(input_ids) - 1)
|
257 |
+
|
258 |
+
draft_request = None
|
259 |
+
if num_draft_tokens > 0:
|
260 |
+
draft_response: GenerationResponse = self._draft_generate_non_streaming(
|
261 |
+
cur_preproc, request, num_draft_tokens)
|
262 |
+
seq_len: int = draft_response.sequence_length[0][0]
|
263 |
+
# [1, beamWidth, outputLength] -> [outputLen]
|
264 |
+
draft_output_ids = draft_response.output_ids[0][0]
|
265 |
+
# [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded]
|
266 |
+
if request.use_draft_logits is not None and request.use_draft_logits[
|
267 |
+
0]:
|
268 |
+
if draft_response.generation_logits is not None:
|
269 |
+
draft_logits = draft_response.generation_logits[0][0]
|
270 |
+
|
271 |
+
input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
|
272 |
+
draft_request = DraftRequest(
|
273 |
+
draft_input_ids=np.expand_dims(input_draft_tokens, 0))
|
274 |
+
if request.use_draft_logits is not None and request.use_draft_logits[
|
275 |
+
0]:
|
276 |
+
draft_request.draft_logits = np.expand_dims(
|
277 |
+
draft_logits[-len(input_draft_tokens):], 0)
|
278 |
+
else:
|
279 |
+
draft_request = DraftRequest()
|
280 |
+
target_response = self._generate_non_streaming(
|
281 |
+
cur_preproc, request, draft_request)
|
282 |
+
last_input_ids = input_ids
|
283 |
+
input_ids = target_response.output_ids[0][0]
|
284 |
+
cur_preproc = PreprocResponse.with_new_inputs(
|
285 |
+
cur_preproc, np.expand_dims(input_ids, 0),
|
286 |
+
np.array([[len(input_ids)]], dtype=np.int32))
|
287 |
+
|
288 |
+
# Evaluate criteria to stop generation loop.
|
289 |
+
# If we've hit or exceeded the max output length, we should stop
|
290 |
+
length_stop = (len(input_ids) >=
|
291 |
+
len(prompt_input_ids) + output_len)
|
292 |
+
if length_stop:
|
293 |
+
break
|
294 |
+
# If draft and target have same outputs, should stop. Normally target should return 1 more token.
|
295 |
+
# If they are the same length, they should differ at the last token
|
296 |
+
target_draft_equal = draft_output_ids is not None and np.array_equal(
|
297 |
+
draft_output_ids, input_ids)
|
298 |
+
if target_draft_equal:
|
299 |
+
break
|
300 |
+
# If tokens no longer change we should stop; it means early stopping was hit
|
301 |
+
last_current_equal = np.array_equal(last_input_ids, input_ids)
|
302 |
+
if last_current_equal:
|
303 |
+
break
|
304 |
+
# Need to check if stop words were encountered
|
305 |
+
hit_stop_words = self.encountered_stop_words(
|
306 |
+
input_ids, preproc.stop_words_list[0])
|
307 |
+
if hit_stop_words:
|
308 |
+
break
|
309 |
+
|
310 |
+
yield target_response
|
311 |
+
|
312 |
+
def _draft_generate_non_streaming(
|
313 |
+
self, preproc: PreprocResponse, request: Request,
|
314 |
+
num_draft_tokens: int) -> GenerationResponse:
|
315 |
+
raise NotImplementedError()
|
316 |
+
|
317 |
+
def _multimodal_enc_generate(
|
318 |
+
self,
|
319 |
+
request: Request,
|
320 |
+
) -> MultimodalEncResponse:
|
321 |
+
raise NotImplementedError()
|
322 |
+
|
323 |
+
def _generate(
|
324 |
+
self,
|
325 |
+
preproc: PreprocResponse,
|
326 |
+
request: Request,
|
327 |
+
draft_request: Optional[DraftRequest] = None,
|
328 |
+
multimodal_enc_response: Optional[MultimodalEncResponse] = None,
|
329 |
+
) -> Generator[GenerationResponse, None, None]:
|
330 |
+
raise NotImplementedError()
|
331 |
+
|
332 |
+
def _generate_non_streaming(
|
333 |
+
self,
|
334 |
+
preproc: PreprocResponse,
|
335 |
+
request: Request,
|
336 |
+
draft_request: Optional[DraftRequest] = None,
|
337 |
+
multimodal_enc_response: Optional[MultimodalEncResponse] = None,
|
338 |
+
) -> GenerationResponse:
|
339 |
+
raise NotImplementedError()
|
340 |
+
|
341 |
+
def postprocess(self, gen_response: GenerationResponse,
|
342 |
+
batch_size) -> Response:
|
343 |
+
if self._accumulate and self._streaming:
|
344 |
+
new_tokens: np.ndarray = gen_response.output_ids
|
345 |
+
if new_tokens.ndim != 3:
|
346 |
+
raise Exception("Expected output_ids tensor to have 3 dims.")
|
347 |
+
if new_tokens.shape[0] != 1:
|
348 |
+
raise Exception("Expected batch size of 1")
|
349 |
+
if new_tokens.shape[1] != 1:
|
350 |
+
raise Exception(
|
351 |
+
"Accumulation of tokens is only implemented for beam width = 1"
|
352 |
+
)
|
353 |
+
|
354 |
+
batch_index = gen_response.batch_index
|
355 |
+
if batch_index.ndim != 2:
|
356 |
+
raise Exception("Expected batch_index tensor to have 2 dims.")
|
357 |
+
if batch_index.shape[0] != 1:
|
358 |
+
raise Exception("Expected batch size of 1")
|
359 |
+
if batch_index.shape[1] != 1:
|
360 |
+
raise Exception("Expected only one batch_index")
|
361 |
+
|
362 |
+
batch_index = batch_index[0][0]
|
363 |
+
|
364 |
+
self._accumulated_tokens[batch_index] = new_tokens if (
|
365 |
+
self._accumulated_tokens[batch_index] is None
|
366 |
+
) else np.concatenate(
|
367 |
+
(self._accumulated_tokens[batch_index], new_tokens), axis=2)
|
368 |
+
sequence_lengths = np.array(
|
369 |
+
[[self._accumulated_tokens[batch_index].shape[2]]],
|
370 |
+
dtype=np.int32)
|
371 |
+
return self._postprocess(self._accumulated_tokens[batch_index],
|
372 |
+
sequence_lengths, gen_response)
|
373 |
+
else:
|
374 |
+
return self._postprocess(gen_response.output_ids, None,
|
375 |
+
gen_response)
|
376 |
+
|
377 |
+
def _postprocess(self, tokens: np.ndarray,
|
378 |
+
sequence_lengths: Optional[np.ndarray],
|
379 |
+
gen_response: GenerationResponse) -> Response:
|
380 |
+
raise NotImplementedError()
|
381 |
+
|
382 |
+
def preprocess(self, request: Request) -> PreprocResponse:
|
383 |
+
raise NotImplementedError()
|
384 |
+
|
385 |
+
def reset_decoder(self):
|
386 |
+
self._accumulated_tokens = []
|
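decode.py above depends only on numpy and torch, so the control flow of `Decoder.decode` can be exercised outside Triton by filling in the abstract hooks. The toy subclass below fakes tokenization, generation and detokenization purely to show how `preprocess`, `_generate_non_streaming` and `_postprocess` are chained for a non-streaming, batch-size-1 request; the class name and the fake character-id scheme are illustrative assumptions, not part of this repository.

import numpy as np

from lib.decode import (Decoder, GenerationResponse, PreprocResponse,
                        Request, Response)


class ToyDecoder(Decoder):

    def preprocess(self, request: Request) -> PreprocResponse:
        # Fake tokenization: one id per character of the first (and only) input.
        text = str(request.text_input[0][0])
        ids = np.array([[ord(c) for c in text]], dtype=np.int32)
        lengths = np.array([[ids.shape[1]]], dtype=np.int32)
        return PreprocResponse(input_ids=ids, input_lengths=lengths)

    def _generate_non_streaming(self, preproc, request, draft_request=None,
                                multimodal_enc_response=None) -> GenerationResponse:
        # Fake generation: echo the prompt ids back as a [batch, beam, length] tensor.
        output_ids = np.expand_dims(preproc.input_ids, 1)
        seq_len = np.array([[output_ids.shape[2]]], dtype=np.int32)
        return GenerationResponse(output_ids=output_ids, sequence_length=seq_len)

    def _postprocess(self, tokens, sequence_lengths, gen_response) -> Response:
        # Fake detokenization: turn the ids back into characters.
        text = "".join(chr(int(t)) for t in tokens[0][0])
        return Response(text_output=np.array([[text]], dtype=object))


request = Request(text_input=np.array([["hello"]], dtype=object),
                  max_tokens=np.array([[8]], dtype=np.int32))
for response in ToyDecoder().decode(request):
    print(response.text_output)   # [['hello']]

In the real deployment these three hooks are provided by TritonDecoder (next file), which forwards each step to the preprocessing, tensorrt_llm and postprocessing models over BLS calls.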
tensorrt_llm_bls/1/lib/triton_decoder.py
ADDED
@@ -0,0 +1,523 @@
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
from collections.abc import Callable
|
28 |
+
from typing import Dict, Optional
|
29 |
+
|
30 |
+
import numpy as np
|
31 |
+
import triton_python_backend_utils as pb_utils
|
32 |
+
from lib.decode import *
|
33 |
+
from torch.utils.dlpack import from_dlpack, to_dlpack
|
34 |
+
from typing_extensions import override
|
35 |
+
|
36 |
+
|
37 |
+
class TritonDecoder(Decoder):
|
38 |
+
|
39 |
+
def __init__(self,
|
40 |
+
streaming=False,
|
41 |
+
accumulate=False,
|
42 |
+
preproc_model_name="preprocessing",
|
43 |
+
postproc_model_name="postprocessing",
|
44 |
+
llm_model_name="tensorrt_llm",
|
45 |
+
draft_llm_model_name: Optional[str] = None,
|
46 |
+
multimodal_encoders_name: Optional[str] = None):
|
47 |
+
super().__init__(streaming=streaming, accumulate=accumulate)
|
48 |
+
self.preproc_model_name = preproc_model_name
|
49 |
+
self.postproc_model_name = postproc_model_name
|
50 |
+
self.llm_model_name = llm_model_name
|
51 |
+
self.draft_llm_model_name = draft_llm_model_name
|
52 |
+
self.multimodal_encoders_name = multimodal_encoders_name
|
53 |
+
|
54 |
+
self._preproc_outputs = [
|
55 |
+
"INPUT_ID",
|
56 |
+
"DECODER_INPUT_ID",
|
57 |
+
"REQUEST_INPUT_LEN",
|
58 |
+
"REQUEST_DECODER_INPUT_LEN",
|
59 |
+
"BAD_WORDS_IDS",
|
60 |
+
"STOP_WORDS_IDS",
|
61 |
+
"EMBEDDING_BIAS",
|
62 |
+
"OUT_PAD_ID",
|
63 |
+
"OUT_END_ID",
|
64 |
+
]
|
65 |
+
|
66 |
+
self._multimodal_enc_outputs = [
|
67 |
+
"OUT_PROMPT_EMBEDDING_TABLE", "OUT_PROMPT_VOCAB_SIZE"
|
68 |
+
]
|
69 |
+
|
70 |
+
self._llm_outputs = [
|
71 |
+
"output_ids", "sequence_length", "cum_log_probs",
|
72 |
+
"output_log_probs", "context_logits", "generation_logits",
|
73 |
+
"batch_index"
|
74 |
+
]
|
75 |
+
|
76 |
+
self._postproc_outputs = [
|
77 |
+
"OUTPUT",
|
78 |
+
]
|
79 |
+
|
80 |
+
self.input_names = [
|
81 |
+
"text_input",
|
82 |
+
"decoder_text_input",
|
83 |
+
"image_input",
|
84 |
+
"max_tokens",
|
85 |
+
"bad_words",
|
86 |
+
"stop_words",
|
87 |
+
"end_id",
|
88 |
+
"pad_id",
|
89 |
+
"top_k",
|
90 |
+
"top_p",
|
91 |
+
"temperature",
|
92 |
+
"length_penalty",
|
93 |
+
"repetition_penalty",
|
94 |
+
"min_length",
|
95 |
+
"presence_penalty",
|
96 |
+
"frequency_penalty",
|
97 |
+
"random_seed",
|
98 |
+
"return_log_probs",
|
99 |
+
"return_context_logits",
|
100 |
+
"return_generation_logits",
|
101 |
+
"beam_width",
|
102 |
+
"stream",
|
103 |
+
"prompt_embedding_table",
|
104 |
+
"prompt_vocab_size",
|
105 |
+
"embedding_bias_words",
|
106 |
+
"embedding_bias_weights",
|
107 |
+
"num_draft_tokens",
|
108 |
+
"use_draft_logits",
|
109 |
+
]
|
110 |
+
|
111 |
+
self.__undo_reshape_whitelist = {
|
112 |
+
"max_tokens",
|
113 |
+
"end_id",
|
114 |
+
"pad_id",
|
115 |
+
"top_k",
|
116 |
+
"top_p",
|
117 |
+
"temperature",
|
118 |
+
"length_penalty",
|
119 |
+
"repetition_penalty",
|
120 |
+
"min_length",
|
121 |
+
"presence_penalty",
|
122 |
+
"frequency_penalty",
|
123 |
+
"random_seed",
|
124 |
+
"return_log_probs",
|
125 |
+
"return_context_logits",
|
126 |
+
"return_generation_logits",
|
127 |
+
"beam_width",
|
128 |
+
"stream",
|
129 |
+
"prompt_vocab_size",
|
130 |
+
"num_draft_tokens",
|
131 |
+
"use_draft_logits",
|
132 |
+
}
|
133 |
+
|
134 |
+
def _exec_triton_request(self, request):
|
135 |
+
responses = request.exec(decoupled=True)
|
136 |
+
for r in responses:
|
137 |
+
if r.has_error():
|
138 |
+
raise pb_utils.TritonModelException(r.error().message())
|
139 |
+
yield r
|
140 |
+
|
141 |
+
def _exec_triton_request_single(self, request):
|
142 |
+
responses = request.exec(decoupled=False)
|
143 |
+
if responses.has_error():
|
144 |
+
raise pb_utils.TritonModelException(responses.error().message())
|
145 |
+
return responses
|
146 |
+
|
147 |
+
def create_triton_response(self, response: Response):
|
148 |
+
name_map = {
|
149 |
+
"text_output": "text_output",
|
150 |
+
"cum_log_probs": "cum_log_probs",
|
151 |
+
"output_log_probs": "output_log_probs",
|
152 |
+
"context_logits": "context_logits",
|
153 |
+
"generation_logits": "generation_logits",
|
154 |
+
"batch_index": "batch_index"
|
155 |
+
}
|
156 |
+
tensors = self.create_triton_tensors(response, name_map)
|
157 |
+
return pb_utils.InferenceResponse(output_tensors=tensors)
|
158 |
+
|
159 |
+
def convert_triton_request(self, triton_request) -> Request:
|
160 |
+
request = Request()
|
161 |
+
for triton_name in self.input_names:
|
162 |
+
tensor = pb_utils.get_input_tensor_by_name(triton_request,
|
163 |
+
triton_name)
|
164 |
+
target_name = triton_name
|
165 |
+
if tensor is None:
|
166 |
+
continue
|
167 |
+
if not hasattr(request, target_name):
|
168 |
+
raise AttributeError(
|
169 |
+
f"Request has no attribute '{target_name}'")
|
170 |
+
setattr(request, target_name, tensor.as_numpy())
|
171 |
+
return request
|
172 |
+
|
173 |
+
def convert_triton_response(self,
|
174 |
+
triton_response,
|
175 |
+
response_factory: Callable,
|
176 |
+
name_map=None):
|
177 |
+
response = response_factory()
|
178 |
+
for tensor in triton_response.output_tensors():
|
179 |
+
if tensor is None:
|
180 |
+
continue
|
181 |
+
triton_name = tensor.name()
|
182 |
+
if tensor.is_cpu():
|
183 |
+
value = tensor.as_numpy()
|
184 |
+
else:
|
185 |
+
# If the tensor is in GPU memory make it torch.Tensor type
|
186 |
+
value = from_dlpack(tensor.to_dlpack())
|
187 |
+
target_name = triton_name
|
188 |
+
if name_map and triton_name in name_map:
|
189 |
+
target_name = name_map[triton_name]
|
190 |
+
if name_map and triton_name not in name_map:
|
191 |
+
continue
|
192 |
+
if target_name is None:
|
193 |
+
# explicitly ignore this triton input
|
194 |
+
continue
|
195 |
+
if not hasattr(response, target_name):
|
196 |
+
raise AttributeError(
|
197 |
+
f"response object has not attribute '{target_name}'")
|
198 |
+
setattr(response, target_name, value)
|
199 |
+
return response
|
200 |
+
|
201 |
+
def __undo_reshape(self, x, name):
|
202 |
+
if name in self.__undo_reshape_whitelist and len(x.shape) == 1:
|
203 |
+
# handle reshapes
|
204 |
+
return np.expand_dims(x, 0)
|
205 |
+
else:
|
206 |
+
return x
|
207 |
+
|
208 |
+
def create_triton_tensors(self, obj, name_map: dict):
|
209 |
+
tensors = []
|
210 |
+
for name, triton_name in name_map.items():
|
211 |
+
if triton_name is None:
|
212 |
+
continue
|
213 |
+
value = getattr(obj, name)
|
214 |
+
if value is None:
|
215 |
+
continue
|
216 |
+
if isinstance(value, np.ndarray):
|
217 |
+
t = pb_utils.Tensor(triton_name,
|
218 |
+
self.__undo_reshape(value, name))
|
219 |
+
elif isinstance(value, torch.Tensor):
|
220 |
+
t = pb_utils.Tensor.from_dlpack(
|
221 |
+
triton_name, to_dlpack(self.__undo_reshape(value, name)))
|
222 |
+
tensors.append(t)
|
223 |
+
return tensors
|
224 |
+
|
225 |
+
@override
|
226 |
+
def preprocess(self, request: Request) -> PreprocResponse:
|
227 |
+
input_tensors = self._get_preproc_tensors(request)
|
228 |
+
triton_req = pb_utils.InferenceRequest(
|
229 |
+
model_name=self.preproc_model_name,
|
230 |
+
inputs=input_tensors,
|
231 |
+
requested_output_names=self._preproc_outputs)
|
232 |
+
triton_output = self._exec_triton_request_single(triton_req)
|
233 |
+
return self._get_preproc_response(triton_output)
|
234 |
+
|
235 |
+
def _get_preproc_tensors(self, request: Request):
|
236 |
+
name_map = {
|
237 |
+
"text_input": "QUERY",
|
238 |
+
"decoder_text_input": "DECODER_QUERY",
|
239 |
+
"max_tokens": "REQUEST_OUTPUT_LEN",
|
240 |
+
"bad_words": "BAD_WORDS_DICT",
|
241 |
+
"stop_words": "STOP_WORDS_DICT",
|
242 |
+
"embedding_bias_words": "EMBEDDING_BIAS_WORDS",
|
243 |
+
"embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS",
|
244 |
+
"pad_id": "PAD_ID",
|
245 |
+
"end_id": "END_ID",
|
246 |
+
}
|
247 |
+
return self.create_triton_tensors(request, name_map)
|
248 |
+
|
249 |
+
def _get_preproc_response(self, triton_output):
|
250 |
+
name_map = {
|
251 |
+
"INPUT_ID": "input_ids",
|
252 |
+
"DECODER_INPUT_ID": "decoder_input_ids",
|
253 |
+
"REQUEST_INPUT_LEN": "input_lengths",
|
254 |
+
"REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
|
255 |
+
"BAD_WORDS_IDS": "bad_words_list",
|
256 |
+
"STOP_WORDS_IDS": "stop_words_list",
|
257 |
+
"EMBEDDING_BIAS": "embedding_bias",
|
258 |
+
"OUT_PAD_ID": "pad_id",
|
259 |
+
"OUT_END_ID": "end_id",
|
260 |
+
}
|
261 |
+
return self.convert_triton_response(triton_output, PreprocResponse,
|
262 |
+
name_map)
|
263 |
+
|
264 |
+
@override
|
265 |
+
def _multimodal_enc_generate(self,
|
266 |
+
request: Request) -> MultimodalEncResponse:
|
267 |
+
input_tensors = self._get_multimodal_enc_tensors(request)
|
268 |
+
triton_req = pb_utils.InferenceRequest(
|
269 |
+
model_name=self.multimodal_encoders_name,
|
270 |
+
inputs=input_tensors,
|
271 |
+
requested_output_names=self._multimodal_enc_outputs)
|
272 |
+
triton_output = self._exec_triton_request_single(triton_req)
|
273 |
+
return self._get_multimodal_enc_response(triton_output)
|
274 |
+
|
275 |
+
def _get_multimodal_enc_tensors(self, preproc: PreprocResponse):
|
276 |
+
name_map = {
|
277 |
+
"image_input": "IMAGE",
|
278 |
+
}
|
279 |
+
return self.create_triton_tensors(preproc, name_map)
|
280 |
+
|
281 |
+
def _get_multimodal_enc_response(self, triton_output):
|
282 |
+
name_map = {
|
283 |
+
"OUT_PROMPT_EMBEDDING_TABLE": "prompt_embedding_table",
|
284 |
+
"OUT_PROMPT_VOCAB_SIZE": "prompt_vocab_size",
|
285 |
+
}
|
286 |
+
return self.convert_triton_response(triton_output,
|
287 |
+
MultimodalEncResponse, name_map)
|
288 |
+
|
289 |
+
@override
|
290 |
+
def _draft_generate_non_streaming(
|
291 |
+
self, preproc: PreprocResponse, request: Request,
|
292 |
+
num_draft_tokens: int) -> GenerationResponse:
|
293 |
+
input_tensors = self._get_llm_tensors(preproc, request,
|
294 |
+
num_draft_tokens, None, True)
|
295 |
+
triton_req = pb_utils.InferenceRequest(
|
296 |
+
model_name=self.draft_llm_model_name,
|
297 |
+
inputs=input_tensors,
|
298 |
+
requested_output_names=self._llm_outputs)
|
299 |
+
triton_response = self._exec_triton_request_single(triton_req)
|
300 |
+
llm_response = self._get_llm_response(triton_response)
|
301 |
+
return llm_response
|
302 |
+
|
303 |
+
@override
|
304 |
+
def _generate(
|
305 |
+
self,
|
306 |
+
preproc: PreprocResponse,
|
307 |
+
request: Request,
|
308 |
+
draft_request: Optional[DraftRequest] = None,
|
309 |
+
multimodal_enc_response: Optional[MultimodalEncResponse] = None
|
310 |
+
) -> Generator[GenerationResponse, None, None]:
|
311 |
+
input_tensors = self._get_llm_tensors(
|
312 |
+
preproc,
|
313 |
+
request,
|
314 |
+
None,
|
315 |
+
draft_request,
|
316 |
+
multimodal_enc_response=multimodal_enc_response)
|
317 |
+
triton_req = pb_utils.InferenceRequest(
|
318 |
+
model_name=self.llm_model_name,
|
319 |
+
inputs=input_tensors,
|
320 |
+
requested_output_names=self._llm_outputs)
|
321 |
+
for r in self._exec_triton_request(triton_req):
|
322 |
+
yield self._get_llm_response(r)
|
323 |
+
|
324 |
+
@override
|
325 |
+
def _generate_non_streaming(
|
326 |
+
self,
|
327 |
+
preproc: PreprocResponse,
|
328 |
+
request: Request,
|
329 |
+
draft_request: Optional[DraftRequest] = None,
|
330 |
+
multimodal_enc_response: Optional[MultimodalEncResponse] = None
|
331 |
+
) -> GenerationResponse:
|
332 |
+
input_tensors = self._get_llm_tensors(
|
333 |
+
preproc,
|
334 |
+
request,
|
335 |
+
None,
|
336 |
+
draft_request,
|
337 |
+
multimodal_enc_response=multimodal_enc_response)
|
338 |
+
triton_req = pb_utils.InferenceRequest(
|
339 |
+
model_name=self.llm_model_name,
|
340 |
+
inputs=input_tensors,
|
341 |
+
requested_output_names=self._llm_outputs)
|
342 |
+
r = self._exec_triton_request_single(triton_req)
|
343 |
+
return self._get_llm_response(r)
|
344 |
+
|
345 |
+
def _get_llm_tensors(
|
346 |
+
self,
|
347 |
+
preproc: PreprocResponse,
|
348 |
+
request: Request,
|
349 |
+
num_output_tokens: Optional[int] = None,
|
350 |
+
draft_request: Optional[DraftRequest] = None,
|
351 |
+
is_draft_model_request: bool = False,
|
352 |
+
multimodal_enc_response: MultimodalEncResponse = None):
|
353 |
+
tensors = []
|
354 |
+
tensors.extend(self._get_tensors_from_preproc(preproc))
|
355 |
+
if multimodal_enc_response is not None:
|
356 |
+
tensors.extend(
|
357 |
+
self._get_tensors_from_multimodal_enc(multimodal_enc_response))
|
358 |
+
tensors.extend(
|
359 |
+
self._get_llm_tensors_from_request(request, num_output_tokens,
|
360 |
+
draft_request,
|
361 |
+
is_draft_model_request))
|
362 |
+
return tensors
|
363 |
+
|
364 |
+
def _get_tensors_from_preproc(self, preproc: PreprocResponse):
|
365 |
+
name_map = {
|
366 |
+
"input_ids": "input_ids",
|
367 |
+
"decoder_input_ids": "decoder_input_ids",
|
368 |
+
"input_lengths": "input_lengths",
|
369 |
+
"bad_words_list": "bad_words_list",
|
370 |
+
"stop_words_list": "stop_words_list",
|
371 |
+
"embedding_bias": "embedding_bias",
|
372 |
+
"pad_id": "pad_id",
|
373 |
+
"end_id": "end_id",
|
374 |
+
}
|
375 |
+
return self.create_triton_tensors(preproc, name_map)
|
376 |
+
|
377 |
+
def _get_tensors_from_multimodal_enc(
|
378 |
+
self, multimodal_enc_response: MultimodalEncResponse):
|
379 |
+
name_map = {
|
380 |
+
"prompt_embedding_table": "prompt_embedding_table",
|
381 |
+
"prompt_vocab_size": "prompt_vocab_size",
|
382 |
+
}
|
383 |
+
return self.create_triton_tensors(multimodal_enc_response, name_map)
|
384 |
+
|
385 |
+
def _get_llm_tensors_from_request(
|
386 |
+
self,
|
387 |
+
request: Request,
|
388 |
+
num_output_tokens: Optional[int] = None,
|
389 |
+
draft_request: Optional[DraftRequest] = None,
|
390 |
+
is_draft_model_request: bool = False):
|
391 |
+
name_map: Dict[str, Optional[str]] = {
|
392 |
+
"beam_width": "beam_width",
|
393 |
+
"top_k": "runtime_top_k",
|
394 |
+
"top_p": "runtime_top_p",
|
395 |
+
"temperature": "temperature",
|
396 |
+
"length_penalty": "len_penalty",
|
397 |
+
"repetition_penalty": "repetition_penalty",
|
398 |
+
"min_length": "min_length",
|
399 |
+
"presence_penalty": "presence_penalty",
|
400 |
+
"frequency_penalty": "frequency_penalty",
|
401 |
+
"random_seed": "random_seed",
|
402 |
+
"return_log_probs": "return_log_probs",
|
403 |
+
"stream": "streaming",
|
404 |
+
"prompt_embedding_table": "prompt_embedding_table",
|
405 |
+
"prompt_vocab_size": "prompt_vocab_size",
|
406 |
+
}
|
407 |
+
batch_size = request.text_input.shape[0]
|
408 |
+
tensors = self.create_triton_tensors(request, name_map)
|
409 |
+
out_len_tensor = None
|
410 |
+
if request.max_tokens is not None:
|
411 |
+
out_len_tensor = request.max_tokens
|
412 |
+
|
413 |
+
out_len = None
|
414 |
+
if num_output_tokens is not None:
|
415 |
+
out_len = num_output_tokens
|
416 |
+
elif draft_request:
|
417 |
+
out_len = len(
|
418 |
+
draft_request.draft_input_ids[0]
|
419 |
+
) + 1 if draft_request.draft_input_ids is not None else 1
|
420 |
+
|
421 |
+
if out_len is not None:
|
422 |
+
out_len_tensor = [[out_len]] * batch_size
|
423 |
+
|
424 |
+
if out_len_tensor is None:
|
425 |
+
raise Exception("Could not determine request_output_len")
|
426 |
+
else:
|
427 |
+
tensors.append(
|
428 |
+
pb_utils.Tensor("request_output_len",
|
429 |
+
np.array(out_len_tensor, dtype=np.int32)))
|
430 |
+
|
431 |
+
if draft_request:
|
432 |
+
if draft_request.draft_input_ids is not None:
|
433 |
+
tensors.append(
|
434 |
+
pb_utils.Tensor("draft_input_ids",
|
435 |
+
draft_request.draft_input_ids))
|
436 |
+
if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[
|
437 |
+
0]:
|
438 |
+
tensors.append(
|
439 |
+
pb_utils.Tensor("draft_logits",
|
440 |
+
draft_request.draft_logits))
|
441 |
+
|
442 |
+
return_context_logits_data = [False]
|
443 |
+
return_generation_logits_data = [False]
|
444 |
+
if draft_request is None:
|
445 |
+
if is_draft_model_request:
|
446 |
+
return_generation_logits_data = request.use_draft_logits if request.use_draft_logits is not None else [
|
447 |
+
False
|
448 |
+
]
|
449 |
+
else:
|
450 |
+
return_context_logits_data = request.return_context_logits if request.return_context_logits is not None else [
|
451 |
+
False
|
452 |
+
]
|
453 |
+
return_generation_logits_data = request.return_generation_logits if request.return_generation_logits is not None else [
|
454 |
+
False
|
455 |
+
]
|
456 |
+
return_context_logits = np.array([return_context_logits_data] *
|
457 |
+
batch_size,
|
458 |
+
dtype=bool)
|
459 |
+
return_generation_logits = np.array([return_generation_logits_data] *
|
460 |
+
batch_size,
|
461 |
+
dtype=bool)
|
462 |
+
|
463 |
+
assert len(return_context_logits.shape) == 2
|
464 |
+
assert len(return_generation_logits.shape) == 2
|
465 |
+
|
466 |
+
tensors.append(
|
467 |
+
pb_utils.Tensor("return_context_logits", return_context_logits))
|
468 |
+
tensors.append(
|
469 |
+
pb_utils.Tensor("return_generation_logits",
|
470 |
+
return_generation_logits))
|
471 |
+
return tensors
|
472 |
+
|
473 |
+
def _get_llm_response(self, triton_output):
|
474 |
+
name_map = {
|
475 |
+
"output_ids": "output_ids",
|
476 |
+
"sequence_length": "sequence_length",
|
477 |
+
"cum_log_probs": "cum_log_probs",
|
478 |
+
"output_log_probs": "output_log_probs",
|
479 |
+
"context_logits": "context_logits",
|
480 |
+
"generation_logits": "generation_logits",
|
481 |
+
"batch_index": "batch_index",
|
482 |
+
}
|
483 |
+
return self.convert_triton_response(triton_output, GenerationResponse,
|
484 |
+
name_map)
|
485 |
+
|
486 |
+
def _postprocess(self, tokens: np.ndarray,
|
487 |
+
sequence_lengths: Optional[np.ndarray],
|
488 |
+
gen_response: GenerationResponse) -> Response:
|
489 |
+
input_tensors = self._get_postproc_tensors(tokens, sequence_lengths,
|
490 |
+
gen_response)
|
491 |
+
triton_req = pb_utils.InferenceRequest(
|
492 |
+
model_name=self.postproc_model_name,
|
493 |
+
inputs=input_tensors,
|
494 |
+
requested_output_names=self._postproc_outputs)
|
495 |
+
r = self._exec_triton_request_single(triton_req)
|
496 |
+
response = self._get_response(r, gen_response)
|
497 |
+
return response
|
498 |
+
|
499 |
+
def _get_postproc_tensors(self, tokens: np.ndarray,
|
500 |
+
sequence_lengths: Optional[np.ndarray],
|
501 |
+
gen_response: GenerationResponse):
|
502 |
+
tensors = [
|
503 |
+
pb_utils.Tensor("TOKENS_BATCH", tokens),
|
504 |
+
pb_utils.Tensor(
|
505 |
+
"SEQUENCE_LENGTH", sequence_lengths
|
506 |
+
if sequence_lengths is not None else gen_response.sequence_length)
|
507 |
+
]
|
508 |
+
return tensors
|
509 |
+
|
510 |
+
def _get_response(self, triton_output, gen_res: GenerationResponse):
|
511 |
+
tensors = triton_output.output_tensors()
|
512 |
+
t_map = {}
|
513 |
+
for named_t in tensors:
|
514 |
+
name = named_t.name()
|
515 |
+
t = named_t.as_numpy()
|
516 |
+
t_map[name] = t
|
517 |
+
response = Response(text_output=t_map["OUTPUT"],
|
518 |
+
cum_log_probs=gen_res.cum_log_probs,
|
519 |
+
output_log_probs=gen_res.output_log_probs,
|
520 |
+
context_logits=gen_res.context_logits,
|
521 |
+
generation_logits=gen_res.generation_logits,
|
522 |
+
batch_index=gen_res.batch_index)
|
523 |
+
return response
|
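The scalar sampling parameters in the tensorrt_llm config above are declared with `reshape: { shape: [ ] }`, so inside the BLS they can show up as 1-D arrays without a batch dimension; `__undo_reshape` re-adds that dimension for whitelisted names before the tensors are forwarded. Below is a standalone restatement of that rule in plain numpy with a trimmed whitelist, just to make the shape handling concrete; it is a sketch, not the class itself.

import numpy as np

# Trimmed, illustrative copy of the whitelist used in TritonDecoder.
UNDO_RESHAPE_WHITELIST = {"temperature", "top_k", "top_p", "beam_width"}


def undo_reshape(value: np.ndarray, name: str) -> np.ndarray:
    # Re-add the batch dimension that `reshape: { shape: [ ] }` stripped.
    if name in UNDO_RESHAPE_WHITELIST and len(value.shape) == 1:
        return np.expand_dims(value, 0)
    return value


print(undo_reshape(np.array([0.7], dtype=np.float32), "temperature").shape)    # (1, 1)
print(undo_reshape(np.array([[0.7]], dtype=np.float32), "temperature").shape)  # (1, 1), already batched
print(undo_reshape(np.array([1, 2, 3], dtype=np.int32), "input_ids").shape)    # (3,), not whitelisted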
tensorrt_llm_bls/1/model.py
ADDED
@@ -0,0 +1,145 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import traceback

import triton_python_backend_utils as pb_utils
from lib.triton_decoder import TritonDecoder


def get_valid_param_value(param, default_value=''):
    value = param.get('string_value', '')
    return default_value if value.startswith('${') or value == '' else value


class TritonPythonModel:

    def initialize(self, args):

        # Parse model configs
        model_config = json.loads(args['model_config'])

        params = model_config['parameters']

        accumulate_tokens_str = get_valid_param_value(
            params.get('accumulate_tokens', {}))
        self.accumulate_tokens = accumulate_tokens_str.lower() in [
            'true', 'yes', '1', 't'
        ]

        self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
            model_config)

        self.logger = pb_utils.Logger

        default_tensorrt_llm_model_name = 'tensorrt_llm'
        self.llm_model_name = get_valid_param_value(
            params.get('tensorrt_llm_model_name', {}),
            default_tensorrt_llm_model_name)

        self.draft_llm_model_name = get_valid_param_value(
            params.get('tensorrt_llm_draft_model_name', {}), None)

        self.multimodal_encoders_name = get_valid_param_value(
            params.get('multimodal_encoders_name', {}), None)

        self.decoder = TritonDecoder(
            streaming=self.decoupled,
            accumulate=self.accumulate_tokens,
            preproc_model_name="preprocessing",
            postproc_model_name="postprocessing",
            llm_model_name=self.llm_model_name,
            draft_llm_model_name=self.draft_llm_model_name,
            multimodal_encoders_name=self.multimodal_encoders_name)

    def execute(self, requests):

        responses = []

        for request in requests:
            if self.decoupled:
                response_sender = request.get_response_sender()
            try:

                req = self.decoder.convert_triton_request(request)
                req.validate()
                speculative_decode = (req.num_draft_tokens is not None
                                      and req.num_draft_tokens[0][0] > 0)
                if speculative_decode and (self.draft_llm_model_name is None
                                           or self.draft_llm_model_name == ""):
                    raise Exception(
                        "cannot perform speculative decoding without draft model"
                    )
                is_multimodal = req.image_input is not None

                if speculative_decode and is_multimodal:
                    raise Exception(
                        "Multimodal and speculative decoding is not currently supported"
                    )
                res_gen = self.decoder.decode(
                    req,
                    speculative_decoding=speculative_decode,
                    is_multimodal=is_multimodal)

                for res in res_gen:
                    triton_response = self.decoder.create_triton_response(res)
                    if self.decoupled:
                        response_sender.send(triton_response)
                    else:
                        responses.append(triton_response)

                if self.decoupled:
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

            except Exception:
                self.logger.log_error(traceback.format_exc())
                # If encountering an error, send a response with err msg
                error_response = pb_utils.InferenceResponse(
                    output_tensors=[],
                    error=pb_utils.TritonError(traceback.format_exc()))

                if self.decoupled:
                    response_sender.send(error_response)
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
                else:
                    responses.append(error_response)

            self.decoder.reset_decoder()
        if self.decoupled:
            return None
        else:
            assert len(responses) == len(requests)
            return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
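A note on `get_valid_param_value` above: the `parameters` block of config.pbtxt reaches `initialize` as JSON inside `args['model_config']`, and any value still containing an unfilled `${...}` template is treated as unset. A minimal, self-contained sketch follows; the parameter values are illustrative, and the helper is copied verbatim so the snippet runs on its own.

# Sketch (illustrative values): how config.pbtxt parameters surface in
# initialize() and why "${...}" placeholders fall back to the default.
import json

sample_args = {
    'model_config':
    json.dumps({
        'parameters': {
            'accumulate_tokens': {'string_value': 'true'},
            'tensorrt_llm_model_name': {'string_value': 'tensorrt_llm'},
            # Left as a template because it was never filled in.
            'tensorrt_llm_draft_model_name': {
                'string_value': '${tensorrt_llm_draft_model_name}'
            },
        }
    })
}

params = json.loads(sample_args['model_config'])['parameters']


def get_valid_param_value(param, default_value=''):
    value = param.get('string_value', '')
    return default_value if value.startswith('${') or value == '' else value


print(get_valid_param_value(params.get('accumulate_tokens', {})))      # 'true'
print(get_valid_param_value(params.get('tensorrt_llm_draft_model_name',
                                       {}), None))                     # None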
tensorrt_llm_bls/config.pbtxt
ADDED
@@ -0,0 +1,270 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "tensorrt_llm_bls"
backend: "python"
max_batch_size: 32

model_transaction_policy {
  decoupled: True
}

input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
  },
  {
    name: "decoder_text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
    optional: true
  },
  {
    name: "image_input"
    data_type: TYPE_FP16
    dims: [ 3, -1, -1 ]
    optional: true
  },
  {
    name: "max_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "bad_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "stop_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "frequency_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_context_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_generation_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "prompt_embedding_table"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "prompt_vocab_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "embedding_bias_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "embedding_bias_weights"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "num_draft_tokens",
    data_type: TYPE_INT32,
    dims: [ 1 ]
    optional: true
  },
  {
    name: "use_draft_logits",
    data_type: TYPE_BOOL,
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  }
]
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "generation_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1, -1 ]
  },
  {
    name: "batch_index"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
]

parameters: {
  key: "accumulate_tokens"
  value: {
    string_value: "${accumulate_tokens}"
  }
}
parameters: {
  key: "tensorrt_llm_model_name"
  value: {
    string_value: "tensorrt_llm"
  }
}
parameters: {
  key: "tensorrt_llm_draft_model_name"
  value: {
    string_value: "${tensorrt_llm_draft_model_name}"
  }
}
parameters: {
  key: "multimodal_encoders_name"
  value: {
    string_value: "${multimodal_encoders_name}"
  }
}

instance_group [
  {
    count: 1
    kind : KIND_CPU
  }
]
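Because `model_transaction_policy { decoupled: True }` is set, this model is normally exercised through Triton's streaming gRPC API rather than a plain HTTP infer call. Below is a minimal client sketch under assumed defaults: the gRPC endpoint `localhost:8001`, the prompt text, and the 64-token budget are placeholders and not part of this upload; only the tensor names, shapes, and dtypes come from the config above.

# Minimal streaming client sketch for the tensorrt_llm_bls model.
import queue
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient


def callback(result_queue, result, error):
    # Each streamed response (or error) lands in the queue.
    result_queue.put(error if error else result)


result_queue = queue.Queue()
client = grpcclient.InferenceServerClient(url="localhost:8001")  # assumed port

# max_batch_size is 32, so every tensor carries a leading batch dimension.
text_input = grpcclient.InferInput("text_input", [1, 1], "BYTES")
text_input.set_data_from_numpy(np.array([["What is Triton?"]], dtype=object))

max_tokens = grpcclient.InferInput("max_tokens", [1, 1], "INT32")
max_tokens.set_data_from_numpy(np.array([[64]], dtype=np.int32))

stream = grpcclient.InferInput("stream", [1, 1], "BOOL")
stream.set_data_from_numpy(np.array([[True]], dtype=bool))

client.start_stream(callback=partial(callback, result_queue))
client.async_stream_infer("tensorrt_llm_bls",
                          [text_input, max_tokens, stream])
client.stop_stream()  # blocks until all streamed responses are delivered

while not result_queue.empty():
    item = result_queue.get()
    if isinstance(item, Exception):
        raise item
    print(item.as_numpy("text_output"))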