otherhalf-dev committed
Commit 7567662 · verified · 1 Parent(s): f6f26d0

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tensorrt_llm/1/rank0.engine filter=lfs diff=lfs merge=lfs -text
+ tensorrt_llm/1/rank1.engine filter=lfs diff=lfs merge=lfs -text
ensemble/1/.tmp ADDED
File without changes
ensemble/config.pbtxt ADDED
@@ -0,0 +1,487 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "ensemble"
28
+ platform: "ensemble"
29
+ max_batch_size: 32
30
+ input [
31
+ {
32
+ name: "text_input"
33
+ data_type: TYPE_STRING
34
+ dims: [ 1 ]
35
+ },
36
+ {
37
+ name: "decoder_text_input"
38
+ data_type: TYPE_STRING
39
+ dims: [ 1 ]
40
+ optional: true
41
+ },
42
+ {
43
+ name: "max_tokens"
44
+ data_type: TYPE_INT32
45
+ dims: [ 1 ]
46
+ },
47
+ {
48
+ name: "bad_words"
49
+ data_type: TYPE_STRING
50
+ dims: [ -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "stop_words"
55
+ data_type: TYPE_STRING
56
+ dims: [ -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "end_id"
61
+ data_type: TYPE_INT32
62
+ dims: [ 1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "pad_id"
67
+ data_type: TYPE_INT32
68
+ dims: [ 1 ]
69
+ optional: true
70
+ },
71
+ {
72
+ name: "top_k"
73
+ data_type: TYPE_INT32
74
+ dims: [ 1 ]
75
+ optional: true
76
+ },
77
+ {
78
+ name: "top_p"
79
+ data_type: TYPE_FP32
80
+ dims: [ 1 ]
81
+ optional: true
82
+ },
83
+ {
84
+ name: "temperature"
85
+ data_type: TYPE_FP32
86
+ dims: [ 1 ]
87
+ optional: true
88
+ },
89
+ {
90
+ name: "length_penalty"
91
+ data_type: TYPE_FP32
92
+ dims: [ 1 ]
93
+ optional: true
94
+ },
95
+ {
96
+ name: "repetition_penalty"
97
+ data_type: TYPE_FP32
98
+ dims: [ 1 ]
99
+ optional: true
100
+ },
101
+ {
102
+ name: "min_length"
103
+ data_type: TYPE_INT32
104
+ dims: [ 1 ]
105
+ optional: true
106
+ },
107
+ {
108
+ name: "presence_penalty"
109
+ data_type: TYPE_FP32
110
+ dims: [ 1 ]
111
+ optional: true
112
+ },
113
+ {
114
+ name: "frequency_penalty"
115
+ data_type: TYPE_FP32
116
+ dims: [ 1 ]
117
+ optional: true
118
+ },
119
+ {
120
+ name: "random_seed"
121
+ data_type: TYPE_UINT64
122
+ dims: [ 1 ]
123
+ optional: true
124
+ },
125
+ {
126
+ name: "return_log_probs"
127
+ data_type: TYPE_BOOL
128
+ dims: [ 1 ]
129
+ optional: true
130
+ },
131
+ {
132
+ name: "return_context_logits"
133
+ data_type: TYPE_BOOL
134
+ dims: [ 1 ]
135
+ optional: true
136
+ },
137
+ {
138
+ name: "return_generation_logits"
139
+ data_type: TYPE_BOOL
140
+ dims: [ 1 ]
141
+ optional: true
142
+ },
143
+ {
144
+ name: "beam_width"
145
+ data_type: TYPE_INT32
146
+ dims: [ 1 ]
147
+ optional: true
148
+ },
149
+ {
150
+ name: "stream"
151
+ data_type: TYPE_BOOL
152
+ dims: [ 1 ]
153
+ optional: true
154
+ },
155
+ {
156
+ name: "prompt_embedding_table"
157
+ data_type: TYPE_FP16
158
+ dims: [ -1, -1 ]
159
+ optional: true
160
+ },
161
+ {
162
+ name: "prompt_vocab_size"
163
+ data_type: TYPE_INT32
164
+ dims: [ 1 ]
165
+ optional: true
166
+ },
167
+ {
168
+ name: "embedding_bias_words"
169
+ data_type: TYPE_STRING
170
+ dims: [ -1 ]
171
+ optional: true
172
+ },
173
+ {
174
+ name: "embedding_bias_weights"
175
+ data_type: TYPE_FP32
176
+ dims: [ -1 ]
177
+ optional: true
178
+ }
179
+ ]
180
+ output [
181
+ {
182
+ name: "text_output"
183
+ data_type: TYPE_STRING
184
+ dims: [ -1 ]
185
+ },
186
+ {
187
+ name: "cum_log_probs"
188
+ data_type: TYPE_FP32
189
+ dims: [ -1 ]
190
+ },
191
+ {
192
+ name: "output_log_probs"
193
+ data_type: TYPE_FP32
194
+ dims: [ -1, -1 ]
195
+ },
196
+ {
197
+ name: "context_logits"
198
+ data_type: TYPE_FP32
199
+ dims: [ -1, -1 ]
200
+ },
201
+ {
202
+ name: "generation_logits"
203
+ data_type: TYPE_FP32
204
+ dims: [ -1, -1, -1 ]
205
+ },
206
+ {
207
+ name: "batch_index"
208
+ data_type: TYPE_INT32
209
+ dims: [ 1 ]
210
+ }
211
+ ]
212
+ ensemble_scheduling {
213
+ step [
214
+ {
215
+ model_name: "preprocessing"
216
+ model_version: -1
217
+ input_map {
218
+ key: "QUERY"
219
+ value: "text_input"
220
+ }
221
+ input_map {
222
+ key: "DECODER_QUERY"
223
+ value: "decoder_text_input"
224
+ }
225
+ input_map {
226
+ key: "REQUEST_OUTPUT_LEN"
227
+ value: "max_tokens"
228
+ }
229
+ input_map {
230
+ key: "BAD_WORDS_DICT"
231
+ value: "bad_words"
232
+ }
233
+ input_map {
234
+ key: "STOP_WORDS_DICT"
235
+ value: "stop_words"
236
+ }
237
+ input_map {
238
+ key: "EMBEDDING_BIAS_WORDS"
239
+ value: "embedding_bias_words"
240
+ }
241
+ input_map {
242
+ key: "EMBEDDING_BIAS_WEIGHTS"
243
+ value: "embedding_bias_weights"
244
+ }
245
+ input_map {
246
+ key: "END_ID"
247
+ value: "end_id"
248
+ }
249
+ input_map {
250
+ key: "PAD_ID"
251
+ value: "pad_id"
252
+ }
253
+ output_map {
254
+ key: "REQUEST_INPUT_LEN"
255
+ value: "_REQUEST_INPUT_LEN"
256
+ }
257
+ output_map {
258
+ key: "INPUT_ID"
259
+ value: "_INPUT_ID"
260
+ }
261
+ output_map {
262
+ key: "REQUEST_DECODER_INPUT_LEN"
263
+ value: "_REQUEST_DECODER_INPUT_LEN"
264
+ }
265
+ output_map {
266
+ key: "DECODER_INPUT_ID"
267
+ value: "_DECODER_INPUT_ID"
268
+ }
269
+ output_map {
270
+ key: "REQUEST_OUTPUT_LEN"
271
+ value: "_REQUEST_OUTPUT_LEN"
272
+ }
273
+ output_map {
274
+ key: "STOP_WORDS_IDS"
275
+ value: "_STOP_WORDS_IDS"
276
+ }
277
+ output_map {
278
+ key: "BAD_WORDS_IDS"
279
+ value: "_BAD_WORDS_IDS"
280
+ }
281
+ output_map {
282
+ key: "EMBEDDING_BIAS"
283
+ value: "_EMBEDDING_BIAS"
284
+ }
285
+ output_map {
286
+ key: "OUT_END_ID"
287
+ value: "_PREPROCESSOR_END_ID"
288
+ }
289
+ output_map {
290
+ key: "OUT_PAD_ID"
291
+ value: "_PREPROCESSOR_PAD_ID"
292
+ }
293
+ },
294
+ {
295
+ model_name: "tensorrt_llm"
296
+ model_version: -1
297
+ input_map {
298
+ key: "input_ids"
299
+ value: "_INPUT_ID"
300
+ }
301
+ input_map {
302
+ key: "decoder_input_ids"
303
+ value: "_DECODER_INPUT_ID"
304
+ }
305
+ input_map {
306
+ key: "input_lengths"
307
+ value: "_REQUEST_INPUT_LEN"
308
+ }
309
+ input_map {
310
+ key: "decoder_input_lengths"
311
+ value: "_REQUEST_DECODER_INPUT_LEN"
312
+ }
313
+ input_map {
314
+ key: "request_output_len"
315
+ value: "_REQUEST_OUTPUT_LEN"
316
+ }
317
+ input_map {
318
+ key: "end_id"
319
+ value: "_PREPROCESSOR_END_ID"
320
+ }
321
+ input_map {
322
+ key: "pad_id"
323
+ value: "_PREPROCESSOR_PAD_ID"
324
+ }
325
+ input_map {
326
+ key: "embedding_bias"
327
+ value: "_EMBEDDING_BIAS"
328
+ }
329
+ input_map {
330
+ key: "runtime_top_k"
331
+ value: "top_k"
332
+ }
333
+ input_map {
334
+ key: "runtime_top_p"
335
+ value: "top_p"
336
+ }
337
+ input_map {
338
+ key: "temperature"
339
+ value: "temperature"
340
+ }
341
+ input_map {
342
+ key: "len_penalty"
343
+ value: "length_penalty"
344
+ }
345
+ input_map {
346
+ key: "repetition_penalty"
347
+ value: "repetition_penalty"
348
+ }
349
+ input_map {
350
+ key: "min_length"
351
+ value: "min_length"
352
+ }
353
+ input_map {
354
+ key: "presence_penalty"
355
+ value: "presence_penalty"
356
+ }
357
+ input_map {
358
+ key: "frequency_penalty"
359
+ value: "frequency_penalty"
360
+ }
361
+ input_map {
362
+ key: "random_seed"
363
+ value: "random_seed"
364
+ }
365
+ input_map {
366
+ key: "return_log_probs"
367
+ value: "return_log_probs"
368
+ }
369
+ input_map {
370
+ key: "return_context_logits"
371
+ value: "return_context_logits"
372
+ }
373
+ input_map {
374
+ key: "return_generation_logits"
375
+ value: "return_generation_logits"
376
+ }
377
+ input_map {
378
+ key: "beam_width"
379
+ value: "beam_width"
380
+ }
381
+ input_map {
382
+ key: "streaming"
383
+ value: "stream"
384
+ }
385
+ input_map {
386
+ key: "prompt_embedding_table"
387
+ value: "prompt_embedding_table"
388
+ }
389
+ input_map {
390
+ key: "prompt_vocab_size"
391
+ value: "prompt_vocab_size"
392
+ }
393
+ input_map {
394
+ key: "stop_words_list"
395
+ value: "_STOP_WORDS_IDS"
396
+ }
397
+ input_map {
398
+ key: "bad_words_list"
399
+ value: "_BAD_WORDS_IDS"
400
+ }
401
+ output_map {
402
+ key: "output_ids"
403
+ value: "_TOKENS_BATCH"
404
+ }
405
+ output_map {
406
+ key: "sequence_length"
407
+ value: "_SEQUENCE_LENGTH"
408
+ },
409
+ output_map {
410
+ key: "cum_log_probs"
411
+ value: "_CUM_LOG_PROBS"
412
+ }
413
+ output_map {
414
+ key: "output_log_probs"
415
+ value: "_OUTPUT_LOG_PROBS"
416
+ },
417
+ output_map {
418
+ key: "context_logits"
419
+ value: "_CONTEXT_LOGITS"
420
+ },
421
+ output_map {
422
+ key: "generation_logits"
423
+ value: "_GENERATION_LOGITS"
424
+ },
425
+ output_map {
426
+ key: "batch_index"
427
+ value: "_BATCH_INDEX"
428
+ }
429
+ },
430
+ {
431
+ model_name: "postprocessing"
432
+ model_version: -1
433
+ input_map {
434
+ key: "TOKENS_BATCH"
435
+ value: "_TOKENS_BATCH"
436
+ }
437
+ input_map {
438
+ key: "CUM_LOG_PROBS"
439
+ value: "_CUM_LOG_PROBS"
440
+ }
441
+ input_map {
442
+ key: "OUTPUT_LOG_PROBS"
443
+ value: "_OUTPUT_LOG_PROBS"
444
+ }
445
+ input_map {
446
+ key: "CONTEXT_LOGITS"
447
+ value: "_CONTEXT_LOGITS"
448
+ }
449
+ input_map {
450
+ key: "GENERATION_LOGITS"
451
+ value: "_GENERATION_LOGITS"
452
+ }
453
+ input_map {
454
+ key: "SEQUENCE_LENGTH"
455
+ value: "_SEQUENCE_LENGTH"
456
+ }
457
+ input_map {
458
+ key: "BATCH_INDEX"
459
+ value: "_BATCH_INDEX"
460
+ }
461
+ output_map {
462
+ key: "OUTPUT"
463
+ value: "text_output"
464
+ }
465
+ output_map {
466
+ key: "OUT_OUTPUT_LOG_PROBS"
467
+ value: "output_log_probs"
468
+ }
469
+ output_map {
470
+ key: "OUT_CUM_LOG_PROBS"
471
+ value: "cum_log_probs"
472
+ }
473
+ output_map {
474
+ key: "OUT_CONTEXT_LOGITS"
475
+ value: "context_logits"
476
+ }
477
+ output_map {
478
+ key: "OUT_GENERATION_LOGITS"
479
+ value: "generation_logits"
480
+ }
481
+ output_map {
482
+ key: "OUT_BATCH_INDEX"
483
+ value: "batch_index"
484
+ }
485
+ }
486
+ ]
487
+ }
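
The config above defines the client-facing "ensemble" pipeline (preprocessing -> tensorrt_llm -> postprocessing). As a rough illustration only, here is a minimal sketch of a request against it with Triton's Python HTTP client; the server URL and the prompt are assumptions, and the optional sampling tensors (top_k, temperature, ...) can be attached the same way.

# Minimal request sketch against the "ensemble" model above.
# Assumes a running Triton server reachable at localhost:8000 (URL is an assumption).
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

def build_input(name, data, dtype):
    # Every shape carries a leading batch dimension of 1 (max_batch_size: 32 above).
    tensor = httpclient.InferInput(name, list(data.shape), dtype)
    tensor.set_data_from_numpy(data)
    return tensor

inputs = [
    build_input("text_input", np.array([["What is Triton?"]], dtype=object), "BYTES"),
    build_input("max_tokens", np.array([[128]], dtype=np.int32), "INT32"),
    build_input("stream", np.array([[False]], dtype=bool), "BOOL"),
]

result = client.infer(model_name="ensemble", inputs=inputs)
print(result.as_numpy("text_output"))
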
postprocessing/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (5.61 kB).
 
postprocessing/1/model.py ADDED
@@ -0,0 +1,250 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+
29
+ import numpy as np
30
+ import triton_python_backend_utils as pb_utils
31
+ from transformers import AutoTokenizer
32
+
33
+
34
+ class TritonPythonModel:
35
+ """Your Python model must use the same class name. Every Python model
36
+ that is created must have "TritonPythonModel" as the class name.
37
+ """
38
+
39
+ def initialize(self, args):
40
+ """`initialize` is called only once when the model is being loaded.
41
+ Implementing `initialize` function is optional. This function allows
42
+ the model to initialize any state associated with this model.
43
+ Parameters
44
+ ----------
45
+ args : dict
46
+ Both keys and values are strings. The dictionary keys and values are:
47
+ * model_config: A JSON string containing the model configuration
48
+ * model_instance_kind: A string containing model instance kind
49
+ * model_instance_device_id: A string containing model instance device ID
50
+ * model_repository: Model repository path
51
+ * model_version: Model version
52
+ * model_name: Model name
53
+ """
54
+ # Parse model configs
55
+ model_config = json.loads(args['model_config'])
56
+ tokenizer_dir = model_config['parameters']['tokenizer_dir'][
57
+ 'string_value']
58
+
59
+ skip_special_tokens = model_config['parameters'].get(
60
+ 'skip_special_tokens')
61
+ if skip_special_tokens is not None:
62
+ skip_special_tokens_str = skip_special_tokens[
63
+ 'string_value'].lower()
64
+ if skip_special_tokens_str in [
65
+ 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
66
+ ]:
67
+ self.skip_special_tokens = skip_special_tokens_str in [
68
+ 'true', '1', 't', 'y', 'yes'
69
+ ]
70
+ else:
71
+ print(
72
+ f"[TensorRT-LLM][WARNING] Invalid value for 'skip_special_tokens' (got {skip_special_tokens['string_value']}); defaulting to True."
73
+ )
74
+ self.skip_special_tokens = True
75
+ else:
76
+ print(
77
+ "[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set; defaulting to True."
78
+ )
79
+ self.skip_special_tokens = True
80
+
81
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
82
+ legacy=False,
83
+ padding_side='left',
84
+ trust_remote_code=True)
85
+ if not self.tokenizer.pad_token:
86
+ self.tokenizer.pad_token = self.tokenizer.eos_token
87
+
88
+ # Parse model output configs
89
+ output_config = pb_utils.get_output_config_by_name(
90
+ model_config, "OUTPUT")
91
+
92
+ # Convert Triton types to numpy types
93
+ self.output_dtype = pb_utils.triton_string_to_numpy(
94
+ output_config['data_type'])
95
+
96
+ def execute(self, requests):
97
+ """`execute` must be implemented in every Python model. `execute`
98
+ function receives a list of pb_utils.InferenceRequest as the only
99
+ argument. This function is called when an inference is requested
100
+ for this model. Depending on the batching configuration (e.g. Dynamic
101
+ Batching) used, `requests` may contain multiple requests. Every
102
+ Python model must create one pb_utils.InferenceResponse for every
103
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
104
+ set the error argument when creating a pb_utils.InferenceResponse.
105
+ Parameters
106
+ ----------
107
+ requests : list
108
+ A list of pb_utils.InferenceRequest
109
+ Returns
110
+ -------
111
+ list
112
+ A list of pb_utils.InferenceResponse. The length of this list must
113
+ be the same as `requests`
114
+ """
115
+
116
+ responses = []
117
+
118
+ # Every Python backend must iterate over every one of the requests
119
+ # and create a pb_utils.InferenceResponse for each of them.
120
+ for idx, request in enumerate(requests):
121
+ # Get input tensors
122
+ tokens_batch = pb_utils.get_input_tensor_by_name(
123
+ request, 'TOKENS_BATCH').as_numpy()
124
+
125
+ # Get sequence length
126
+ sequence_lengths = pb_utils.get_input_tensor_by_name(
127
+ request, 'SEQUENCE_LENGTH').as_numpy()
128
+
129
+ # Get cum log probs
130
+ cum_log_probs = pb_utils.get_input_tensor_by_name(
131
+ request, 'CUM_LOG_PROBS')
132
+
133
+ # Get output log probs
134
+ output_log_probs = pb_utils.get_input_tensor_by_name(
135
+ request, 'OUTPUT_LOG_PROBS')
136
+
137
+ # Get context logits
138
+ context_logits = pb_utils.get_input_tensor_by_name(
139
+ request, 'CONTEXT_LOGITS')
140
+
141
+ # Get generation logits
142
+ generation_logits = pb_utils.get_input_tensor_by_name(
143
+ request, 'GENERATION_LOGITS')
144
+
145
+ # Get the batch index
146
+ batch_index = pb_utils.get_input_tensor_by_name(
147
+ request, 'BATCH_INDEX')
148
+
149
+ # Reshape Input
150
+ # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
151
+ # tokens_batch = tokens_batch.T
152
+
153
+ # Postprocessing output data.
154
+ outputs = self._postprocessing(tokens_batch, sequence_lengths)
155
+
156
+ # Create output tensors. You need pb_utils.Tensor
157
+ # objects to create pb_utils.InferenceResponse.
158
+ output_tensor = pb_utils.Tensor(
159
+ 'OUTPUT',
160
+ np.array(outputs).astype(self.output_dtype))
161
+
162
+ outputs = []
163
+ outputs.append(output_tensor)
164
+
165
+ if cum_log_probs:
166
+ out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
167
+ cum_log_probs.as_numpy())
168
+ outputs.append(out_cum_log_probs)
169
+ else:
170
+ out_cum_log_probs = pb_utils.Tensor(
171
+ 'OUT_CUM_LOG_PROBS', np.array([[0.0]], dtype=np.float32))
172
+ outputs.append(out_cum_log_probs)
173
+
174
+ if output_log_probs:
175
+ out_output_log_probs = pb_utils.Tensor(
176
+ 'OUT_OUTPUT_LOG_PROBS', output_log_probs.as_numpy())
177
+ outputs.append(out_output_log_probs)
178
+ else:
179
+ out_output_log_probs = pb_utils.Tensor(
180
+ 'OUT_OUTPUT_LOG_PROBS',
181
+ np.array([[[0.0]]], dtype=np.float32))
182
+ outputs.append(out_output_log_probs)
183
+
184
+ if context_logits:
185
+ out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
186
+ context_logits.as_numpy())
187
+ outputs.append(out_context_logits)
188
+ else:
189
+ out_context_logits = pb_utils.Tensor(
190
+ 'OUT_CONTEXT_LOGITS', np.array([[[0.0]]],
191
+ dtype=np.float32))
192
+ outputs.append(out_context_logits)
193
+
194
+ if generation_logits:
195
+ out_generation_logits = pb_utils.Tensor(
196
+ 'OUT_GENERATION_LOGITS', generation_logits.as_numpy())
197
+ outputs.append(out_generation_logits)
198
+ else:
199
+ out_generation_logits = pb_utils.Tensor(
200
+ 'OUT_GENERATION_LOGITS',
201
+ np.array([[[[0.0]]]], dtype=np.float32))
202
+ outputs.append(out_generation_logits)
203
+
204
+ if batch_index:
205
+ out_batch_index = pb_utils.Tensor('OUT_BATCH_INDEX',
206
+ batch_index.as_numpy())
207
+ outputs.append(out_batch_index)
208
+ else:
209
+ out_batch_index = pb_utils.Tensor(
210
+ 'OUT_BATCH_INDEX', np.array([[0]], dtype=np.int32))
211
+ outputs.append(out_batch_index)
212
+
213
+ # Create InferenceResponse. You can set an error here in case
214
+ # there was a problem with handling this inference request.
215
+ # Below is an example of how you can set errors in inference
216
+ # response:
217
+ #
218
+ # pb_utils.InferenceResponse(
219
+ # output_tensors=..., TritonError("An error occurred"))
220
+ inference_response = pb_utils.InferenceResponse(
221
+ output_tensors=outputs)
222
+ responses.append(inference_response)
223
+
224
+ # You should return a list of pb_utils.InferenceResponse. Length
225
+ # of this list must match the length of `requests` list.
226
+ return responses
227
+
228
+ def finalize(self):
229
+ """`finalize` is called only once when the model is being unloaded.
230
+ Implementing `finalize` function is optional. This function allows
231
+ the model to perform any necessary clean ups before exit.
232
+ """
233
+ print('Cleaning up...')
234
+
235
+ def _postprocessing(self, tokens_batch, sequence_lengths):
236
+ outputs = []
237
+ for batch_idx, beam_tokens in enumerate(tokens_batch):
238
+ for beam_idx, tokens in enumerate(beam_tokens):
239
+ seq_len = sequence_lengths[batch_idx][beam_idx]
240
+ # Exclude fake ids in multimodal models
241
+ fake_id_len = 0
242
+ for i in range(seq_len):
243
+ if tokens[i] < self.tokenizer.vocab_size:
244
+ fake_id_len = i
245
+ break
246
+ output = self.tokenizer.decode(
247
+ tokens[fake_id_len:seq_len],
248
+ skip_special_tokens=self.skip_special_tokens)
249
+ outputs.append(output.encode('utf8'))
250
+ return outputs
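
The _postprocessing loop above turns a [batch, beam, seq] token tensor into one decoded string per (batch, beam) pair: each row is trimmed to its sequence length, and leading out-of-vocabulary "fake" prompt ids (used by multimodal models) are skipped. A standalone sketch of that logic with a stand-in tokenizer, shown only to illustrate the shapes:

import numpy as np

class ToyTokenizer:
    # Stand-in for the AutoTokenizer loaded from tokenizer_dir; ids >= vocab_size
    # play the role of the "fake" multimodal prompt ids.
    vocab_size = 10

    def decode(self, ids, skip_special_tokens=True):
        return " ".join(str(int(i)) for i in ids)

tokenizer = ToyTokenizer()
tokens_batch = np.array([[[12, 3, 4, 0]], [[5, 6, 7, 0]]])  # [batch=2, beams=1, seq=4]
sequence_lengths = np.array([[3], [4]])                      # [batch, beams]

outputs = []
for batch_idx, beam_tokens in enumerate(tokens_batch):
    for beam_idx, tokens in enumerate(beam_tokens):
        seq_len = sequence_lengths[batch_idx][beam_idx]
        start = 0
        for i in range(seq_len):
            if tokens[i] < tokenizer.vocab_size:
                start = i
                break
        outputs.append(tokenizer.decode(tokens[start:seq_len]))

print(outputs)  # ['3 4', '5 6 7 0'] -- one string per (batch, beam) pair
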
postprocessing/config.pbtxt ADDED
@@ -0,0 +1,124 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "postprocessing"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+ input [
31
+ {
32
+ name: "TOKENS_BATCH"
33
+ data_type: TYPE_INT32
34
+ dims: [ -1, -1 ]
35
+ },
36
+ {
37
+ name: "SEQUENCE_LENGTH"
38
+ data_type: TYPE_INT32
39
+ dims: [ -1 ]
40
+ },
41
+ {
42
+ name: "CUM_LOG_PROBS"
43
+ data_type: TYPE_FP32
44
+ dims: [ -1 ]
45
+ optional: true
46
+ },
47
+ {
48
+ name: "OUTPUT_LOG_PROBS"
49
+ data_type: TYPE_FP32
50
+ dims: [ -1, -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "CONTEXT_LOGITS"
55
+ data_type: TYPE_FP32
56
+ dims: [ -1, -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "GENERATION_LOGITS"
61
+ data_type: TYPE_FP32
62
+ dims: [ -1, -1, -1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "BATCH_INDEX"
67
+ data_type: TYPE_INT32
68
+ dims: [ 1 ]
69
+ optional: true
70
+ }
71
+ ]
72
+ output [
73
+ {
74
+ name: "OUTPUT"
75
+ data_type: TYPE_STRING
76
+ dims: [ -1 ]
77
+ },
78
+ {
79
+ name: "OUT_CUM_LOG_PROBS"
80
+ data_type: TYPE_FP32
81
+ dims: [ -1 ]
82
+ },
83
+ {
84
+ name: "OUT_OUTPUT_LOG_PROBS"
85
+ data_type: TYPE_FP32
86
+ dims: [ -1, -1 ]
87
+ },
88
+ {
89
+ name: "OUT_CONTEXT_LOGITS"
90
+ data_type: TYPE_FP32
91
+ dims: [ -1, -1 ]
92
+ },
93
+ {
94
+ name: "OUT_GENERATION_LOGITS"
95
+ data_type: TYPE_FP32
96
+ dims: [ -1, -1, -1 ]
97
+ },
98
+ {
99
+ name: "OUT_BATCH_INDEX"
100
+ data_type: TYPE_INT32
101
+ dims: [ 1 ]
102
+ }
103
+ ]
104
+
105
+ parameters {
106
+ key: "tokenizer_dir"
107
+ value: {
108
+ string_value: "mlabonne/Llama-3.1-70B-Instruct-lorablated"
109
+ }
110
+ }
111
+
112
+ parameters {
113
+ key: "skip_special_tokens"
114
+ value: {
115
+ string_value: "True"
116
+ }
117
+ }
118
+
119
+ instance_group [
120
+ {
121
+ count: 1
122
+ kind: KIND_CPU
123
+ }
124
+ ]
preprocessing/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (11.1 kB).
 
preprocessing/1/model.py ADDED
@@ -0,0 +1,439 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+ import os
29
+ from typing import List
30
+
31
+ import numpy as np
32
+ import triton_python_backend_utils as pb_utils
33
+ from transformers import AutoTokenizer, T5Tokenizer
34
+
35
+
36
+ class TritonPythonModel:
37
+ """Your Python model must use the same class name. Every Python model
38
+ that is created must have "TritonPythonModel" as the class name.
39
+ """
40
+
41
+ def initialize(self, args):
42
+ """`initialize` is called only once when the model is being loaded.
43
+ Implementing `initialize` function is optional. This function allows
44
+ the model to initialize any state associated with this model.
45
+ Parameters
46
+ ----------
47
+ args : dict
48
+ Both keys and values are strings. The dictionary keys and values are:
49
+ * model_config: A JSON string containing the model configuration
50
+ * model_instance_kind: A string containing model instance kind
51
+ * model_instance_device_id: A string containing model instance device ID
52
+ * model_repository: Model repository path
53
+ * model_version: Model version
54
+ * model_name: Model name
55
+ """
56
+ # Parse model configs
57
+ model_config = json.loads(args['model_config'])
58
+ tokenizer_dir = model_config['parameters']['tokenizer_dir'][
59
+ 'string_value']
60
+
61
+ add_special_tokens = model_config['parameters'].get(
62
+ 'add_special_tokens')
63
+ visual_model_path = model_config['parameters']['visual_model_path'][
64
+ 'string_value']
65
+ if visual_model_path == "${visual_model_path}" or visual_model_path == "":
66
+ visual_model_path = None
67
+
68
+ if add_special_tokens is not None:
69
+ add_special_tokens_str = add_special_tokens['string_value'].lower()
70
+ if add_special_tokens_str in [
71
+ 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
72
+ ]:
73
+ self.add_special_tokens = add_special_tokens_str in [
74
+ 'true', '1', 't', 'y', 'yes'
75
+ ]
76
+ else:
77
+ print(
78
+ f"[TensorRT-LLM][WARNING] Invalid value for 'add_special_tokens' (got {add_special_tokens['string_value']}); defaulting to True."
79
+ )
80
+ self.add_special_tokens = True
81
+ else:
82
+ print(
83
+ "[TensorRT-LLM][WARNING] 'add_special_tokens' is not set; defaulting to True."
84
+ )
85
+ self.add_special_tokens = True
86
+
87
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
88
+ legacy=False,
89
+ padding_side='left',
90
+ trust_remote_code=True)
91
+ if isinstance(self.tokenizer, T5Tokenizer):
92
+ self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
93
+
94
+ if not self.tokenizer.pad_token:
95
+ self.tokenizer.pad_token = self.tokenizer.eos_token
96
+
97
+ self.tokenizer_end_id = self.tokenizer.encode(
98
+ self.tokenizer.eos_token, add_special_tokens=False)[0]
99
+ self.tokenizer_pad_id = self.tokenizer.encode(
100
+ self.tokenizer.pad_token, add_special_tokens=False)[0]
101
+
102
+ self.is_multimodal = False
103
+ if visual_model_path is not None:
104
+ self.is_multimodal = True
105
+ visual_model_path = os.path.join(visual_model_path, 'config.json')
106
+ with open(visual_model_path, 'r') as f:
107
+ visual_model_config = json.load(f)
108
+ self.model_type = visual_model_config['builder_config'][
109
+ 'model_type']
110
+
111
+ assert self.model_type in [
112
+ 'llava', 'blip2-opt'
113
+ ], "[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava and blip2-opt"
114
+
115
+ llm_model_path = model_config['parameters']['gpt_model_path'][
116
+ 'string_value']
117
+ llm_model_path = os.path.join(llm_model_path, 'config.json')
118
+ with open(llm_model_path, 'r') as f:
119
+ llm_model_config = json.load(f)
120
+ self.vocab_size = int(
121
+ llm_model_config["pretrained_config"]["vocab_size"])
122
+ self._setup_ptable_shape(llm_model_config)
123
+
124
+ # Parse model output configs and convert Triton types to numpy types
125
+ output_names = [
126
+ "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
127
+ "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
128
+ "OUT_END_ID", "OUT_PAD_ID"
129
+ ]
130
+ input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
131
+ for input_name in input_names:
132
+ setattr(
133
+ self,
134
+ input_name.lower() + "_dtype",
135
+ pb_utils.triton_string_to_numpy(
136
+ pb_utils.get_input_config_by_name(
137
+ model_config, input_name)['data_type']))
138
+
139
+ for output_name in output_names:
140
+ setattr(
141
+ self,
142
+ output_name.lower() + "_dtype",
143
+ pb_utils.triton_string_to_numpy(
144
+ pb_utils.get_output_config_by_name(
145
+ model_config, output_name)['data_type']))
146
+
147
+ def _setup_ptable_shape(self, llm_model_config):
148
+ max_prompt_embedding_table_size = llm_model_config['build_config'][
149
+ 'max_prompt_embedding_table_size']
150
+ max_batch_size = llm_model_config['build_config']['max_batch_size']
151
+
152
+ num_visual_features = max_prompt_embedding_table_size // max_batch_size
153
+ hidden_size = llm_model_config['pretrained_config']['hidden_size']
154
+
155
+ self.ptable_shape = (-1, num_visual_features, hidden_size)
156
+
157
+ def execute(self, requests):
158
+ """`execute` must be implemented in every Python model. `execute`
159
+ function receives a list of pb_utils.InferenceRequest as the only
160
+ argument. This function is called when an inference is requested
161
+ for this model. Depending on the batching configuration (e.g. Dynamic
162
+ Batching) used, `requests` may contain multiple requests. Every
163
+ Python model must create one pb_utils.InferenceResponse for every
164
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
165
+ set the error argument when creating a pb_utils.InferenceResponse.
166
+ Parameters
167
+ ----------
168
+ requests : list
169
+ A list of pb_utils.InferenceRequest
170
+ Returns
171
+ -------
172
+ list
173
+ A list of pb_utils.InferenceResponse. The length of this list must
174
+ be the same as `requests`
175
+ """
176
+
177
+ responses = []
178
+
179
+ # Every Python backend must iterate over every one of the requests
180
+ # and create a pb_utils.InferenceResponse for each of them.
181
+ for idx, request in enumerate(requests):
182
+ # Get input tensors
183
+ query = pb_utils.get_input_tensor_by_name(request,
184
+ 'QUERY').as_numpy()
185
+ batch_size = query.shape[0]
186
+
187
+ decoder_query = pb_utils.get_input_tensor_by_name(
188
+ request, 'DECODER_QUERY')
189
+ if decoder_query is not None:
190
+ decoder_query = decoder_query.as_numpy()
191
+
192
+ request_output_len = pb_utils.get_input_tensor_by_name(
193
+ request, 'REQUEST_OUTPUT_LEN').as_numpy()
194
+
195
+ bad_words_dict = pb_utils.get_input_tensor_by_name(
196
+ request, 'BAD_WORDS_DICT')
197
+ if bad_words_dict is not None:
198
+ bad_words_dict = bad_words_dict.as_numpy()
199
+
200
+ stop_words_dict = pb_utils.get_input_tensor_by_name(
201
+ request, 'STOP_WORDS_DICT')
202
+ if stop_words_dict is not None:
203
+ stop_words_dict = stop_words_dict.as_numpy()
204
+
205
+ embedding_bias_words = pb_utils.get_input_tensor_by_name(
206
+ request, 'EMBEDDING_BIAS_WORDS')
207
+ if embedding_bias_words is not None:
208
+ embedding_bias_words = embedding_bias_words.as_numpy()
209
+
210
+ embedding_bias_weights = pb_utils.get_input_tensor_by_name(
211
+ request, 'EMBEDDING_BIAS_WEIGHTS')
212
+ if embedding_bias_weights is not None:
213
+ embedding_bias_weights = embedding_bias_weights.as_numpy()
214
+
215
+ # Take the end_id from the input tensors
216
+ # If not specified, use tokenizer to get end_id
217
+ end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID')
218
+ if end_id is not None:
219
+ end_id = end_id.as_numpy()
220
+ else:
221
+ end_id = [[self.tokenizer_end_id]] * batch_size
222
+
223
+ # Take the pad_id from the input tensors
224
+ # If not specified, use tokenizer to get pad_id
225
+ pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID')
226
+ if pad_id is not None:
227
+ pad_id = pad_id.as_numpy()
228
+ else:
229
+ pad_id = [[self.tokenizer_pad_id]] * batch_size
230
+
231
+ # Preprocessing input data.
232
+ input_id, request_input_len = self._create_request(query)
233
+ if decoder_query is not None:
234
+ decoder_input_id, request_decoder_input_len = self._create_request(
235
+ decoder_query)
236
+ else:
237
+ decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32)
238
+ request_decoder_input_len = 1 * np.ones(
239
+ (batch_size, 1), np.int32)
240
+
241
+ bad_words = self._to_word_list_format(bad_words_dict, batch_size)
242
+ stop_words = self._to_word_list_format(stop_words_dict, batch_size)
243
+
244
+ embedding_bias = self._get_embedding_bias(
245
+ embedding_bias_words, embedding_bias_weights,
246
+ self.embedding_bias_weights_dtype, batch_size)
247
+
248
+ # Create output tensors. You need pb_utils.Tensor
249
+ # objects to create pb_utils.InferenceResponse.
250
+ input_id_tensor = pb_utils.Tensor(
251
+ 'INPUT_ID', input_id.astype(self.input_id_dtype))
252
+ request_input_len_tensor = pb_utils.Tensor(
253
+ 'REQUEST_INPUT_LEN',
254
+ request_input_len.astype(self.request_input_len_dtype))
255
+ decoder_input_id_tensor = pb_utils.Tensor(
256
+ 'DECODER_INPUT_ID',
257
+ decoder_input_id.astype(self.decoder_input_id_dtype))
258
+ request_decoder_input_len_tensor = pb_utils.Tensor(
259
+ 'REQUEST_DECODER_INPUT_LEN',
260
+ request_decoder_input_len.astype(
261
+ self.request_decoder_input_len_dtype))
262
+ request_output_len_tensor = pb_utils.Tensor(
263
+ 'REQUEST_OUTPUT_LEN', request_output_len)
264
+ bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
265
+ stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
266
+ stop_words)
267
+ embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
268
+ embedding_bias)
269
+ end_id_tensor = pb_utils.Tensor('OUT_END_ID',
270
+ np.array(end_id, dtype=np.int32))
271
+ pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
272
+ np.array(pad_id, dtype=np.int32))
273
+
274
+ inference_response = pb_utils.InferenceResponse(output_tensors=[
275
+ input_id_tensor, decoder_input_id_tensor, bad_words_ids_tensor,
276
+ stop_words_ids_tensor, request_input_len_tensor,
277
+ request_decoder_input_len_tensor, request_output_len_tensor,
278
+ embedding_bias_tensor, end_id_tensor, pad_id_tensor
279
+ ])
280
+ responses.append(inference_response)
281
+
282
+ # You should return a list of pb_utils.InferenceResponse. Length
283
+ # of this list must match the length of `requests` list.
284
+ return responses
285
+
286
+ def finalize(self):
287
+ """`finalize` is called only once when the model is being unloaded.
288
+ Implementing `finalize` function is optional. This function allows
289
+ the model to perform any necessary clean ups before exit.
290
+ """
291
+ print('Cleaning up...')
292
+
293
+ def _create_request(self, query):
294
+ """
295
+ query : batch string (2D numpy array)
296
+ """
297
+ if isinstance(self.tokenizer, T5Tokenizer):
298
+ start_ids = [
299
+ np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
300
+ s[0].decode(), add_special_tokens=self.add_special_tokens)
301
+ ).astype(int) for s in query
302
+ ]
303
+ else:
304
+ start_ids = [
305
+ np.array(
306
+ self.tokenizer.encode(
307
+ s[0].decode(),
308
+ add_special_tokens=self.add_special_tokens)).astype(
309
+ int) for s in query
310
+ ]
311
+
312
+ if self.is_multimodal:
313
+ if 'blip2' in self.model_type:
314
+ pre_prompt = None
315
+ post_prompt = None
316
+ elif 'llava' == self.model_type:
317
+ pre_prompt = "USER:\n"
318
+ post_prompt = " ASSISTANT:"
319
+
320
+ fake_prompt_id = np.arange(self.vocab_size,
321
+ self.vocab_size + self.ptable_shape[1])
322
+
323
+ if pre_prompt is not None:
324
+ pre_prompt_id = np.array(
325
+ self.tokenizer.encode(
326
+ pre_prompt,
327
+ add_special_tokens=self.add_special_tokens,
328
+ padding=True))
329
+
330
+ if post_prompt is not None:
331
+ post_prompt_id = np.array(
332
+ self.tokenizer.encode(
333
+ post_prompt,
334
+ add_special_tokens=self.add_special_tokens,
335
+ padding=True))
336
+
337
+ if post_prompt is None:
338
+ start_ids = [
339
+ np.concatenate((fake_prompt_id, ids), axis=0)
340
+ for ids in start_ids
341
+ ]
342
+ else:
343
+ start_ids = [
344
+ np.concatenate(
345
+ (pre_prompt_id, fake_prompt_id, ids, post_prompt_id),
346
+ axis=0) for ids in start_ids
347
+ ]
348
+ start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
349
+
350
+ max_len = 0
351
+ for seq in start_ids:
352
+ max_len = max(max_len, seq.shape[0])
353
+ start_ids = np.stack([
354
+ np.pad(seq, (0, max_len - seq.shape[0]),
355
+ 'constant',
356
+ constant_values=(0, self.tokenizer_pad_id))
357
+ for seq in start_ids
358
+ ])
359
+
360
+ return start_ids, start_lengths
361
+
362
+ def _to_word_list_format(self, word_lists: List[List[str | bytes]],
363
+ batch_size):
364
+ '''
365
+ word_lists format:
366
+ len(word_lists) == batch_size
367
+ word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
368
+ '''
369
+ assert self.tokenizer is not None, "need to set tokenizer"
370
+
371
+ if word_lists is None:
372
+ # Return an empty array of shape (1,2,0)
373
+ return np.empty([batch_size, 2, 0], dtype="int32")
374
+
375
+ flat_ids = []
376
+ offsets = []
377
+ for word_list in word_lists:
378
+ item_flat_ids = []
379
+ item_offsets = []
380
+
381
+ for word in word_list:
382
+ if isinstance(word, bytes):
383
+ word = word.decode()
384
+
385
+ ids = self.tokenizer.encode(word, add_special_tokens=False)
386
+ if len(ids) == 0:
387
+ continue
388
+
389
+ item_flat_ids += ids
390
+ item_offsets.append(len(ids))
391
+
392
+ flat_ids.append(np.array(item_flat_ids))
393
+ offsets.append(np.cumsum(np.array(item_offsets)))
394
+
395
+ pad_to = max(1, max(len(ids) for ids in flat_ids))
396
+
397
+ for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
398
+ flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
399
+ constant_values=0)
400
+ offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
401
+ constant_values=-1)
402
+
403
+ return np.array([flat_ids, offsets], dtype="int32").transpose(
404
+ (1, 0, 2))
405
+
406
+ def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
407
+ bias_dtype, batch_size):
408
+
409
+ assert self.tokenizer is not None, "need to set tokenizer"
410
+
411
+ if embedding_bias_words is None or embedding_bias_weights is None:
412
+ return np.empty([batch_size, 0],
413
+ dtype=self.embedding_bias_weights_dtype)
414
+
415
+ batch_embedding_bias = []
416
+ for words, weights in zip(embedding_bias_words,
417
+ embedding_bias_weights):
418
+
419
+ vocab_size = self.tokenizer.vocab_size
420
+ embedding_bias = [0.] * vocab_size
421
+
422
+ assert len(words) == len(
423
+ weights
424
+ ), "Embedding bias words must have same dimension as embedding bias weights"
425
+
426
+ for word, weight in zip(words, weights):
427
+ if isinstance(word, bytes):
428
+ word = word.decode()
429
+ ids = self.tokenizer.encode(word)
430
+
431
+ if len(ids) == 0:
432
+ continue
433
+
434
+ for id in ids:
435
+ embedding_bias[id] += weight
436
+
437
+ batch_embedding_bias.append(np.array(embedding_bias))
438
+
439
+ return np.array(batch_embedding_bias, dtype=bias_dtype)
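
_to_word_list_format above packs per-request stop/bad word lists into the [batch, 2, pad_to] int32 layout expected by the tensorrt_llm model: row 0 holds the concatenated token ids, row 1 the cumulative end offsets, padded with 0 and -1 respectively. A small sketch with a hypothetical toy tokenizer to show that layout:

import numpy as np

class ToyTokenizer:
    # Hypothetical tokenizer (one id per character), used only to show the layout.
    def encode(self, word, add_special_tokens=False):
        return [ord(c) % 100 for c in word]

tokenizer = ToyTokenizer()
word_lists = [[b"ab", b"c"], [b"de"]]  # stop/bad words for a batch of 2 requests

flat_ids, offsets = [], []
for word_list in word_lists:
    item_flat_ids, item_offsets = [], []
    for word in word_list:
        ids = tokenizer.encode(word.decode())
        item_flat_ids += ids
        item_offsets.append(len(ids))
    flat_ids.append(np.array(item_flat_ids))
    offsets.append(np.cumsum(np.array(item_offsets)))

pad_to = max(1, max(len(ids) for ids in flat_ids))
for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
    flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
    offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)

result = np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
print(result.shape)  # (2, 2, 3): [batch, ids/offsets, pad_to]
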
preprocessing/config.pbtxt ADDED
@@ -0,0 +1,170 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "preprocessing"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+ input [
31
+ {
32
+ name: "QUERY"
33
+ data_type: TYPE_STRING
34
+ dims: [ 1 ]
35
+ },
36
+ {
37
+ name: "DECODER_QUERY"
38
+ data_type: TYPE_STRING
39
+ dims: [ 1 ]
40
+ optional: true
41
+ },
42
+ {
43
+ name: "REQUEST_OUTPUT_LEN"
44
+ data_type: TYPE_INT32
45
+ dims: [ 1 ]
46
+ },
47
+ {
48
+ name: "BAD_WORDS_DICT"
49
+ data_type: TYPE_STRING
50
+ dims: [ -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "STOP_WORDS_DICT"
55
+ data_type: TYPE_STRING
56
+ dims: [ -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "EMBEDDING_BIAS_WORDS"
61
+ data_type: TYPE_STRING
62
+ dims: [ -1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "EMBEDDING_BIAS_WEIGHTS"
67
+ data_type: TYPE_FP32
68
+ dims: [ -1 ]
69
+ optional: true
70
+ },
71
+ {
72
+ name: "END_ID"
73
+ data_type: TYPE_INT32
74
+ dims: [ 1 ]
75
+ optional: true
76
+ },
77
+ {
78
+ name: "PAD_ID"
79
+ data_type: TYPE_INT32
80
+ dims: [ 1 ]
81
+ optional: true
82
+ }
83
+ ]
84
+ output [
85
+ {
86
+ name: "INPUT_ID"
87
+ data_type: TYPE_INT32
88
+ dims: [ -1 ]
89
+ },
90
+ {
91
+ name: "REQUEST_INPUT_LEN"
92
+ data_type: TYPE_INT32
93
+ dims: [ 1 ]
94
+ },
95
+ {
96
+ name: "DECODER_INPUT_ID"
97
+ data_type: TYPE_INT32
98
+ dims: [ -1 ]
99
+ },
100
+ {
101
+ name: "REQUEST_DECODER_INPUT_LEN"
102
+ data_type: TYPE_INT32
103
+ dims: [ 1 ]
104
+ },
105
+ {
106
+ name: "BAD_WORDS_IDS"
107
+ data_type: TYPE_INT32
108
+ dims: [ 2, -1 ]
109
+ },
110
+ {
111
+ name: "STOP_WORDS_IDS"
112
+ data_type: TYPE_INT32
113
+ dims: [ 2, -1 ]
114
+ },
115
+ {
116
+ name: "EMBEDDING_BIAS"
117
+ data_type: TYPE_FP32
118
+ dims: [ -1 ]
119
+ },
120
+ {
121
+ name: "REQUEST_OUTPUT_LEN"
122
+ data_type: TYPE_INT32
123
+ dims: [ -1 ]
124
+ },
125
+ {
126
+ name: "OUT_END_ID"
127
+ data_type: TYPE_INT32
128
+ dims: [ 1 ]
129
+ },
130
+ {
131
+ name: "OUT_PAD_ID"
132
+ data_type: TYPE_INT32
133
+ dims: [ 1 ]
134
+ }
135
+ ]
136
+
137
+ parameters {
138
+ key: "tokenizer_dir"
139
+ value: {
140
+ string_value: "mlabonne/Llama-3.1-70B-Instruct-lorablated"
141
+ }
142
+ }
143
+
144
+ parameters {
145
+ key: "add_special_tokens"
146
+ value: {
147
+ string_value: "False"
148
+ }
149
+ }
150
+
151
+ parameters {
152
+ key: "visual_model_path"
153
+ value: {
154
+ string_value: "${visual_model_path}"
155
+ }
156
+ }
157
+
158
+ parameters: {
159
+ key: "gpt_model_path"
160
+ value: {
161
+ string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
162
+ }
163
+ }
164
+
165
+ instance_group [
166
+ {
167
+ count: 1
168
+ kind: KIND_CPU
169
+ }
170
+ ]
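
Note that visual_model_path above is left as the literal template placeholder "${visual_model_path}", which model.py treats as unset (text-only mode). Below is a generic sketch (not the official tooling) for filling such ${...} placeholders before deployment; the file path and the substituted values are assumptions.

import re
from pathlib import Path

def fill_template(path, values):
    # Replace ${key} with the provided value; unknown placeholders are left untouched.
    text = Path(path).read_text()
    text = re.sub(r"\$\{(\w+)\}", lambda m: values.get(m.group(1), m.group(0)), text)
    Path(path).write_text(text)

# Example: blank out the placeholder so preprocessing stays in text-only mode.
fill_template("preprocessing/config.pbtxt", {"visual_model_path": ""})
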
tensorrt_llm/1/.gitkeep ADDED
File without changes
tensorrt_llm/1/config.json ADDED
@@ -0,0 +1,170 @@
1
+ {
2
+ "version": "0.13.0.dev2024082000",
3
+ "pretrained_config": {
4
+ "mlp_bias": false,
5
+ "attn_bias": false,
6
+ "rotary_base": 500000.0,
7
+ "rotary_scaling": {
8
+ "factor": 32.0,
9
+ "high_freq_factor": 4.0,
10
+ "low_freq_factor": 1.0,
11
+ "original_max_position_embeddings": 8192,
12
+ "rope_type": "llama3"
13
+ },
14
+ "residual_mlp": false,
15
+ "disable_weight_only_quant_plugin": false,
16
+ "moe": {
17
+ "num_experts": 0,
18
+ "top_k": 0,
19
+ "normalization_mode": 1,
20
+ "tp_mode": 0
21
+ },
22
+ "remove_duplicated_kv_heads": false,
23
+ "architecture": "LlamaForCausalLM",
24
+ "dtype": "float16",
25
+ "vocab_size": 128256,
26
+ "hidden_size": 2048,
27
+ "num_hidden_layers": 16,
28
+ "num_attention_heads": 32,
29
+ "hidden_act": "silu",
30
+ "logits_dtype": "float16",
31
+ "norm_epsilon": 1e-05,
32
+ "position_embedding_type": "rope_gpt_neox",
33
+ "max_position_embeddings": 131072,
34
+ "num_key_value_heads": 8,
35
+ "intermediate_size": 8192,
36
+ "mapping": {
37
+ "world_size": 2,
38
+ "gpus_per_node": 8,
39
+ "cp_size": 1,
40
+ "tp_size": 2,
41
+ "pp_size": 1,
42
+ "moe_tp_size": 2,
43
+ "moe_ep_size": 1
44
+ },
45
+ "quantization": {
46
+ "quant_algo": "FP8",
47
+ "kv_cache_quant_algo": "FP8",
48
+ "group_size": 128,
49
+ "smoothquant_val": 0.5,
50
+ "clamp_val": null,
51
+ "has_zero_point": false,
52
+ "pre_quant_scale": false,
53
+ "exclude_modules": null
54
+ },
55
+ "use_parallel_embedding": true,
56
+ "embedding_sharding_dim": 0,
57
+ "share_embedding_table": false,
58
+ "head_size": 64,
59
+ "qk_layernorm": false,
60
+ "producer": {
61
+ "name": "modelopt",
62
+ "version": "0.15.1"
63
+ },
64
+ "bias": false,
65
+ "rotary_pct": 1.0,
66
+ "rank": 0,
67
+ "decoder": "llama",
68
+ "rmsnorm": true,
69
+ "lm_head_bias": false
70
+ },
71
+ "build_config": {
72
+ "max_input_len": 124000,
73
+ "max_seq_len": 4194304,
74
+ "opt_batch_size": null,
75
+ "max_batch_size": 32,
76
+ "max_beam_width": 1,
77
+ "max_num_tokens": 128000,
78
+ "opt_num_tokens": null,
79
+ "max_prompt_embedding_table_size": 0,
80
+ "kv_cache_type": "PAGED",
81
+ "gather_context_logits": false,
82
+ "gather_generation_logits": false,
83
+ "strongly_typed": true,
84
+ "builder_opt": null,
85
+ "force_num_profiles": null,
86
+ "profiling_verbosity": "layer_names_only",
87
+ "enable_debug_output": false,
88
+ "max_draft_len": 0,
89
+ "speculative_decoding_mode": 1,
90
+ "use_refit": false,
91
+ "input_timing_cache": null,
92
+ "output_timing_cache": "model.cache",
93
+ "lora_config": {
94
+ "lora_dir": [],
95
+ "lora_ckpt_source": "hf",
96
+ "max_lora_rank": 64,
97
+ "lora_target_modules": [],
98
+ "trtllm_modules_to_hf_modules": {}
99
+ },
100
+ "auto_parallel_config": {
101
+ "world_size": 1,
102
+ "gpus_per_node": 8,
103
+ "cluster_key": "H100-PCIe",
104
+ "cluster_info": null,
105
+ "sharding_cost_model": "alpha_beta",
106
+ "comm_cost_model": "alpha_beta",
107
+ "enable_pipeline_parallelism": false,
108
+ "enable_shard_unbalanced_shape": false,
109
+ "enable_shard_dynamic_shape": false,
110
+ "enable_reduce_scatter": true,
111
+ "builder_flags": null,
112
+ "debug_mode": false,
113
+ "infer_shape": true,
114
+ "validation_mode": false,
115
+ "same_buffer_io": {
116
+ "past_key_value_(\\d+)": "present_key_value_\\1"
117
+ },
118
+ "same_spec_io": {},
119
+ "sharded_io_allowlist": [
120
+ "past_key_value_\\d+",
121
+ "present_key_value_\\d*"
122
+ ],
123
+ "fill_weights": false,
124
+ "parallel_config_cache": null,
125
+ "profile_cache": null,
126
+ "dump_path": null,
127
+ "debug_outputs": []
128
+ },
129
+ "weight_sparsity": false,
130
+ "weight_streaming": false,
131
+ "plugin_config": {
132
+ "dtype": "float16",
133
+ "bert_attention_plugin": "auto",
134
+ "gpt_attention_plugin": "float16",
135
+ "gemm_plugin": "fp8",
136
+ "gemm_swiglu_plugin": null,
137
+ "fp8_rowwise_gemm_plugin": null,
138
+ "smooth_quant_gemm_plugin": null,
139
+ "identity_plugin": null,
140
+ "layernorm_quantization_plugin": null,
141
+ "rmsnorm_quantization_plugin": null,
142
+ "nccl_plugin": "float16",
143
+ "lookup_plugin": null,
144
+ "lora_plugin": null,
145
+ "weight_only_groupwise_quant_matmul_plugin": null,
146
+ "weight_only_quant_matmul_plugin": null,
147
+ "quantize_per_token_plugin": false,
148
+ "quantize_tensor_plugin": false,
149
+ "moe_plugin": "auto",
150
+ "mamba_conv1d_plugin": "auto",
151
+ "context_fmha": true,
152
+ "bert_context_fmha_fp32_acc": false,
153
+ "paged_kv_cache": true,
154
+ "remove_input_padding": true,
155
+ "reduce_fusion": false,
156
+ "enable_xqa": true,
157
+ "tokens_per_block": 64,
158
+ "use_paged_context_fmha": true,
159
+ "use_fp8_context_fmha": true,
160
+ "multiple_profiles": true,
161
+ "paged_state": false,
162
+ "streamingllm": false,
163
+ "manage_weights": false,
164
+ "use_fused_mlp": true
165
+ },
166
+ "use_strip_plan": false,
167
+ "max_encoder_input_len": 1024,
168
+ "use_fused_mlp": true
169
+ }
170
+ }
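
The engine metadata above records a tensor-parallel build (mapping.tp_size: 2), which is why the repository tracks both rank0.engine and rank1.engine via LFS. A minimal sketch of reading the build limits from this file before sizing requests (the relative path assumes this repository layout):

import json

with open("tensorrt_llm/1/config.json") as f:
    cfg = json.load(f)

build = cfg["build_config"]
mapping = cfg["pretrained_config"]["mapping"]

print("tensor-parallel ranks:", mapping["tp_size"])       # 2 -> rank0/rank1 engines
print("max batch size:", build["max_batch_size"])         # 32
print("max input length:", build["max_input_len"])        # 124000
print("max tokens per batch:", build["max_num_tokens"])   # 128000
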
tensorrt_llm/1/model.py ADDED
@@ -0,0 +1,947 @@
1
+ import datetime
2
+ import json
3
+ import os
4
+ import sys
5
+ import time
6
+ from random import randint
7
+ from threading import Lock, Thread
8
+
9
+ import numpy as np
10
+ import torch
11
+ import triton_python_backend_utils as pb_utils
12
+ from torch import from_numpy
13
+ from torch.utils.dlpack import from_dlpack
14
+
15
+ import tensorrt_llm.bindings.executor as trtllm
16
+
17
+
18
+ def get_input_tensor_by_name(request,
19
+ name,
20
+ expected_batch_size=None,
21
+ batch_index=None):
22
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
23
+ if tensor is None:
24
+ return None
25
+
26
+ if tensor.is_cpu():
27
+ tensor = tensor.as_numpy()
28
+ else:
29
+ tensor = from_dlpack(tensor.to_dlpack())
30
+
31
+ if expected_batch_size is not None and tensor.shape[
32
+ 0] != expected_batch_size:
33
+ raise pb_utils.TritonModelException(
34
+ f"Expected batch size doesn't match batch size for tensor {name}. Expected {expected_batch_size} got {tensor.shape[0]}"
35
+ )
36
+
37
+ if batch_index is not None and expected_batch_size is not None and batch_index >= expected_batch_size:
38
+ raise pb_utils.TritonModelException(
39
+ f"Invalid batch index in get_input_tensor_by_name for {name}")
40
+
41
+ if batch_index is not None:
42
+ # Add leading 1 batch dimension
43
+ if isinstance(tensor, np.ndarray):
44
+ return np.expand_dims(tensor[batch_index], axis=0)
45
+ elif isinstance(tensor, torch.Tensor):
46
+ return torch.unsqueeze(tensor[batch_index], dim=0)
47
+ else:
48
+ return tensor
49
+
50
+
51
+ def get_input_scalar_by_name(request,
52
+ name,
53
+ expected_batch_size=1,
54
+ batch_index=0):
55
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
56
+ if tensor is None:
57
+ return None
58
+ tensor = tensor.as_numpy()
59
+
60
+ if tensor.size != expected_batch_size:
61
+ raise pb_utils.TritonModelException(
62
+ f"Expected a scalar tensor for tensor {name}")
63
+
64
+ return tensor.item(batch_index)
65
+
66
+
67
+ def read_parameter_as_type(value, name, pytype=str):
68
+ if value == "":
69
+ return None
70
+ if value.startswith("${") and value.endswith("}"):
71
+ return None
72
+ if pytype is bool:
73
+ return value.lower() in ["1", "true"]
74
+ try:
75
+ result = pytype(value)
76
+ return result
77
+ except Exception:
78
+ pb_utils.Logger.log_warning(
79
+ f"Could not read parameter '{name}' with value '{value}', will use default."
80
+ )
81
+ return None
82
+
83
+
84
+ def get_parameter(model_config, name, pytype=str):
85
+ if name not in model_config['parameters']:
86
+ return None
87
+ return read_parameter_as_type(
88
+ model_config['parameters'][name]['string_value'], name, pytype)
89
+
90
+
91
+ def convert_word_list(word_list):
92
+ if word_list is None:
93
+ return None
94
+ word_list = word_list.tolist()
95
+ if len(word_list) == 0 or len(word_list[0]) != 2:
96
+ raise pb_utils.TritonModelException(f"Invalid format for word list.")
97
+ words, indices = word_list[0]
98
+ result = []
99
+ current_index = 0
100
+ for i in indices:
101
+ if i == -1:
102
+ continue
103
+ if i > len(words):
104
+ raise pb_utils.TritonModelException(
105
+ f"Invalid format for word list.")
106
+ current_word = []
107
+ while current_index < i:
108
+ current_word.append(words[current_index])
109
+ current_index += 1
110
+ result.append(current_word)
111
+ return result
112
+
113
+
114
+ def parse_medusa_choices(medusa_choices):
115
+ if medusa_choices is None:
116
+ return None
117
+ try:
118
+ result = json.loads(
119
+ "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
120
+ assert isinstance(result, list) and len(result) > 0
121
+ assert all([isinstance(x, list) for x in result])
122
+ assert all([isinstance(y, int) for x in result for y in x])
123
+ except Exception:
124
+ raise pb_utils.TritonModelException(
125
+ "Invalid format for medusa_choices")
126
+ return result
127
+
128
+
129
+ def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
130
+ kwargs = {}
131
+ kwargs['beam_width'] = get_input_scalar_by_name(
132
+ request, 'beam_width', batch_size, batch_index) or 1
133
+ kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k',
134
+ batch_size, batch_index)
135
+ kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p',
136
+ batch_size, batch_index)
137
+ kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[
138
+ 'top_p'] <= 0 else kwargs['top_p']
139
+ kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed',
140
+ batch_size, batch_index)
141
+ kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature',
142
+ batch_size, batch_index)
143
+ kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length',
144
+ batch_size, batch_index)
145
+ kwargs['repetition_penalty'] = get_input_scalar_by_name(
146
+ request, 'repetition_penalty', batch_size, batch_index)
147
+ kwargs['presence_penalty'] = get_input_scalar_by_name(
148
+ request, 'presence_penalty', batch_size, batch_index)
149
+ kwargs['frequency_penalty'] = get_input_scalar_by_name(
150
+ request, 'frequency_penalty', batch_size, batch_index)
151
+ kwargs['length_penalty'] = get_input_scalar_by_name(
152
+ request, 'len_penalty', batch_size, batch_index)
153
+ kwargs['top_p_min'] = get_input_scalar_by_name(request,
154
+ 'runtime_top_p_min',
155
+ batch_size, batch_index)
156
+ kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
157
+ request, 'runtime_top_p_reset_ids', batch_size, batch_index)
158
+ kwargs['top_p_decay'] = get_input_scalar_by_name(request,
159
+ 'runtime_top_p_decay',
160
+ batch_size, batch_index)
161
+ kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
162
+ request, 'beam_search_diversity_rate', batch_size, batch_index)
163
+ kwargs['early_stopping'] = get_input_scalar_by_name(
164
+ request, 'early_stopping', batch_size, batch_index)
165
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
166
+ return trtllm.SamplingConfig(**kwargs)
167
+
168
+
169
+ def get_output_config_from_request(request,
170
+ exclude_input_from_output,
171
+ batch_size=1,
172
+ batch_index=0):
173
+ kwargs = {}
174
+ kwargs["return_log_probs"] = get_input_scalar_by_name(
175
+ request, 'return_log_probs', batch_size, batch_index)
176
+ kwargs["return_context_logits"] = get_input_scalar_by_name(
177
+ request, 'return_context_logits', batch_size, batch_index)
178
+ kwargs["return_generation_logits"] = get_input_scalar_by_name(
179
+ request, 'return_generation_logits', batch_size, batch_index)
180
+ kwargs["exclude_input_from_output"] = exclude_input_from_output
181
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
182
+ return trtllm.OutputConfig(**kwargs)
183
+
184
+
185
+ def get_external_draft_tokens_config_from_request(request,
186
+ batch_size=1,
187
+ batch_index=0):
188
+ kwargs = {}
189
+ draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids',
190
+ batch_size, batch_index)
191
+ if draft_input_ids is not None:
192
+ kwargs['tokens'] = draft_input_ids[0].tolist()
193
+ draft_logits = get_input_tensor_by_name(request, 'draft_logits',
194
+ batch_size, batch_index)
195
+ if draft_logits is not None:
196
+ kwargs['logits'] = from_numpy(draft_logits).squeeze()
197
+ kwargs['acceptance_threshold'] = get_input_scalar_by_name(
198
+ request, 'draft_acceptance_threshold', batch_size, batch_index)
199
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
200
+ if len(kwargs) > 0:
201
+ return trtllm.ExternalDraftTokensConfig(**kwargs)
202
+ return None
203
+
204
+
205
+ def get_prompt_tuning_config_from_request(request,
206
+ batch_size=1,
207
+ batch_index=0):
208
+ # prompt_vocab_size is unused by executor.
209
+ kwargs = {}
210
+ prompt_embedding_table = get_input_tensor_by_name(
211
+ request, 'prompt_embedding_table', batch_size, batch_index)
212
+ if prompt_embedding_table is not None:
213
+ if isinstance(prompt_embedding_table, np.ndarray):
214
+ kwargs["embedding_table"] = from_numpy(
215
+ prompt_embedding_table).squeeze()
216
+ elif isinstance(prompt_embedding_table, torch.Tensor):
217
+ kwargs["embedding_table"] = from_dlpack(
218
+ prompt_embedding_table.to_dlpack()).squeeze(dim=0)
219
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
220
+ if len(kwargs) > 0:
221
+ return trtllm.PromptTuningConfig(**kwargs)
222
+ return None
223
+
224
+
225
+ def get_lora_config_from_request(request, batch_size=1, batch_index=0):
226
+ kwargs = {}
227
+ kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id',
228
+ batch_size, batch_index)
229
+ lora_weights = get_input_tensor_by_name(request, 'lora_weights',
230
+ batch_size, batch_index)
231
+ if lora_weights is not None:
232
+ kwargs["weights"] = from_numpy(lora_weights).squeeze()
233
+ lora_config = get_input_tensor_by_name(request, 'lora_config', batch_size,
234
+ batch_index)
235
+ if lora_config is not None:
236
+ kwargs["config"] = from_numpy(lora_config).squeeze()
237
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
238
+ if len(kwargs) > 0:
239
+ return trtllm.LoraConfig(**kwargs)
240
+ return None
241
+
242
+
243
+ def convert_request(request, exclude_input_from_output, decoupled):
244
+ inputs = {}
245
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids')
246
+ if input_token_ids is None:
247
+ raise pb_utils.TritonModelException(
248
+ "A value is required for input_ids")
249
+ if len(input_token_ids.shape) != 2:
250
+ raise pb_utils.TritonModelException(f"Invalid format for input_ids")
251
+ batch_size = input_token_ids.shape[0]
252
+ requests = []
253
+ for batch_index in range(0, batch_size):
254
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids',
255
+ batch_size, batch_index)[0]
256
+ if input_token_ids is None:
257
+ raise pb_utils.TritonModelException(
258
+ "A value is required for input_ids")
259
+ input_token_ids = input_token_ids.tolist()
260
+ if len(input_token_ids) == 0:
261
+ raise pb_utils.TritonModelException(
262
+ f"Invalid format for input_ids")
263
+
264
+ input_length = get_input_scalar_by_name(request, 'input_lengths',
265
+ batch_size, batch_index)
266
+ if input_length is None:
267
+ input_length = len(input_token_ids)
268
+ # Trim input token ids with input_lengths
269
+ inputs['input_token_ids'] = input_token_ids[0:input_length]
270
+
271
+ inputs['max_new_tokens'] = get_input_scalar_by_name(
272
+ request, 'request_output_len', batch_size, batch_index)
273
+ if inputs['max_new_tokens'] is None:
274
+ raise pb_utils.TritonModelException(
275
+ "A value is required for request_output_len")
276
+ inputs['streaming'] = get_input_scalar_by_name(request, 'streaming',
277
+ batch_size, batch_index)
278
+ if inputs['streaming'] and not decoupled:
279
+ raise pb_utils.TritonModelException(
280
+ "Streaming is only supported in decoupled mode.")
281
+ inputs['end_id'] = get_input_scalar_by_name(request, 'end_id',
282
+ batch_size, batch_index)
283
+ inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id',
284
+ batch_size, batch_index)
285
+ inputs['stop_words'] = convert_word_list(
286
+ get_input_tensor_by_name(request, 'stop_words_list', batch_size,
287
+ batch_index))
288
+ inputs['bad_words'] = convert_word_list(
289
+ get_input_tensor_by_name(request, 'bad_words_list', batch_size,
290
+ batch_index))
291
+ embedding_bias = get_input_tensor_by_name(request, 'embedding_bias',
292
+ batch_size, batch_index)
293
+ if embedding_bias is not None and embedding_bias.size != 0:
294
+ inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze()
295
+
296
+ sampling_config = get_sampling_config_from_request(
297
+ request, batch_size, batch_index)
298
+ output_config = get_output_config_from_request(
299
+ request, exclude_input_from_output, batch_size, batch_index)
300
+ external_draft_tokens_config = get_external_draft_tokens_config_from_request(
301
+ request, batch_size, batch_index)
302
+ prompt_tuning_config = get_prompt_tuning_config_from_request(
303
+ request, batch_size, batch_index)
304
+ lora_config = get_lora_config_from_request(request, batch_size,
305
+ batch_index)
306
+
307
+ requests.append(
308
+ trtllm.Request(
309
+ **inputs,
310
+ sampling_config=sampling_config,
311
+ output_config=output_config,
312
+ external_draft_tokens_config=external_draft_tokens_config,
313
+ prompt_tuning_config=prompt_tuning_config,
314
+ lora_config=lora_config,
315
+ ))
316
+ return requests
317
+
318
+
319
+ def convert_response(response, batch_index):
320
+ if response.has_error():
321
+ return pb_utils.InferenceResponse(output_tensors=[],
322
+ error=pb_utils.TritonError(
323
+ response.error_msg)), True
324
+ result = response.result
325
+ beam_lengths = np.expand_dims(
326
+ np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
327
+ max_beam_length = max([len(beam) for beam in result.output_token_ids])
328
+ output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
329
+ -1, np.int32)
330
+ for idx, beam in enumerate(result.output_token_ids):
331
+ output_ids[0, idx, :len(beam)] = beam
332
+ output_tensors = [
333
+ pb_utils.Tensor("output_ids", output_ids),
334
+ pb_utils.Tensor("sequence_length", beam_lengths),
335
+ ]
336
+ output_tensors.append(
337
+ pb_utils.Tensor(
338
+ "cum_log_probs",
339
+ np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)
340
+ if result.cum_log_probs is not None else np.zeros(
341
+ (1, 1), np.float32)))
342
+ output_tensors.append(
343
+ pb_utils.Tensor(
344
+ "output_log_probs",
345
+ np.expand_dims(np.array(result.log_probs, np.float32), 0) if
346
+ result.log_probs is not None else np.zeros((1, 1, 1), np.float32)))
347
+ output_tensors.append(
348
+ pb_utils.Tensor(
349
+ "context_logits",
350
+ np.expand_dims(np.array(result.context_logits, np.float32), 0)
351
+ if result.context_logits is not None else np.zeros(
352
+ (1, 1, 1), np.float32)))
353
+ output_tensors.append(
354
+ pb_utils.Tensor(
355
+ "generation_logits",
356
+ np.expand_dims(np.array(result.generation_logits, np.float32), 0)
357
+ if result.generation_logits is not None else np.zeros(
358
+ (1, 1, 1, 1), np.float32)))
359
+ output_tensors.append(
360
+ pb_utils.Tensor("batch_index",
361
+ np.expand_dims(np.array([batch_index], np.int32), 0)))
362
+
363
+ return pb_utils.InferenceResponse(output_tensors), result.is_final
364
+
365
+
366
+ def convert_scheduler_policy(batch_scheduler_policy: str):
367
+ if batch_scheduler_policy.lower() == "max_utilization":
368
+ return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
369
+ elif batch_scheduler_policy.lower() == "guaranteed_no_evict":
370
+ return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
371
+ raise pb_utils.TritonModelException(
372
+ f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
373
+ )
374
+
375
+
376
+ def convert_batching_type(gpt_model_type: str):
377
+ if gpt_model_type is None:
378
+ return None
379
+ if gpt_model_type.lower(
380
+ ) == "inflight_fused_batching" or gpt_model_type.lower(
381
+ ) == "inflight_batching":
382
+ return trtllm.BatchingType.INFLIGHT
383
+ elif gpt_model_type.lower() == "v1":
384
+ return trtllm.BatchingType.STATIC
385
+ raise pb_utils.TritonModelException(
386
+ f"gpt_model_type value of '{gpt_model_type}' is not supported.")
387
+
388
+
389
+ def convert_decoding_mode(decoding_mode: str):
390
+ if decoding_mode is None:
391
+ return None
392
+ elif decoding_mode == "auto":
393
+ return trtllm.DecodingMode.Auto()
394
+ elif decoding_mode == "top_k":
395
+ return trtllm.DecodingMode.TopK()
396
+ elif decoding_mode == "top_p":
397
+ return trtllm.DecodingMode.TopP()
398
+ elif decoding_mode == "top_k_top_p":
399
+ return trtllm.DecodingMode.TopKTopP()
400
+ elif decoding_mode == "beam_search":
401
+ return trtllm.DecodingMode.BeamSearch()
402
+ elif decoding_mode == "medusa":
403
+ return trtllm.DecodingMode.Medusa()
404
+ raise pb_utils.TritonModelException(
405
+ f"decoding_mode value of '{decoding_mode}' is not supported.")
406
+
407
+
408
+ def convert_timestamp_to_seconds(timestamp: str):
409
+ return int(
410
+ datetime.datetime.strptime(timestamp,
411
+ "%m-%d-%Y %H:%M:%S.%f").timestamp())
412
+
413
+
414
+ class TritonPythonModel:
415
+ """Your Python model must use the same class name. Every Python model
416
+ that is created must have "TritonPythonModel" as the class name.
417
+ """
418
+
419
+ def get_scheduler_config(self, model_config):
420
+ batch_scheduler_policy = get_parameter(model_config,
421
+ "batch_scheduler_policy")
422
+ if batch_scheduler_policy is None:
423
+ return trtllm.SchedulerConfig()
424
+ return trtllm.SchedulerConfig(
425
+ convert_scheduler_policy(batch_scheduler_policy))
426
+
427
+ def get_kv_cache_config(self, model_config):
428
+ kwargs = {
429
+ "enable_block_reuse":
430
+ get_parameter(model_config, "enable_kv_cache_reuse", bool),
431
+ "max_tokens":
432
+ get_parameter(model_config, "max_tokens_in_paged_kv_cache", int),
433
+ "sink_token_length":
434
+ get_parameter(model_config, "sink_token_length", int),
435
+ "free_gpu_memory_fraction":
436
+ get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
437
+ float),
438
+ "host_cache_size":
439
+ get_parameter(model_config, "kv_cache_host_memory_bytes", int),
440
+ "onboard_blocks":
441
+ get_parameter(model_config, "kv_cache_onboard_blocks", bool),
442
+ }
443
+ max_attention_window_size = get_parameter(model_config,
444
+ "max_attention_window_size")
445
+ if max_attention_window_size:
446
+ kwargs["max_attention_window"] = [
447
+ int(x) for x in max_attention_window_size.split(",")
448
+ ]
449
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
450
+ return trtllm.KvCacheConfig(**kwargs)
451
+
452
+ def get_parallel_config(self, model_config):
453
+ kwargs = {}
454
+ gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
455
+ if gpu_device_ids:
456
+ kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
457
+ self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
458
+ "0") == "1"
459
+ if self.use_orchestrator_mode:
460
+ kwargs[
461
+ "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
462
+ worker_path = get_parameter(model_config, "worker_path")
463
+ if worker_path is not None:
464
+ raise pb_utils.TritonModelException(
465
+ "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
466
+ )
467
+ executor_worker_path = get_parameter(model_config,
468
+ "executor_worker_path")
469
+ kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
470
+ True, executor_worker_path)
471
+ if len(kwargs) > 0:
472
+ return trtllm.ParallelConfig(**kwargs)
473
+ return None
474
+
475
+ def get_peft_cache_config(self, model_config):
476
+ kwargs = {
477
+ "optimal_adapter_size":
478
+ get_parameter(model_config, "lora_cache_optimal_adapter_size",
479
+ int),
480
+ "max_adapter_size":
481
+ get_parameter(model_config, "lora_cache_max_adapter_size", int),
482
+ "device_cache_percent":
483
+ get_parameter(model_config, "lora_cache_gpu_memory_fraction",
484
+ float),
485
+ "host_cache_size":
486
+ get_parameter(model_config, "lora_cache_host_memory_bytes", int),
487
+ }
488
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
489
+ return trtllm.PeftCacheConfig(**kwargs)
490
+
491
+ def get_decoding_config(self, model_config):
492
+ kwargs = {
493
+ "medusa_choices":
494
+ parse_medusa_choices(get_parameter(model_config,
495
+ "medusa_choices")),
496
+ "decoding_mode":
497
+ convert_decoding_mode(get_parameter(model_config,
498
+ "decoding_mode")),
499
+ }
500
+ print(kwargs)
501
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
502
+ return trtllm.DecodingConfig(**kwargs)
503
+
504
+ def get_extended_runtime_perf_knob_config(self, model_config):
505
+ kwargs = {
506
+ "multi_block_mode":
507
+ get_parameter(model_config, "multi_block_mode", bool),
508
+ "enable_context_fmha_fp32_acc":
509
+ get_parameter(model_config, "enable_context_fmha_fp32_acc", bool)
510
+ }
511
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
512
+ return trtllm.ExtendedRuntimePerfKnobConfig(**kwargs)
513
+
514
+ def get_executor_config(self, model_config):
515
+ kwargs = {
516
+ "max_beam_width":
517
+ get_parameter(model_config, "max_beam_width", int),
518
+ "scheduler_config":
519
+ self.get_scheduler_config(model_config),
520
+ "kv_cache_config":
521
+ self.get_kv_cache_config(model_config),
522
+ "enable_chunked_context":
523
+ get_parameter(model_config, "enable_chunked_context", bool),
524
+ "normalize_log_probs":
525
+ get_parameter(model_config, "normalize_log_probs", bool),
526
+ "batching_type":
527
+ convert_batching_type(get_parameter(model_config,
528
+ "gpt_model_type")),
529
+ "parallel_config":
530
+ self.get_parallel_config(model_config),
531
+ "peft_cache_config":
532
+ self.get_peft_cache_config(model_config),
533
+ "decoding_config":
534
+ self.get_decoding_config(model_config),
535
+ "max_queue_size":
536
+ model_config.get(
537
+ "dynamic_batching",
538
+ {},
539
+ ).get(
540
+ "default_queue_policy",
541
+ {},
542
+ ).get("max_queue_size"),
543
+ "extended_runtime_perf_knob_config":
544
+ self.get_extended_runtime_perf_knob_config(model_config)
545
+ }
546
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
547
+ return trtllm.ExecutorConfig(**kwargs)
548
+
549
+ def create_metrics(self, model: str, version: str, is_v1_model: bool):
550
+ self.request_metric_family = pb_utils.MetricFamily(
551
+ name="nv_trt_llm_request_metrics",
552
+ description="TRT LLM request metrics",
553
+ kind=pb_utils.MetricFamily.GAUGE,
554
+ )
555
+ self.runtime_memory_metric_family = pb_utils.MetricFamily(
556
+ name="nv_trt_llm_runtime_memory_metrics",
557
+ description="TRT LLM runtime memory metrics",
558
+ kind=pb_utils.MetricFamily.GAUGE,
559
+ )
560
+ self.kv_cache_metric_family = pb_utils.MetricFamily(
561
+ name="nv_trt_llm_kv_cache_block_metrics",
562
+ description="TRT LLM KV cache block metrics",
563
+ kind=pb_utils.MetricFamily.GAUGE,
564
+ )
565
+ model_type = "v1" if is_v1_model else "inflight_batcher"
566
+ self.model_type_metric_family = pb_utils.MetricFamily(
567
+ name=f"nv_trt_llm_{model_type}_metrics",
568
+ description=f"TRT LLM {model_type}-specific metrics",
569
+ kind=pb_utils.MetricFamily.GAUGE,
570
+ )
571
+ self.general_metric_family = pb_utils.MetricFamily(
572
+ name="nv_trt_llm_general_metrics",
573
+ description="General TRT LLM metrics",
574
+ kind=pb_utils.MetricFamily.GAUGE,
575
+ )
576
+ common_labels = {"model": model, "version": version}
577
+ self.all_metrics = {
578
+ # Request metrics
579
+ "num_active_requests":
580
+ self.request_metric_family.Metric(labels={
581
+ "request_type": "active",
582
+ **common_labels
583
+ }),
584
+ "max_num_active_requests":
585
+ self.request_metric_family.Metric(labels={
586
+ "request_type": "max",
587
+ **common_labels
588
+ }),
589
+ "num_scheduled_requests":
590
+ self.request_metric_family.Metric(labels={
591
+ "request_type": "scheduled",
592
+ **common_labels
593
+ }),
594
+ "num_context_requests":
595
+ self.request_metric_family.Metric(labels={
596
+ "request_type": "context",
597
+ **common_labels
598
+ }),
599
+ # Runtime metrics
600
+ "cpu_mem_usage":
601
+ self.runtime_memory_metric_family.Metric(labels={
602
+ "memory_type": "cpu",
603
+ **common_labels
604
+ }),
605
+ "gpu_mem_usage":
606
+ self.runtime_memory_metric_family.Metric(labels={
607
+ "memory_type": "gpu",
608
+ **common_labels
609
+ }),
610
+ "pinned_mem_usage":
611
+ self.runtime_memory_metric_family.Metric(labels={
612
+ "memory_type": "pinned",
613
+ **common_labels
614
+ }),
615
+ # KV cache metrics
616
+ "max_num_blocks":
617
+ self.kv_cache_metric_family.Metric(labels={
618
+ "kv_cache_block_type": "max",
619
+ **common_labels
620
+ }),
621
+ "free_num_blocks":
622
+ self.kv_cache_metric_family.Metric(labels={
623
+ "kv_cache_block_type": "free",
624
+ **common_labels
625
+ }),
626
+ "used_num_blocks":
627
+ self.kv_cache_metric_family.Metric(labels={
628
+ "kv_cache_block_type": "used",
629
+ **common_labels
630
+ }),
631
+ "tokens_per_block":
632
+ self.kv_cache_metric_family.Metric(labels={
633
+ "kv_cache_block_type": "tokens_per",
634
+ **common_labels
635
+ }),
636
+ # General metrics
637
+ "timestamp":
638
+ self.general_metric_family.Metric(labels={
639
+ "general_type": "timestamp",
640
+ **common_labels
641
+ }),
642
+ "iter":
643
+ self.general_metric_family.Metric(labels={
644
+ "general_type": "iteration_counter",
645
+ **common_labels
646
+ }),
647
+ }
648
+ if is_v1_model:
649
+ self.all_metrics.update({
650
+ "num_ctx_tokens":
651
+ self.model_type_metric_family.Metric(labels={
652
+ "v1_specific_metric": "total_context_tokens",
653
+ **common_labels
654
+ }),
655
+ "num_gen_tokens":
656
+ self.model_type_metric_family.Metric(
657
+ labels={
658
+ "v1_specific_metric": "total_generation_tokens",
659
+ **common_labels
660
+ }),
661
+ "empty_gen_slots":
662
+ self.model_type_metric_family.Metric(
663
+ labels={
664
+ "v1_specific_metric": "empty_generation_slots",
665
+ **common_labels
666
+ }),
667
+ })
668
+ else:
669
+ self.all_metrics.update({
670
+ "num_ctx_tokens":
671
+ self.model_type_metric_family.Metric(
672
+ labels={
673
+ "inflight_batcher_specific_metric":
674
+ "total_context_tokens",
675
+ **common_labels
676
+ }),
677
+ "num_gen_requests":
678
+ self.model_type_metric_family.Metric(
679
+ labels={
680
+ "inflight_batcher_specific_metric":
681
+ "generation_requests",
682
+ **common_labels
683
+ }),
684
+ "micro_batch_id":
685
+ self.model_type_metric_family.Metric(
686
+ labels={
687
+ "inflight_batcher_specific_metric": "micro_batch_id",
688
+ **common_labels
689
+ }),
690
+ "num_paused_requests":
691
+ self.model_type_metric_family.Metric(
692
+ labels={
693
+ "inflight_batcher_specific_metric": "paused_requests",
694
+ **common_labels
695
+ }),
696
+ })
697
+
698
+ def initialize(self, args):
699
+ """`initialize` is called only once when the model is being loaded.
700
+ Implementing `initialize` function is optional. This function allows
701
+ the model to initialize any state associated with this model.
702
+
703
+ Parameters
704
+ ----------
705
+ args : dict
706
+ Both keys and values are strings. The dictionary keys and values are:
707
+ * model_config: A JSON string containing the model configuration
708
+ * model_instance_kind: A string containing model instance kind
709
+ * model_instance_device_id: A string containing model instance device ID
710
+ * model_repository: Model repository path
711
+ * model_version: Model version
712
+ * model_name: Model name
713
+ """
714
+ model_config = json.loads(args['model_config'])
715
+ gpt_model_path = get_parameter(model_config, "gpt_model_path")
716
+ if get_parameter(model_config, "enable_trt_overlap", bool):
717
+ raise pb_utils.TritonModelException(
718
+ f"enable_trt_overlap=true is not supported.")
719
+ self.exclude_input_from_output = get_parameter(
720
+ model_config, "exclude_input_in_output", bool)
721
+ executor_config = self.get_executor_config(model_config)
722
+ self.executor = trtllm.Executor(gpt_model_path,
723
+ trtllm.ModelType.DECODER_ONLY,
724
+ executor_config)
725
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
726
+ model_config)
727
+ self.cancellation_check_period_ms = get_parameter(
728
+ model_config, "cancellation_check_period_ms", int) or 100
729
+ self.stats_check_period_ms = get_parameter(
730
+ model_config, "stats_check_period_ms", int) or 100
731
+
732
+ if not self.decoupled:
733
+ raise pb_utils.TritonModelException(
734
+ "Please enable decoupled transaction policy in the model configuration to serve this model"
735
+ )
736
+
737
+ self.create_metrics(args["model_name"],
738
+ args["model_version"],
739
+ is_v1_model=executor_config.batching_type ==
740
+ trtllm.BatchingType.STATIC)
741
+ self.triton_user_id_to_req_ids = {}
742
+ self.triton_req_id_to_req_ids = {}
743
+ self.req_id_to_request_data = {}
744
+ self.lock = Lock()
745
+ self.running = False
746
+ self.awaiter_thread = Thread(target=self.awaiter_loop)
747
+ self.cancellation_thread = Thread(target=self.cancellation_loop)
748
+ self.metrics_thread = Thread(target=self.metrics_loop)
749
+ if self.executor.can_enqueue_requests():
750
+ self.running = True
751
+ self.awaiter_thread.start()
752
+ self.cancellation_thread.start()
753
+ self.metrics_thread.start()
754
+ else:
755
+ # In leader mode, worker ranks will wait here until leader is done.
756
+ self.executor.shutdown()
757
+
758
+ def handle_stop_request(self, triton_user_id, response_sender):
759
+ if triton_user_id is None or triton_user_id == "":
760
+ response_sender.send(
761
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
762
+ "A request id must be provided for request cancellation")),
763
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
764
+ return
765
+
766
+ with self.lock:
767
+ if triton_user_id in self.triton_user_id_to_req_ids:
768
+ req_ids = self.triton_user_id_to_req_ids[triton_user_id]
769
+ for req_id in req_ids:
770
+ self.executor.cancel_request(req_id)
771
+
772
+ response_sender.send(
773
+ pb_utils.InferenceResponse(),
774
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
775
+
776
+ def execute(self, requests):
777
+ """`execute` must be implemented in every Python model. `execute`
778
+ function receives a list of pb_utils.InferenceRequest as the only
779
+ argument. This function is called when an inference is requested
780
+ for this model.
781
+
782
+ Parameters
783
+ ----------
784
+ requests : list
785
+ A list of pb_utils.InferenceRequest
786
+
787
+ Returns
788
+ -------
789
+ list
790
+ A list of pb_utils.InferenceResponse. The length of this list must
791
+ be the same as `requests`
792
+ """
793
+ if not self.executor.can_enqueue_requests():
794
+ return
795
+
796
+ # Convert to executor requests.
797
+
798
+ triton_requests = []
799
+ executor_requests = []
800
+ batch_indices = []
801
+ triton_user_ids = []
802
+ triton_req_ids = []
803
+
804
+ for request in requests:
805
+
806
+ triton_user_id = request.request_id()
807
+
808
+ response_sender = request.get_response_sender()
809
+ stop = get_input_scalar_by_name(request, 'stop')
810
+
811
+ if stop:
812
+ self.handle_stop_request(triton_user_id, response_sender)
813
+ else:
814
+ # Unique request id used to identify each triton request
815
+ triton_req_id = str(randint(0, sys.maxsize))
816
+ self.triton_req_id_to_req_ids[triton_req_id] = set()
817
+ if triton_user_id is not None and triton_user_id != "":
818
+ self.triton_user_id_to_req_ids[triton_user_id] = set()
819
+
820
+ try:
821
+ converted_reqs = convert_request(
822
+ request, self.exclude_input_from_output,
823
+ self.decoupled)
824
+ except Exception as e:
825
+ response_sender.send(
826
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
827
+ f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
828
+ )),
829
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
830
+ else:
831
+ for batch_index, converted_req in enumerate(
832
+ converted_reqs):
833
+ triton_requests.append(request)
834
+ executor_requests.append(converted_req)
835
+ triton_user_ids.append(triton_user_id)
836
+ triton_req_ids.append(triton_req_id)
837
+ batch_indices.append(batch_index)
838
+
839
+ with self.lock:
840
+ request_ids = self.executor.enqueue_requests(executor_requests)
841
+ for req_id, triton_req_id, triton_user_id, triton_request, batch_index in zip(
842
+ request_ids, triton_req_ids, triton_user_ids,
843
+ triton_requests, batch_indices):
844
+ self.req_id_to_request_data[
845
+ req_id] = triton_req_id, triton_user_id, batch_index, triton_request.get_response_sender(
846
+ )
847
+ self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
848
+ if triton_user_id is not None and triton_user_id != "":
849
+ self.triton_user_id_to_req_ids[triton_user_id].add(req_id)
850
+
851
+ return None
852
+
853
+ def awaiter_loop(self):
854
+ """Gets responses from executor and returns the results."""
855
+ while self.running:
856
+ for response in self.executor.await_responses(
857
+ timeout=datetime.timedelta(milliseconds=1)):
858
+ req_id = response.request_id
859
+ with self.lock:
860
+ if req_id not in self.req_id_to_request_data:
861
+ continue
862
+ triton_req_id, triton_user_id, batch_index, response_sender = self.req_id_to_request_data[
863
+ req_id]
864
+
865
+ triton_response, is_final = convert_response(
866
+ response, batch_index)
867
+
868
+ triton_request_final = False
869
+ if is_final:
870
+ with self.lock:
871
+ # Check if all executor requests part of that triton request are finished
872
+ self.triton_req_id_to_req_ids[triton_req_id].remove(
873
+ req_id)
874
+ if len(self.triton_req_id_to_req_ids[triton_req_id]
875
+ ) == 0:
876
+ pb_utils.Logger.log_info(
877
+ f"DELETING Req id {req_id}, triton_req_id {triton_req_id} "
878
+ )
879
+ triton_request_final = True
880
+ del self.triton_req_id_to_req_ids[triton_req_id]
881
+ if triton_user_id is not None and triton_user_id != "":
882
+ del self.triton_user_id_to_req_ids[
883
+ triton_user_id]
884
+ del self.req_id_to_request_data[req_id]
885
+
886
+ response_sender.send(
887
+ triton_response,
888
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
889
+ if triton_request_final else 0)
890
+
891
+ # Remove local reference so response_sender can be cleaned properly.
892
+ del response_sender
893
+
894
+ def cancellation_loop(self):
895
+ """Checks if any pending requests have been cancelled."""
896
+ while self.running:
897
+ time.sleep(self.cancellation_check_period_ms / 1000.0)
898
+ with self.lock:
899
+ for req_id, (triton_req_id, triton_user_id, batch_index,
900
+ response_sender
901
+ ) in self.req_id_to_request_data.items():
902
+ if response_sender.is_cancelled():
903
+ self.executor.cancel_request(req_id)
904
+ # Remove local reference so response_sender can be cleaned properly.
905
+ del response_sender
906
+
907
+ def metrics_loop(self):
908
+ """Updates triton metrics using stats from the executor."""
909
+ while self.running:
910
+ time.sleep(self.stats_check_period_ms / 1000.0)
911
+ for stat in self.executor.get_latest_iteration_stats():
912
+ try:
913
+ for key, metric in self.all_metrics.items():
914
+ value = None
915
+ if hasattr(stat, key):
916
+ value = getattr(stat, key)
917
+ elif stat.kv_cache_stats is not None and hasattr(
918
+ stat.kv_cache_stats, key):
919
+ value = getattr(stat.kv_cache_stats, key)
920
+ elif stat.static_batching_stats is not None and hasattr(
921
+ stat.static_batching_stats, key):
922
+ value = getattr(stat.static_batching_stats, key)
923
+ elif stat.inflight_batching_stats is not None and hasattr(
924
+ stat.inflight_batching_stats, key):
925
+ value = getattr(stat.inflight_batching_stats, key)
926
+ if value is not None:
927
+ if key == "timestamp":
928
+ value = convert_timestamp_to_seconds(value)
929
+ metric.set(value)
930
+ else:
931
+ pb_utils.Logger.log_warn(
932
+ f"Metric \"{key}\" not found.")
933
+ except Exception as e:
934
+ pb_utils.Logger.log_warn(
935
+ f"Error while processing metrics: {e}")
936
+
937
+ def finalize(self):
938
+ """`finalize` is called only once when the model is being unloaded.
939
+ Implementing `finalize` function is optional. This function allows
940
+ the model to perform any necessary clean ups before exit.
941
+ """
942
+ if self.executor.can_enqueue_requests():
943
+ self.running = False
944
+ self.awaiter_thread.join()
945
+ self.cancellation_thread.join()
946
+ self.metrics_thread.join()
947
+ self.executor.shutdown()
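model.py above is essentially a bridge between Triton requests and the TensorRT-LLM executor bindings: initialize() builds a trtllm.Executor, execute() converts each Triton request into trtllm.Request objects, and awaiter_loop() polls await_responses() and forwards the results. The same flow can be exercised without Triton. The sketch below is not part of this repository; it only reuses the executor calls that appear in the file, and the engine path, prompt token ids and sampling values are placeholder assumptions:

import datetime

import tensorrt_llm.bindings.executor as trtllm

# Build an executor for the decoder-only engine, as initialize() does above.
# "tensorrt_llm/1" is assumed to be the engine directory from this repository.
executor = trtllm.Executor("tensorrt_llm/1", trtllm.ModelType.DECODER_ONLY,
                           trtllm.ExecutorConfig())

if executor.can_enqueue_requests():
    # A single request, shaped the way convert_request() builds them.
    # The token ids below are placeholders, not a real tokenized prompt.
    request = trtllm.Request(
        input_token_ids=[1, 2, 3, 4],
        max_new_tokens=16,
        sampling_config=trtllm.SamplingConfig(beam_width=1, temperature=0.7),
        output_config=trtllm.OutputConfig(exclude_input_from_output=True),
    )
    request_ids = executor.enqueue_requests([request])

    # Poll for responses the same way awaiter_loop() does.
    finished = set()
    while len(finished) < len(request_ids):
        for response in executor.await_responses(
                timeout=datetime.timedelta(milliseconds=1)):
            if response.has_error():
                raise RuntimeError(response.error_msg)
            result = response.result
            if result.is_final:
                finished.add(response.request_id)
                print(result.output_token_ids)

executor.shutdown()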
tensorrt_llm/1/rank0.engine ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b8418460f6786395ac4ace17e6dafad6c2b60a021fb247da853718db2c4fd13
3
+ size 1065214420
tensorrt_llm/1/rank1.engine ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40598916cbd21bcfa434ae02004e9e8b1d6f50445a3f5bdd4bb3971634072cf
3
+ size 1065215172
tensorrt_llm/config.pbtxt ADDED
@@ -0,0 +1,556 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm"
28
+ backend: "tensorrtllm"
29
+ max_batch_size: 32
30
+
31
+ model_transaction_policy {
32
+ decoupled: True
33
+ }
34
+
35
+ input [
36
+ {
37
+ name: "input_ids"
38
+ data_type: TYPE_INT32
39
+ dims: [ -1 ]
40
+ allow_ragged_batch: true
41
+ },
42
+ {
43
+ name: "input_lengths"
44
+ data_type: TYPE_INT32
45
+ dims: [ 1 ]
46
+ reshape: { shape: [ ] }
47
+ },
48
+ {
49
+ name: "request_output_len"
50
+ data_type: TYPE_INT32
51
+ dims: [ 1 ]
52
+ reshape: { shape: [ ] }
53
+ },
54
+ {
55
+ name: "draft_input_ids"
56
+ data_type: TYPE_INT32
57
+ dims: [ -1 ]
58
+ optional: true
59
+ allow_ragged_batch: true
60
+ },
61
+ {
62
+ name: "decoder_input_ids"
63
+ data_type: TYPE_INT32
64
+ dims: [ -1 ]
65
+ optional: true
66
+ allow_ragged_batch: true
67
+ },
68
+ {
69
+ name: "decoder_input_lengths"
70
+ data_type: TYPE_INT32
71
+ dims: [ 1 ]
72
+ optional: true
73
+ reshape: { shape: [ ] }
74
+ },
75
+ {
76
+ name: "draft_logits"
77
+ data_type: TYPE_FP32
78
+ dims: [ -1, -1 ]
79
+ optional: true
80
+ allow_ragged_batch: true
81
+ },
82
+ {
83
+ name: "draft_acceptance_threshold"
84
+ data_type: TYPE_FP32
85
+ dims: [ 1 ]
86
+ reshape: { shape: [ ] }
87
+ optional: true
88
+ },
89
+ {
90
+ name: "end_id"
91
+ data_type: TYPE_INT32
92
+ dims: [ 1 ]
93
+ reshape: { shape: [ ] }
94
+ optional: true
95
+ },
96
+ {
97
+ name: "pad_id"
98
+ data_type: TYPE_INT32
99
+ dims: [ 1 ]
100
+ reshape: { shape: [ ] }
101
+ optional: true
102
+ },
103
+ {
104
+ name: "stop_words_list"
105
+ data_type: TYPE_INT32
106
+ dims: [ 2, -1 ]
107
+ optional: true
108
+ allow_ragged_batch: true
109
+ },
110
+ {
111
+ name: "bad_words_list"
112
+ data_type: TYPE_INT32
113
+ dims: [ 2, -1 ]
114
+ optional: true
115
+ allow_ragged_batch: true
116
+ },
117
+ {
118
+ name: "embedding_bias"
119
+ data_type: TYPE_FP32
120
+ dims: [ -1 ]
121
+ optional: true
122
+ allow_ragged_batch: true
123
+ },
124
+ {
125
+ name: "beam_width"
126
+ data_type: TYPE_INT32
127
+ dims: [ 1 ]
128
+ reshape: { shape: [ ] }
129
+ optional: true
130
+ },
131
+ {
132
+ name: "temperature"
133
+ data_type: TYPE_FP32
134
+ dims: [ 1 ]
135
+ reshape: { shape: [ ] }
136
+ optional: true
137
+ },
138
+ {
139
+ name: "runtime_top_k"
140
+ data_type: TYPE_INT32
141
+ dims: [ 1 ]
142
+ reshape: { shape: [ ] }
143
+ optional: true
144
+ },
145
+ {
146
+ name: "runtime_top_p"
147
+ data_type: TYPE_FP32
148
+ dims: [ 1 ]
149
+ reshape: { shape: [ ] }
150
+ optional: true
151
+ },
152
+ {
153
+ name: "runtime_top_p_min"
154
+ data_type: TYPE_FP32
155
+ dims: [ 1 ]
156
+ reshape: { shape: [ ] }
157
+ optional: true
158
+ },
159
+ {
160
+ name: "runtime_top_p_decay"
161
+ data_type: TYPE_FP32
162
+ dims: [ 1 ]
163
+ reshape: { shape: [ ] }
164
+ optional: true
165
+ },
166
+ {
167
+ name: "runtime_top_p_reset_ids"
168
+ data_type: TYPE_INT32
169
+ dims: [ 1 ]
170
+ reshape: { shape: [ ] }
171
+ optional: true
172
+ },
173
+ {
174
+ name: "len_penalty"
175
+ data_type: TYPE_FP32
176
+ dims: [ 1 ]
177
+ reshape: { shape: [ ] }
178
+ optional: true
179
+ },
180
+ {
181
+ name: "early_stopping"
182
+ data_type: TYPE_BOOL
183
+ dims: [ 1 ]
184
+ reshape: { shape: [ ] }
185
+ optional: true
186
+ },
187
+ {
188
+ name: "repetition_penalty"
189
+ data_type: TYPE_FP32
190
+ dims: [ 1 ]
191
+ reshape: { shape: [ ] }
192
+ optional: true
193
+ },
194
+ {
195
+ name: "min_length"
196
+ data_type: TYPE_INT32
197
+ dims: [ 1 ]
198
+ reshape: { shape: [ ] }
199
+ optional: true
200
+ },
201
+ {
202
+ name: "beam_search_diversity_rate"
203
+ data_type: TYPE_FP32
204
+ dims: [ 1 ]
205
+ reshape: { shape: [ ] }
206
+ optional: true
207
+ },
208
+ {
209
+ name: "presence_penalty"
210
+ data_type: TYPE_FP32
211
+ dims: [ 1 ]
212
+ reshape: { shape: [ ] }
213
+ optional: true
214
+ },
215
+ {
216
+ name: "frequency_penalty"
217
+ data_type: TYPE_FP32
218
+ dims: [ 1 ]
219
+ reshape: { shape: [ ] }
220
+ optional: true
221
+ },
222
+ {
223
+ name: "random_seed"
224
+ data_type: TYPE_UINT64
225
+ dims: [ 1 ]
226
+ reshape: { shape: [ ] }
227
+ optional: true
228
+ },
229
+ {
230
+ name: "return_log_probs"
231
+ data_type: TYPE_BOOL
232
+ dims: [ 1 ]
233
+ reshape: { shape: [ ] }
234
+ optional: true
235
+ },
236
+ {
237
+ name: "return_context_logits"
238
+ data_type: TYPE_BOOL
239
+ dims: [ 1 ]
240
+ reshape: { shape: [ ] }
241
+ optional: true
242
+ },
243
+ {
244
+ name: "return_generation_logits"
245
+ data_type: TYPE_BOOL
246
+ dims: [ 1 ]
247
+ reshape: { shape: [ ] }
248
+ optional: true
249
+ },
250
+ {
251
+ name: "stop"
252
+ data_type: TYPE_BOOL
253
+ dims: [ 1 ]
254
+ reshape: { shape: [ ] }
255
+ optional: true
256
+ },
257
+ {
258
+ name: "streaming"
259
+ data_type: TYPE_BOOL
260
+ dims: [ 1 ]
261
+ reshape: { shape: [ ] }
262
+ optional: true
263
+ },
264
+ {
265
+ name: "prompt_embedding_table"
266
+ data_type: TYPE_FP16
267
+ dims: [ -1, -1 ]
268
+ optional: true
269
+ allow_ragged_batch: true
270
+ },
271
+ {
272
+ name: "prompt_vocab_size"
273
+ data_type: TYPE_INT32
274
+ dims: [ 1 ]
275
+ reshape: { shape: [ ] }
276
+ optional: true
277
+ },
278
+ # the unique task ID for the given LoRA.
279
+ # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
280
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
281
+ # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
282
+ {
283
+ name: "lora_task_id"
284
+ data_type: TYPE_UINT64
285
+ dims: [ 1 ]
286
+ reshape: { shape: [ ] }
287
+ optional: true
288
+ },
289
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
290
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
291
+ # each of the in / out tensors are first flattened and then concatenated together in the format above.
292
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
293
+ {
294
+ name: "lora_weights"
295
+ data_type: TYPE_FP16
296
+ dims: [ -1, -1 ]
297
+ optional: true
298
+ allow_ragged_batch: true
299
+ },
300
+ # module identifier (same size as the first dimension of lora_weights)
301
+ # See LoraModule::ModuleType for model id mapping
302
+ #
303
+ # "attn_qkv": 0 # compbined qkv adapter
304
+ # "attn_q": 1 # q adapter
305
+ # "attn_k": 2 # k adapter
306
+ # "attn_v": 3 # v adapter
307
+ # "attn_dense": 4 # adapter for the dense layer in attention
308
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
309
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
310
+ # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
311
+ #
312
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
313
+ {
314
+ name: "lora_config"
315
+ data_type: TYPE_INT32
316
+ dims: [ -1, 3 ]
317
+ optional: true
318
+ allow_ragged_batch: true
319
+ }
320
+ ]
321
+ output [
322
+ {
323
+ name: "output_ids"
324
+ data_type: TYPE_INT32
325
+ dims: [ -1, -1 ]
326
+ },
327
+ {
328
+ name: "sequence_length"
329
+ data_type: TYPE_INT32
330
+ dims: [ -1 ]
331
+ },
332
+ {
333
+ name: "cum_log_probs"
334
+ data_type: TYPE_FP32
335
+ dims: [ -1 ]
336
+ },
337
+ {
338
+ name: "output_log_probs"
339
+ data_type: TYPE_FP32
340
+ dims: [ -1, -1 ]
341
+ },
342
+ {
343
+ name: "context_logits"
344
+ data_type: TYPE_FP32
345
+ dims: [ -1, -1 ]
346
+ },
347
+ {
348
+ name: "generation_logits"
349
+ data_type: TYPE_FP32
350
+ dims: [ -1, -1, -1 ]
351
+ },
352
+ {
353
+ name: "batch_index"
354
+ data_type: TYPE_INT32
355
+ dims: [ 1 ]
356
+ }
357
+ ]
358
+ instance_group [
359
+ {
360
+ count: 1
361
+ kind : KIND_CPU
362
+ }
363
+ ]
364
+ parameters: {
365
+ key: "max_beam_width"
366
+ value: {
367
+ string_value: "1"
368
+ }
369
+ }
370
+ parameters: {
371
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
372
+ value: {
373
+ string_value: "no"
374
+ }
375
+ }
376
+ parameters: {
377
+ key: "gpt_model_type"
378
+ value: {
379
+ string_value: "inflight_fused_batching"
380
+ }
381
+ }
382
+ parameters: {
383
+ key: "gpt_model_path"
384
+ value: {
385
+ string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
386
+ }
387
+ }
388
+ parameters: {
389
+ key: "encoder_model_path"
390
+ value: {
391
+ string_value: "${encoder_engine_dir}"
392
+ }
393
+ }
394
+ parameters: {
395
+ key: "max_tokens_in_paged_kv_cache"
396
+ value: {
397
+ string_value: "${max_tokens_in_paged_kv_cache}"
398
+ }
399
+ }
400
+ parameters: {
401
+ key: "max_attention_window_size"
402
+ value: {
403
+ string_value: "${max_attention_window_size}"
404
+ }
405
+ }
406
+ parameters: {
407
+ key: "sink_token_length"
408
+ value: {
409
+ string_value: "${sink_token_length}"
410
+ }
411
+ }
412
+ parameters: {
413
+ key: "batch_scheduler_policy"
414
+ value: {
415
+ string_value: "guaranteed_no_evict"
416
+ }
417
+ }
418
+ parameters: {
419
+ key: "kv_cache_free_gpu_mem_fraction"
420
+ value: {
421
+ string_value: "0.1"
422
+ }
423
+ }
424
+ parameters: {
425
+ key: "kv_cache_host_memory_bytes"
426
+ value: {
427
+ string_value: "${kv_cache_host_memory_bytes}"
428
+ }
429
+ }
430
+ parameters: {
431
+ key: "kv_cache_onboard_blocks"
432
+ value: {
433
+ string_value: "${kv_cache_onboard_blocks}"
434
+ }
435
+ }
436
+ # enable_trt_overlap is deprecated and doesn't have any effect on the runtime
437
+ # parameters: {
438
+ # key: "enable_trt_overlap"
439
+ # value: {
440
+ # string_value: "${enable_trt_overlap}"
441
+ # }
442
+ # }
443
+ parameters: {
444
+ key: "exclude_input_in_output"
445
+ value: {
446
+ string_value: "True"
447
+ }
448
+ }
449
+ parameters: {
450
+ key: "cancellation_check_period_ms"
451
+ value: {
452
+ string_value: "${cancellation_check_period_ms}"
453
+ }
454
+ }
455
+ parameters: {
456
+ key: "stats_check_period_ms"
457
+ value: {
458
+ string_value: "${stats_check_period_ms}"
459
+ }
460
+ }
461
+ parameters: {
462
+ key: "iter_stats_max_iterations"
463
+ value: {
464
+ string_value: "${iter_stats_max_iterations}"
465
+ }
466
+ }
467
+ parameters: {
468
+ key: "request_stats_max_iterations"
469
+ value: {
470
+ string_value: "${request_stats_max_iterations}"
471
+ }
472
+ }
473
+ parameters: {
474
+ key: "enable_kv_cache_reuse"
475
+ value: {
476
+ string_value: "True"
477
+ }
478
+ }
479
+ parameters: {
480
+ key: "normalize_log_probs"
481
+ value: {
482
+ string_value: "${normalize_log_probs}"
483
+ }
484
+ }
485
+ parameters: {
486
+ key: "enable_chunked_context"
487
+ value: {
488
+ string_value: "${enable_chunked_context}"
489
+ }
490
+ }
491
+ parameters: {
492
+ key: "gpu_device_ids"
493
+ value: {
494
+ string_value: "0,1"
495
+ }
496
+ }
497
+ parameters: {
498
+ key: "lora_cache_optimal_adapter_size"
499
+ value: {
500
+ string_value: "${lora_cache_optimal_adapter_size}"
501
+ }
502
+ }
503
+ parameters: {
504
+ key: "lora_cache_max_adapter_size"
505
+ value: {
506
+ string_value: "${lora_cache_max_adapter_size}"
507
+ }
508
+ }
509
+ parameters: {
510
+ key: "lora_cache_gpu_memory_fraction"
511
+ value: {
512
+ string_value: "${lora_cache_gpu_memory_fraction}"
513
+ }
514
+ }
515
+ parameters: {
516
+ key: "lora_cache_host_memory_bytes"
517
+ value: {
518
+ string_value: "${lora_cache_host_memory_bytes}"
519
+ }
520
+ }
521
+ parameters: {
522
+ key: "decoding_mode"
523
+ value: {
524
+ string_value: "top_k_top_p"
525
+ }
526
+ }
527
+ parameters: {
528
+ key: "executor_worker_path"
529
+ value: {
530
+ string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
531
+ }
532
+ }
533
+ parameters: {
534
+ key: "medusa_choices"
535
+ value: {
536
+ string_value: "${medusa_choices}"
537
+ }
538
+ }
539
+ parameters: {
540
+ key: "gpu_weights_percent"
541
+ value: {
542
+ string_value: "${gpu_weights_percent}"
543
+ }
544
+ }
545
+ parameters: {
546
+ key: "enable_context_fmha_fp32_acc"
547
+ value: {
548
+ string_value: "${enable_context_fmha_fp32_acc}"
549
+ }
550
+ }
551
+ parameters: {
552
+ key: "multi_block_mode"
553
+ value: {
554
+ string_value: "${multi_block_mode}"
555
+ }
556
+ }
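Because model_transaction_policy sets decoupled: True, this tensorrt_llm model has to be driven through Triton's streaming gRPC API rather than a plain synchronous infer call. The client-side sketch below is not part of this commit: it assumes the tritonclient[grpc] package and a server on localhost:8001, and uses placeholder token ids (in practice requests normally go through the BLS model below, which handles pre- and post-processing); only the tensor names and dtypes come from the config above.

from functools import partial
import queue

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

def make_input(name, data):
    tensor = grpcclient.InferInput(name, list(data.shape),
                                   np_to_triton_dtype(data.dtype))
    tensor.set_data_from_numpy(data)
    return tensor

responses = queue.Queue()

def callback(result_queue, result, error):
    # Every streamed response (or error) from the decoupled model lands here.
    result_queue.put(error if error is not None else result)

# Placeholder token ids; a real client would tokenize its prompt first.
input_ids = np.array([[1, 2, 3, 4]], dtype=np.int32)
inputs = [
    make_input("input_ids", input_ids),
    make_input("input_lengths", np.array([[input_ids.shape[1]]], dtype=np.int32)),
    make_input("request_output_len", np.array([[16]], dtype=np.int32)),
    make_input("streaming", np.array([[True]], dtype=bool)),
]

client = grpcclient.InferenceServerClient("localhost:8001")
client.start_stream(callback=partial(callback, responses))
client.async_stream_infer("tensorrt_llm", inputs)

# A real client would keep reading until the final-response flag; this sketch
# only looks at the first streamed chunk.
first = responses.get()
if not isinstance(first, Exception):
    print(first.as_numpy("output_ids"))

client.stop_stream()
client.close()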
tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (3.15 kB).
 
tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc ADDED
Binary file (10.3 kB).
 
tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc ADDED
Binary file (11.5 kB).
 
tensorrt_llm_bls/1/lib/decode.py ADDED
@@ -0,0 +1,386 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ from collections.abc import Generator
28
+ from dataclasses import dataclass
29
+ from typing import Optional
30
+
31
+ import numpy as np
32
+ import torch
33
+
34
+
35
+ class RequestValidationError(Exception):
36
+ pass
37
+
38
+
39
+ def _validate_that(condition: bool, msg: str):
40
+ if not condition:
41
+ raise RequestValidationError(msg)
42
+
43
+
44
+ def _validate_non_empty(data, msg: str):
45
+ if isinstance(data, torch.Tensor):
46
+ _validate_that(data is not None and data.numel() > 0, msg)
47
+ else:
48
+ _validate_that(data is not None and data.size > 0, msg)
49
+
50
+
51
+ def _validate_single_gt_0(data, msg: str):
52
+ _validate_non_empty(data, msg)
53
+ _validate_that(data.flatten()[0] > 0, msg)
54
+
55
+
56
+ def _single_value(data: Optional[np.ndarray]):
57
+ if data is None:
58
+ return None
59
+ return data.flatten()[0]
60
+
61
+
62
+ @dataclass
63
+ class Request:
64
+ text_input: np.ndarray = np.array([])
65
+ decoder_text_input: np.ndarray = None
66
+ image_input: Optional[np.ndarray] = None
67
+ max_tokens: Optional[np.ndarray] = None
68
+ bad_words: Optional[np.ndarray] = None
69
+ stop_words: Optional[np.ndarray] = None
70
+ end_id: Optional[np.ndarray] = None
71
+ pad_id: Optional[np.ndarray] = None
72
+ top_k: Optional[np.ndarray] = None
73
+ top_p: Optional[np.ndarray] = None
74
+ temperature: Optional[np.ndarray] = None
75
+ length_penalty: Optional[np.ndarray] = None
76
+ repetition_penalty: Optional[np.ndarray] = None
77
+ min_length: Optional[np.ndarray] = None
78
+ return_log_probs: Optional[np.ndarray] = None
79
+ prompt_embedding_table: Optional[np.ndarray] = None
80
+ prompt_vocab_size: Optional[np.ndarray] = None
81
+ embedding_bias_words: Optional[np.ndarray] = None
82
+ embedding_bias_weights: Optional[np.ndarray] = None
83
+ num_draft_tokens: Optional[np.ndarray] = None
84
+ use_draft_logits: Optional[np.ndarray] = None
85
+ stream: Optional[np.ndarray] = None
86
+ beam_width: Optional[np.ndarray] = None
87
+ return_context_logits: Optional[np.ndarray] = None
88
+ return_generation_logits: Optional[np.ndarray] = None
89
+ random_seed: Optional[np.ndarray] = None
90
+ presence_penalty: Optional[np.ndarray] = None
91
+ frequency_penalty: Optional[np.ndarray] = None
92
+
93
+ def validate(self):
94
+ _validate_non_empty(self.text_input, "text_input is required")
95
+ _validate_single_gt_0(self.max_tokens,
96
+ "max_tokens must be a single value > 0")
97
+
98
+ num_draft_tokens = _single_value(self.num_draft_tokens)
99
+ _single_value(self.return_generation_logits)
100
+ context_logits = _single_value(self.return_context_logits)
101
+
102
+ if num_draft_tokens:
103
+ _validate_that(
104
+ not self.stream.any(),
105
+ "streaming is not supported with speculative decoding")
106
+ _validate_that(
107
+ not context_logits,
108
+ "context logits are not supported with speculative decoding")
109
+
110
+
111
+ @dataclass
112
+ class DraftRequest:
113
+ draft_input_ids: Optional[np.ndarray] = None
114
+ draft_logits: Optional[np.ndarray] = None
115
+
116
+
117
+ @dataclass
118
+ class PreprocResponse:
119
+ input_ids: np.ndarray = np.array([])
120
+ decoder_input_ids: np.ndarray = None
121
+ input_lengths: np.ndarray = np.array([])
122
+ decoder_input_lengths: np.ndarray = None
123
+ bad_words_list: Optional[np.ndarray] = None
124
+ stop_words_list: Optional[np.ndarray] = None
125
+ embedding_bias: Optional[np.ndarray] = None
126
+ end_id: Optional[np.ndarray] = None
127
+ pad_id: Optional[np.ndarray] = None
128
+
129
+ @classmethod
130
+ def with_new_inputs(cls,
131
+ other,
132
+ input_ids: Optional[np.ndarray] = None,
133
+ input_lengths: Optional[np.ndarray] = None):
134
+ return cls(input_ids=(input_ids
135
+ if input_ids is not None else other.input_ids),
136
+ input_lengths=(input_lengths if input_lengths is not None
137
+ else other.input_lengths),
138
+ decoder_input_ids=other.decoder_input_ids,
139
+ decoder_input_lengths=other.decoder_input_lengths,
140
+ bad_words_list=other.bad_words_list,
141
+ stop_words_list=other.stop_words_list,
142
+ end_id=other.end_id,
143
+ pad_id=other.pad_id)
144
+
145
+
146
+ @dataclass
147
+ class MultimodalEncResponse:
148
+ prompt_embedding_table: Optional[torch.Tensor] = None
149
+ prompt_vocab_size: Optional[np.ndarray] = None
150
+
151
+
152
+ @dataclass
153
+ class GenerationResponse:
154
+ output_ids: np.ndarray = np.array([])
155
+ sequence_length: np.ndarray = np.array([])
156
+ cum_log_probs: Optional[np.ndarray] = None
157
+ output_log_probs: Optional[np.ndarray] = None
158
+ context_logits: Optional[np.ndarray] = None
159
+ generation_logits: Optional[np.ndarray] = None
160
+ batch_index: Optional[np.ndarray] = None
161
+
162
+
163
+ @dataclass
164
+ class Response:
165
+ text_output: np.ndarray = np.array([])
166
+ cum_log_probs: Optional[np.ndarray] = None
167
+ output_log_probs: Optional[np.ndarray] = None
168
+ context_logits: Optional[np.ndarray] = None
169
+ generation_logits: Optional[np.ndarray] = None
170
+ batch_index: Optional[np.ndarray] = None
171
+
172
+ def __eq__(self, o) -> bool:
173
+ """Just for testing"""
174
+ if not isinstance(o, Response):
175
+ return False
176
+ return (np.array_equal(self.text_output, o.text_output)
177
+ and np.array_equal(self.cum_log_probs, o.cum_log_probs)
178
+ and np.array_equal(self.output_log_probs, o.output_log_probs)
179
+ and np.array_equal(self.context_logits, o.context_logits)
180
+ and np.array_equal(self.generation_logits, o.generation_logits)
181
+ and np.array_equal(self.batch_index, o.batch_index))
182
+
183
+
184
+ class Decoder:
185
+
186
+ def __init__(self, streaming=False, accumulate=False):
187
+ self._streaming = streaming
188
+ self._accumulate = accumulate
189
+
190
+ self._accumulated_tokens = []
191
+
192
+ def decode(self,
193
+ request: Request,
194
+ speculative_decoding=False,
195
+ is_multimodal=False) -> Generator[Response, None, None]:
196
+
197
+ batch_size = request.text_input.shape[0]
198
+ self._accumulated_tokens = [None] * batch_size
199
+ preproc_response = self.preprocess(request)
200
+
201
+ multimodal_enc_response = None
202
+ if is_multimodal:
203
+ multimodal_enc_response = self._multimodal_enc_generate(request)
204
+
205
+ if speculative_decoding:
206
+ if batch_size > 1:
207
+ raise Exception(
208
+ "speculative decoding is not supported with batch size > 1"
209
+ )
210
+ for gen_response in self._spec_generate(preproc_response, request):
211
+ yield self.postprocess(gen_response, batch_size)
212
+ else:
213
+ if not self._streaming and batch_size == 1:
214
+ gen_response = self._generate_non_streaming(
215
+ preproc_response,
216
+ request,
217
+ multimodal_enc_response=multimodal_enc_response)
218
+ yield self.postprocess(gen_response, batch_size)
219
+ else:
220
+ for gen_response in self._generate(
221
+ preproc_response,
222
+ request,
223
+ multimodal_enc_response=multimodal_enc_response):
224
+ yield self.postprocess(gen_response, batch_size)
225
+
226
+ def encountered_stop_words(self, input_ids, stop_words_ids):
227
+ for stop_word_ids in stop_words_ids:
228
+ if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids):
229
+ return True
230
+ return False
231
+
232
+ def _spec_generate(
233
+ self, preproc: PreprocResponse,
234
+ request: Request) -> Generator[GenerationResponse, None, None]:
235
+
236
+ if preproc.input_ids.shape[0] > 1:
237
+ raise Exception(
238
+ "Speculative decoding does not support batch size > 1.")
239
+
240
+ prompt_input_ids: np.ndarray = preproc.input_ids[0]
241
+ input_ids: np.ndarray = prompt_input_ids
242
+ output_len: int = request.max_tokens[0][0]
243
+ last_input_ids: np.ndarray = None
244
+ draft_output_ids: np.ndarray = None
245
+ draft_logits: np.ndarray = None
246
+
247
+ target_response: GenerationResponse = None
248
+
249
+ cur_preproc = preproc
250
+
251
+ counter = 0
252
+ while True:
253
+ counter += 1
254
+ num_draft_tokens = min(
255
+ request.num_draft_tokens[0][0],
256
+ len(prompt_input_ids) + output_len - len(input_ids) - 1)
257
+
258
+ draft_request = None
259
+ if num_draft_tokens > 0:
260
+ draft_response: GenerationResponse = self._draft_generate_non_streaming(
261
+ cur_preproc, request, num_draft_tokens)
262
+ seq_len: int = draft_response.sequence_length[0][0]
263
+ # [1, beamWidth, outputLength] -> [outputLen]
264
+ draft_output_ids = draft_response.output_ids[0][0]
265
+ # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded]
266
+ if request.use_draft_logits is not None and request.use_draft_logits[
267
+ 0]:
268
+ if draft_response.generation_logits is not None:
269
+ draft_logits = draft_response.generation_logits[0][0]
270
+
271
+ input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
272
+ draft_request = DraftRequest(
273
+ draft_input_ids=np.expand_dims(input_draft_tokens, 0))
274
+ if request.use_draft_logits is not None and request.use_draft_logits[
275
+ 0]:
276
+ draft_request.draft_logits = np.expand_dims(
277
+ draft_logits[-len(input_draft_tokens):], 0)
278
+ else:
279
+ draft_request = DraftRequest()
280
+ target_response = self._generate_non_streaming(
281
+ cur_preproc, request, draft_request)
282
+ last_input_ids = input_ids
283
+ input_ids = target_response.output_ids[0][0]
284
+ cur_preproc = PreprocResponse.with_new_inputs(
285
+ cur_preproc, np.expand_dims(input_ids, 0),
286
+ np.array([[len(input_ids)]], dtype=np.int32))
287
+
288
+ # Evaluate criteria to stop generation loop.
289
+ # If we've hit or exceeded the max output length, should stop
290
+ length_stop = (len(input_ids) >=
291
+ len(prompt_input_ids) + output_len)
292
+ if length_stop:
293
+ break
294
+ # If the draft and target outputs are the same, stop; normally the target returns one more token.
295
+ # If they are the same length, they should differ at the last token
296
+ target_draft_equal = draft_output_ids is not None and np.array_equal(
297
+ draft_output_ids, input_ids)
298
+ if target_draft_equal:
299
+ break
300
+ # If the tokens no longer change, stop: early stopping has been reached
301
+ last_current_equal = np.array_equal(last_input_ids, input_ids)
302
+ if last_current_equal:
303
+ break
304
+ # Check whether any of the stop words were encountered
305
+ hit_stop_words = self.encountered_stop_words(
306
+ input_ids, preproc.stop_words_list[0])
307
+ if hit_stop_words:
308
+ break
309
+
310
+ yield target_response
311
+
312
+ def _draft_generate_non_streaming(
313
+ self, preproc: PreprocResponse, request: Request,
314
+ num_draft_tokens: int) -> GenerationResponse:
315
+ raise NotImplementedError()
316
+
317
+ def _multimodal_enc_generate(
318
+ self,
319
+ request: Request,
320
+ ) -> MultimodalEncResponse:
321
+ raise NotImplementedError()
322
+
323
+ def _generate(
324
+ self,
325
+ preproc: PreprocResponse,
326
+ request: Request,
327
+ draft_request: Optional[DraftRequest] = None,
328
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None,
329
+ ) -> Generator[GenerationResponse, None, None]:
330
+ raise NotImplementedError()
331
+
332
+ def _generate_non_streaming(
333
+ self,
334
+ preproc: PreprocResponse,
335
+ request: Request,
336
+ draft_request: Optional[DraftRequest] = None,
337
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None,
338
+ ) -> GenerationResponse:
339
+ raise NotImplementedError()
340
+
341
+ def postprocess(self, gen_response: GenerationResponse,
342
+ batch_size) -> Response:
343
+ if self._accumulate and self._streaming:
344
+ new_tokens: np.ndarray = gen_response.output_ids
345
+ if new_tokens.ndim != 3:
346
+ raise Exception("Expected output_ids tensor to have 3 dims.")
347
+ if new_tokens.shape[0] != 1:
348
+ raise Exception("Expected batch size of 1")
349
+ if new_tokens.shape[1] != 1:
350
+ raise Exception(
351
+ "Accumulation of tokens is only implemented for beam width = 1"
352
+ )
353
+
354
+ batch_index = gen_response.batch_index
355
+ if batch_index.ndim != 2:
356
+ raise Exception("Expected batch_index tensor to have 2 dims.")
357
+ if batch_index.shape[0] != 1:
358
+ raise Exception("Expected batch size of 1")
359
+ if batch_index.shape[1] != 1:
360
+ raise Exception("Expected only one batch_index")
361
+
362
+ batch_index = batch_index[0][0]
363
+
364
+ self._accumulated_tokens[batch_index] = new_tokens if (
365
+ self._accumulated_tokens[batch_index] is None
366
+ ) else np.concatenate(
367
+ (self._accumulated_tokens[batch_index], new_tokens), axis=2)
368
+ sequence_lengths = np.array(
369
+ [[self._accumulated_tokens[batch_index].shape[2]]],
370
+ dtype=np.int32)
371
+ return self._postprocess(self._accumulated_tokens[batch_index],
372
+ sequence_lengths, gen_response)
373
+ else:
374
+ return self._postprocess(gen_response.output_ids, None,
375
+ gen_response)
376
+
377
+ def _postprocess(self, tokens: np.ndarray,
378
+ sequence_lengths: Optional[np.ndarray],
379
+ gen_response: GenerationResponse) -> Response:
380
+ raise NotImplementedError()
381
+
382
+ def preprocess(self, request: Request) -> PreprocResponse:
383
+ raise NotImplementedError()
384
+
385
+ def reset_decoder(self):
386
+ self._accumulated_tokens = []
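A quick orientation note, not part of the committed file: the Request dataclass above expects batched numpy inputs of shape [batch_size, 1], and validate() raises RequestValidationError when a constraint is violated. A minimal sketch, assuming decode.py is importable as lib.decode (the same path triton_decoder.py uses below):

import numpy as np

from lib.decode import Request, RequestValidationError

# A well-formed single-request batch: non-empty text_input and max_tokens > 0.
ok = Request(
    text_input=np.array([["What is Triton Inference Server?"]], dtype=object),
    max_tokens=np.array([[64]], dtype=np.int32),
)
ok.validate()  # passes

# max_tokens of 0 trips _validate_single_gt_0.
bad = Request(
    text_input=np.array([["hello"]], dtype=object),
    max_tokens=np.array([[0]], dtype=np.int32),
)
try:
    bad.validate()
except RequestValidationError as err:
    print(err)  # "max_tokens must be a single value > 0"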
tensorrt_llm_bls/1/lib/triton_decoder.py ADDED
@@ -0,0 +1,523 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ from collections.abc import Callable
28
+ from typing import Dict, Optional
29
+
30
+ import numpy as np
31
+ import triton_python_backend_utils as pb_utils
32
+ from lib.decode import *
33
+ from torch.utils.dlpack import from_dlpack, to_dlpack
34
+ from typing_extensions import override
35
+
36
+
37
+ class TritonDecoder(Decoder):
38
+
39
+ def __init__(self,
40
+ streaming=False,
41
+ accumulate=False,
42
+ preproc_model_name="preprocessing",
43
+ postproc_model_name="postprocessing",
44
+ llm_model_name="tensorrt_llm",
45
+ draft_llm_model_name: Optional[str] = None,
46
+ multimodal_encoders_name: Optional[str] = None):
47
+ super().__init__(streaming=streaming, accumulate=accumulate)
48
+ self.preproc_model_name = preproc_model_name
49
+ self.postproc_model_name = postproc_model_name
50
+ self.llm_model_name = llm_model_name
51
+ self.draft_llm_model_name = draft_llm_model_name
52
+ self.multimodal_encoders_name = multimodal_encoders_name
53
+
54
+ self._preproc_outputs = [
55
+ "INPUT_ID",
56
+ "DECODER_INPUT_ID",
57
+ "REQUEST_INPUT_LEN",
58
+ "REQUEST_DECODER_INPUT_LEN",
59
+ "BAD_WORDS_IDS",
60
+ "STOP_WORDS_IDS",
61
+ "EMBEDDING_BIAS",
62
+ "OUT_PAD_ID",
63
+ "OUT_END_ID",
64
+ ]
65
+
66
+ self._multimodal_enc_outputs = [
67
+ "OUT_PROMPT_EMBEDDING_TABLE", "OUT_PROMPT_VOCAB_SIZE"
68
+ ]
69
+
70
+ self._llm_outputs = [
71
+ "output_ids", "sequence_length", "cum_log_probs",
72
+ "output_log_probs", "context_logits", "generation_logits",
73
+ "batch_index"
74
+ ]
75
+
76
+ self._postproc_outputs = [
77
+ "OUTPUT",
78
+ ]
79
+
80
+ self.input_names = [
81
+ "text_input",
82
+ "decoder_text_input",
83
+ "image_input",
84
+ "max_tokens",
85
+ "bad_words",
86
+ "stop_words",
87
+ "end_id",
88
+ "pad_id",
89
+ "top_k",
90
+ "top_p",
91
+ "temperature",
92
+ "length_penalty",
93
+ "repetition_penalty",
94
+ "min_length",
95
+ "presence_penalty",
96
+ "frequency_penalty",
97
+ "random_seed",
98
+ "return_log_probs",
99
+ "return_context_logits",
100
+ "return_generation_logits",
101
+ "beam_width",
102
+ "stream",
103
+ "prompt_embedding_table",
104
+ "prompt_vocab_size",
105
+ "embedding_bias_words",
106
+ "embedding_bias_weights",
107
+ "num_draft_tokens",
108
+ "use_draft_logits",
109
+ ]
110
+
111
+ self.__undo_reshape_whitelist = {
112
+ "max_tokens",
113
+ "end_id",
114
+ "pad_id",
115
+ "top_k",
116
+ "top_p",
117
+ "temperature",
118
+ "length_penalty",
119
+ "repetition_penalty",
120
+ "min_length",
121
+ "presence_penalty",
122
+ "frequency_penalty",
123
+ "random_seed",
124
+ "return_log_probs",
125
+ "return_context_logits",
126
+ "return_generation_logits",
127
+ "beam_width",
128
+ "stream",
129
+ "prompt_vocab_size",
130
+ "num_draft_tokens",
131
+ "use_draft_logits",
132
+ }
133
+
134
+ def _exec_triton_request(self, request):
135
+ responses = request.exec(decoupled=True)
136
+ for r in responses:
137
+ if r.has_error():
138
+ raise pb_utils.TritonModelException(r.error().message())
139
+ yield r
140
+
141
+ def _exec_triton_request_single(self, request):
142
+ responses = request.exec(decoupled=False)
143
+ if responses.has_error():
144
+ raise pb_utils.TritonModelException(responses.error().message())
145
+ return responses
146
+
147
+ def create_triton_response(self, response: Response):
148
+ name_map = {
149
+ "text_output": "text_output",
150
+ "cum_log_probs": "cum_log_probs",
151
+ "output_log_probs": "output_log_probs",
152
+ "context_logits": "context_logits",
153
+ "generation_logits": "generation_logits",
154
+ "batch_index": "batch_index"
155
+ }
156
+ tensors = self.create_triton_tensors(response, name_map)
157
+ return pb_utils.InferenceResponse(output_tensors=tensors)
158
+
159
+ def convert_triton_request(self, triton_request) -> Request:
160
+ request = Request()
161
+ for triton_name in self.input_names:
162
+ tensor = pb_utils.get_input_tensor_by_name(triton_request,
163
+ triton_name)
164
+ target_name = triton_name
165
+ if tensor is None:
166
+ continue
167
+ if not hasattr(request, target_name):
168
+ raise AttributeError(
169
+ f"Request has no attribute '{target_name}'")
170
+ setattr(request, target_name, tensor.as_numpy())
171
+ return request
172
+
173
+ def convert_triton_response(self,
174
+ triton_response,
175
+ response_factory: Callable,
176
+ name_map=None):
177
+ response = response_factory()
178
+ for tensor in triton_response.output_tensors():
179
+ if tensor is None:
180
+ continue
181
+ triton_name = tensor.name()
182
+ if tensor.is_cpu():
183
+ value = tensor.as_numpy()
184
+ else:
185
+ # If the tensor is in GPU memory, convert it to a torch.Tensor via DLPack
186
+ value = from_dlpack(tensor.to_dlpack())
187
+ target_name = triton_name
188
+ if name_map and triton_name in name_map:
189
+ target_name = name_map[triton_name]
190
+ if name_map and triton_name not in name_map:
191
+ continue
192
+ if target_name is None:
193
+ # explicitly ignore this triton input
194
+ continue
195
+ if not hasattr(response, target_name):
196
+ raise AttributeError(
197
+ f"response object has no attribute '{target_name}'")
198
+ setattr(response, target_name, value)
199
+ return response
200
+
201
+ def __undo_reshape(self, x, name):
202
+ if name in self.__undo_reshape_whitelist and len(x.shape) == 1:
203
+ # handle reshapes
204
+ return np.expand_dims(x, 0)
205
+ else:
206
+ return x
207
+
208
+ def create_triton_tensors(self, obj, name_map: dict):
209
+ tensors = []
210
+ for name, triton_name in name_map.items():
211
+ if triton_name is None:
212
+ continue
213
+ value = getattr(obj, name)
214
+ if value is None:
215
+ continue
216
+ if isinstance(value, np.ndarray):
217
+ t = pb_utils.Tensor(triton_name,
218
+ self.__undo_reshape(value, name))
219
+ elif isinstance(value, torch.Tensor):
220
+ t = pb_utils.Tensor.from_dlpack(
221
+ triton_name, to_dlpack(self.__undo_reshape(value, name)))
222
+ tensors.append(t)
223
+ return tensors
224
+
225
+ @override
226
+ def preprocess(self, request: Request) -> PreprocResponse:
227
+ input_tensors = self._get_preproc_tensors(request)
228
+ triton_req = pb_utils.InferenceRequest(
229
+ model_name=self.preproc_model_name,
230
+ inputs=input_tensors,
231
+ requested_output_names=self._preproc_outputs)
232
+ triton_output = self._exec_triton_request_single(triton_req)
233
+ return self._get_preproc_response(triton_output)
234
+
235
+ def _get_preproc_tensors(self, request: Request):
236
+ name_map = {
237
+ "text_input": "QUERY",
238
+ "decoder_text_input": "DECODER_QUERY",
239
+ "max_tokens": "REQUEST_OUTPUT_LEN",
240
+ "bad_words": "BAD_WORDS_DICT",
241
+ "stop_words": "STOP_WORDS_DICT",
242
+ "embedding_bias_words": "EMBEDDING_BIAS_WORDS",
243
+ "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS",
244
+ "pad_id": "PAD_ID",
245
+ "end_id": "END_ID",
246
+ }
247
+ return self.create_triton_tensors(request, name_map)
248
+
249
+ def _get_preproc_response(self, triton_output):
250
+ name_map = {
251
+ "INPUT_ID": "input_ids",
252
+ "DECODER_INPUT_ID": "decoder_input_ids",
253
+ "REQUEST_INPUT_LEN": "input_lengths",
254
+ "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
255
+ "BAD_WORDS_IDS": "bad_words_list",
256
+ "STOP_WORDS_IDS": "stop_words_list",
257
+ "EMBEDDING_BIAS": "embedding_bias",
258
+ "OUT_PAD_ID": "pad_id",
259
+ "OUT_END_ID": "end_id",
260
+ }
261
+ return self.convert_triton_response(triton_output, PreprocResponse,
262
+ name_map)
263
+
264
+ @override
265
+ def _multimodal_enc_generate(self,
266
+ request: Request) -> MultimodalEncResponse:
267
+ input_tensors = self._get_multimodal_enc_tensors(request)
268
+ triton_req = pb_utils.InferenceRequest(
269
+ model_name=self.multimodal_encoders_name,
270
+ inputs=input_tensors,
271
+ requested_output_names=self._multimodal_enc_outputs)
272
+ triton_output = self._exec_triton_request_single(triton_req)
273
+ return self._get_multimodal_enc_response(triton_output)
274
+
275
+ def _get_multimodal_enc_tensors(self, preproc: PreprocResponse):
276
+ name_map = {
277
+ "image_input": "IMAGE",
278
+ }
279
+ return self.create_triton_tensors(preproc, name_map)
280
+
281
+ def _get_multimodal_enc_response(self, triton_output):
282
+ name_map = {
283
+ "OUT_PROMPT_EMBEDDING_TABLE": "prompt_embedding_table",
284
+ "OUT_PROMPT_VOCAB_SIZE": "prompt_vocab_size",
285
+ }
286
+ return self.convert_triton_response(triton_output,
287
+ MultimodalEncResponse, name_map)
288
+
289
+ @override
290
+ def _draft_generate_non_streaming(
291
+ self, preproc: PreprocResponse, request: Request,
292
+ num_draft_tokens: int) -> GenerationResponse:
293
+ input_tensors = self._get_llm_tensors(preproc, request,
294
+ num_draft_tokens, None, True)
295
+ triton_req = pb_utils.InferenceRequest(
296
+ model_name=self.draft_llm_model_name,
297
+ inputs=input_tensors,
298
+ requested_output_names=self._llm_outputs)
299
+ triton_response = self._exec_triton_request_single(triton_req)
300
+ llm_response = self._get_llm_response(triton_response)
301
+ return llm_response
302
+
303
+ @override
304
+ def _generate(
305
+ self,
306
+ preproc: PreprocResponse,
307
+ request: Request,
308
+ draft_request: Optional[DraftRequest] = None,
309
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None
310
+ ) -> Generator[GenerationResponse, None, None]:
311
+ input_tensors = self._get_llm_tensors(
312
+ preproc,
313
+ request,
314
+ None,
315
+ draft_request,
316
+ multimodal_enc_response=multimodal_enc_response)
317
+ triton_req = pb_utils.InferenceRequest(
318
+ model_name=self.llm_model_name,
319
+ inputs=input_tensors,
320
+ requested_output_names=self._llm_outputs)
321
+ for r in self._exec_triton_request(triton_req):
322
+ yield self._get_llm_response(r)
323
+
324
+ @override
325
+ def _generate_non_streaming(
326
+ self,
327
+ preproc: PreprocResponse,
328
+ request: Request,
329
+ draft_request: Optional[DraftRequest] = None,
330
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None
331
+ ) -> GenerationResponse:
332
+ input_tensors = self._get_llm_tensors(
333
+ preproc,
334
+ request,
335
+ None,
336
+ draft_request,
337
+ multimodal_enc_response=multimodal_enc_response)
338
+ triton_req = pb_utils.InferenceRequest(
339
+ model_name=self.llm_model_name,
340
+ inputs=input_tensors,
341
+ requested_output_names=self._llm_outputs)
342
+ r = self._exec_triton_request_single(triton_req)
343
+ return self._get_llm_response(r)
344
+
345
+ def _get_llm_tensors(
346
+ self,
347
+ preproc: PreprocResponse,
348
+ request: Request,
349
+ num_output_tokens: Optional[int] = None,
350
+ draft_request: Optional[DraftRequest] = None,
351
+ is_draft_model_request: bool = False,
352
+ multimodal_enc_response: MultimodalEncResponse = None):
353
+ tensors = []
354
+ tensors.extend(self._get_tensors_from_preproc(preproc))
355
+ if multimodal_enc_response is not None:
356
+ tensors.extend(
357
+ self._get_tensors_from_multimodal_enc(multimodal_enc_response))
358
+ tensors.extend(
359
+ self._get_llm_tensors_from_request(request, num_output_tokens,
360
+ draft_request,
361
+ is_draft_model_request))
362
+ return tensors
363
+
364
+ def _get_tensors_from_preproc(self, preproc: PreprocResponse):
365
+ name_map = {
366
+ "input_ids": "input_ids",
367
+ "decoder_input_ids": "decoder_input_ids",
368
+ "input_lengths": "input_lengths",
369
+ "bad_words_list": "bad_words_list",
370
+ "stop_words_list": "stop_words_list",
371
+ "embedding_bias": "embedding_bias",
372
+ "pad_id": "pad_id",
373
+ "end_id": "end_id",
374
+ }
375
+ return self.create_triton_tensors(preproc, name_map)
376
+
377
+ def _get_tensors_from_multimodal_enc(
378
+ self, multimodal_enc_response: MultimodalEncResponse):
379
+ name_map = {
380
+ "prompt_embedding_table": "prompt_embedding_table",
381
+ "prompt_vocab_size": "prompt_vocab_size",
382
+ }
383
+ return self.create_triton_tensors(multimodal_enc_response, name_map)
384
+
385
+ def _get_llm_tensors_from_request(
386
+ self,
387
+ request: Request,
388
+ num_output_tokens: Optional[int] = None,
389
+ draft_request: Optional[DraftRequest] = None,
390
+ is_draft_model_request: bool = False):
391
+ name_map: Dict[str, Optional[str]] = {
392
+ "beam_width": "beam_width",
393
+ "top_k": "runtime_top_k",
394
+ "top_p": "runtime_top_p",
395
+ "temperature": "temperature",
396
+ "length_penalty": "len_penalty",
397
+ "repetition_penalty": "repetition_penalty",
398
+ "min_length": "min_length",
399
+ "presence_penalty": "presence_penalty",
400
+ "frequency_penalty": "frequency_penalty",
401
+ "random_seed": "random_seed",
402
+ "return_log_probs": "return_log_probs",
403
+ "stream": "streaming",
404
+ "prompt_embedding_table": "prompt_embedding_table",
405
+ "prompt_vocab_size": "prompt_vocab_size",
406
+ }
407
+ batch_size = request.text_input.shape[0]
408
+ tensors = self.create_triton_tensors(request, name_map)
409
+ out_len_tensor = None
410
+ if request.max_tokens is not None:
411
+ out_len_tensor = request.max_tokens
412
+
413
+ out_len = None
414
+ if num_output_tokens is not None:
415
+ out_len = num_output_tokens
416
+ elif draft_request:
417
+ out_len = len(
418
+ draft_request.draft_input_ids[0]
419
+ ) + 1 if draft_request.draft_input_ids is not None else 1
420
+
421
+ if out_len is not None:
422
+ out_len_tensor = [[out_len]] * batch_size
423
+
424
+ if out_len_tensor is None:
425
+ raise Exception("Could not determine request_output_len")
426
+ else:
427
+ tensors.append(
428
+ pb_utils.Tensor("request_output_len",
429
+ np.array(out_len_tensor, dtype=np.int32)))
430
+
431
+ if draft_request:
432
+ if draft_request.draft_input_ids is not None:
433
+ tensors.append(
434
+ pb_utils.Tensor("draft_input_ids",
435
+ draft_request.draft_input_ids))
436
+ if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[
437
+ 0]:
438
+ tensors.append(
439
+ pb_utils.Tensor("draft_logits",
440
+ draft_request.draft_logits))
441
+
442
+ return_context_logits_data = [False]
443
+ return_generation_logits_data = [False]
444
+ if draft_request is None:
445
+ if is_draft_model_request:
446
+ return_generation_logits_data = request.use_draft_logits if request.use_draft_logits is not None else [
447
+ False
448
+ ]
449
+ else:
450
+ return_context_logits_data = request.return_context_logits if request.return_context_logits is not None else [
451
+ False
452
+ ]
453
+ return_generation_logits_data = request.return_generation_logits if request.return_generation_logits is not None else [
454
+ False
455
+ ]
456
+ return_context_logits = np.array([return_context_logits_data] *
457
+ batch_size,
458
+ dtype=bool)
459
+ return_generation_logits = np.array([return_generation_logits_data] *
460
+ batch_size,
461
+ dtype=bool)
462
+
463
+ assert len(return_context_logits.shape) == 2
464
+ assert len(return_generation_logits.shape) == 2
465
+
466
+ tensors.append(
467
+ pb_utils.Tensor("return_context_logits", return_context_logits))
468
+ tensors.append(
469
+ pb_utils.Tensor("return_generation_logits",
470
+ return_generation_logits))
471
+ return tensors
472
+
473
+ def _get_llm_response(self, triton_output):
474
+ name_map = {
475
+ "output_ids": "output_ids",
476
+ "sequence_length": "sequence_length",
477
+ "cum_log_probs": "cum_log_probs",
478
+ "output_log_probs": "output_log_probs",
479
+ "context_logits": "context_logits",
480
+ "generation_logits": "generation_logits",
481
+ "batch_index": "batch_index",
482
+ }
483
+ return self.convert_triton_response(triton_output, GenerationResponse,
484
+ name_map)
485
+
486
+ def _postprocess(self, tokens: np.ndarray,
487
+ sequence_lengths: Optional[np.ndarray],
488
+ gen_response: GenerationResponse) -> Response:
489
+ input_tensors = self._get_postproc_tensors(tokens, sequence_lengths,
490
+ gen_response)
491
+ triton_req = pb_utils.InferenceRequest(
492
+ model_name=self.postproc_model_name,
493
+ inputs=input_tensors,
494
+ requested_output_names=self._postproc_outputs)
495
+ r = self._exec_triton_request_single(triton_req)
496
+ response = self._get_response(r, gen_response)
497
+ return response
498
+
499
+ def _get_postproc_tensors(self, tokens: np.ndarray,
500
+ sequence_lengths: Optional[np.ndarray],
501
+ gen_response: GenerationResponse):
502
+ tensors = [
503
+ pb_utils.Tensor("TOKENS_BATCH", tokens),
504
+ pb_utils.Tensor(
505
+ "SEQUENCE_LENGTH", sequence_lengths
506
+ if sequence_lengths is not None else gen_response.sequence_length)
507
+ ]
508
+ return tensors
509
+
510
+ def _get_response(self, triton_output, gen_res: GenerationResponse):
511
+ tensors = triton_output.output_tensors()
512
+ t_map = {}
513
+ for named_t in tensors:
514
+ name = named_t.name()
515
+ t = named_t.as_numpy()
516
+ t_map[name] = t
517
+ response = Response(text_output=t_map["OUTPUT"],
518
+ cum_log_probs=gen_res.cum_log_probs,
519
+ output_log_probs=gen_res.output_log_probs,
520
+ context_logits=gen_res.context_logits,
521
+ generation_logits=gen_res.generation_logits,
522
+ batch_index=gen_res.batch_index)
523
+ return response
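One detail worth a note here, not part of the committed file: __undo_reshape above compensates for inputs declared with reshape: { shape: [ ] } in the BLS config.pbtxt further down, which arrive as 1-D arrays and need their leading batch axis restored before being forwarded to the tensorrt_llm model. A plain-numpy illustration of that transformation, assuming a batch size of 1:

import numpy as np

# "return_log_probs" is declared with reshape: { shape: [ ] }, so for batch
# size 1 it reaches the BLS model with shape (1,).
return_log_probs = np.array([True])

# __undo_reshape re-adds the leading axis for names in the whitelist,
# yielding the [batch_size, 1] layout the tensorrt_llm model expects.
restored = np.expand_dims(return_log_probs, 0)
assert restored.shape == (1, 1)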
tensorrt_llm_bls/1/model.py ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+ import traceback
29
+
30
+ import triton_python_backend_utils as pb_utils
31
+ from lib.triton_decoder import TritonDecoder
32
+
33
+
34
+ def get_valid_param_value(param, default_value=''):
35
+ value = param.get('string_value', '')
36
+ return default_value if value.startswith('${') or value == '' else value
37
+
38
+
39
+ class TritonPythonModel:
40
+
41
+ def initialize(self, args):
42
+
43
+ # Parse model configs
44
+ model_config = json.loads(args['model_config'])
45
+
46
+ params = model_config['parameters']
47
+
48
+ accumulate_tokens_str = get_valid_param_value(
49
+ params.get('accumulate_tokens', {}))
50
+ self.accumulate_tokens = accumulate_tokens_str.lower() in [
51
+ 'true', 'yes', '1', 't'
52
+ ]
53
+
54
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
55
+ model_config)
56
+
57
+ self.logger = pb_utils.Logger
58
+
59
+ default_tensorrt_llm_model_name = 'tensorrt_llm'
60
+ self.llm_model_name = get_valid_param_value(
61
+ params.get('tensorrt_llm_model_name', {}),
62
+ default_tensorrt_llm_model_name)
63
+
64
+ self.draft_llm_model_name = get_valid_param_value(
65
+ params.get('tensorrt_llm_draft_model_name', {}), None)
66
+
67
+ self.multimodal_encoders_name = get_valid_param_value(
68
+ params.get('multimodal_encoders_name', {}), None)
69
+
70
+ self.decoder = TritonDecoder(
71
+ streaming=self.decoupled,
72
+ accumulate=self.accumulate_tokens,
73
+ preproc_model_name="preprocessing",
74
+ postproc_model_name="postprocessing",
75
+ llm_model_name=self.llm_model_name,
76
+ draft_llm_model_name=self.draft_llm_model_name,
77
+ multimodal_encoders_name=self.multimodal_encoders_name)
78
+
79
+ def execute(self, requests):
80
+
81
+ responses = []
82
+
83
+ for request in requests:
84
+ if self.decoupled:
85
+ response_sender = request.get_response_sender()
86
+ try:
87
+
88
+ req = self.decoder.convert_triton_request(request)
89
+ req.validate()
90
+ speculative_decode = (req.num_draft_tokens is not None
91
+ and req.num_draft_tokens[0][0] > 0)
92
+ if speculative_decode and (self.draft_llm_model_name is None
93
+ or self.draft_llm_model_name == ""):
94
+ raise Exception(
95
+ "cannot perform speculative decoding without draft model"
96
+ )
97
+ is_multimodal = req.image_input is not None
98
+
99
+ if speculative_decode and is_multimodal:
100
+ raise Exception(
101
+ "Multimodal and speculative decoding is not currently supported"
102
+ )
103
+ res_gen = self.decoder.decode(
104
+ req,
105
+ speculative_decoding=speculative_decode,
106
+ is_multimodal=is_multimodal)
107
+
108
+ for res in res_gen:
109
+ triton_response = self.decoder.create_triton_response(res)
110
+ if self.decoupled:
111
+ response_sender.send(triton_response)
112
+ else:
113
+ responses.append(triton_response)
114
+
115
+ if self.decoupled:
116
+ response_sender.send(
117
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
118
+
119
+ except Exception:
120
+ self.logger.log_error(traceback.format_exc())
121
+ # If an error is encountered, send a response containing the error message
122
+ error_response = pb_utils.InferenceResponse(
123
+ output_tensors=[],
124
+ error=pb_utils.TritonError(traceback.format_exc()))
125
+
126
+ if self.decoupled:
127
+ response_sender.send(error_response)
128
+ response_sender.send(
129
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
130
+ else:
131
+ responses.append(error_response)
132
+
133
+ self.decoder.reset_decoder()
134
+ if self.decoupled:
135
+ return None
136
+ else:
137
+ assert len(responses) == len(requests)
138
+ return responses
139
+
140
+ def finalize(self):
141
+ """`finalize` is called only once when the model is being unloaded.
142
+ Implementing `finalize` function is optional. This function allows
143
+ the model to perform any necessary cleanup before exit.
144
+ """
145
+ print('Cleaning up...')
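For context, and not part of the committed model.py: because the config.pbtxt below marks the model as decoupled, clients typically reach it through Triton's generate endpoints. A client-side sketch, assuming the HTTP generate extension is enabled, the server listens on the default localhost:8000, and the requests package is installed; the field names mirror the inputs declared in the config below:

import json

import requests

payload = {
    "text_input": "What is machine learning?",
    "max_tokens": 64,
    "stream": False,
}
# Non-streaming call; for token-by-token output the generate_stream endpoint
# (with "stream": True) would be used instead.
resp = requests.post(
    "http://localhost:8000/v2/models/tensorrt_llm_bls/generate",
    data=json.dumps(payload),
)
print(resp.json()["text_output"])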
tensorrt_llm_bls/config.pbtxt ADDED
@@ -0,0 +1,270 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm_bls"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+
31
+ model_transaction_policy {
32
+ decoupled: True
33
+ }
34
+
35
+ input [
36
+ {
37
+ name: "text_input"
38
+ data_type: TYPE_STRING
39
+ dims: [ 1 ]
40
+ },
41
+ {
42
+ name: "decoder_text_input"
43
+ data_type: TYPE_STRING
44
+ dims: [ 1 ]
45
+ optional: true
46
+ },
47
+ {
48
+ name: "image_input"
49
+ data_type: TYPE_FP16
50
+ dims: [ 3, -1, -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "max_tokens"
55
+ data_type: TYPE_INT32
56
+ dims: [ 1 ]
57
+ },
58
+ {
59
+ name: "bad_words"
60
+ data_type: TYPE_STRING
61
+ dims: [ -1 ]
62
+ optional: true
63
+ },
64
+ {
65
+ name: "stop_words"
66
+ data_type: TYPE_STRING
67
+ dims: [ -1 ]
68
+ optional: true
69
+ },
70
+ {
71
+ name: "end_id"
72
+ data_type: TYPE_INT32
73
+ dims: [ 1 ]
74
+ optional: true
75
+ },
76
+ {
77
+ name: "pad_id"
78
+ data_type: TYPE_INT32
79
+ dims: [ 1 ]
80
+ optional: true
81
+ },
82
+ {
83
+ name: "top_k"
84
+ data_type: TYPE_INT32
85
+ dims: [ 1 ]
86
+ optional: true
87
+ },
88
+ {
89
+ name: "top_p"
90
+ data_type: TYPE_FP32
91
+ dims: [ 1 ]
92
+ optional: true
93
+ },
94
+ {
95
+ name: "temperature"
96
+ data_type: TYPE_FP32
97
+ dims: [ 1 ]
98
+ optional: true
99
+ },
100
+ {
101
+ name: "length_penalty"
102
+ data_type: TYPE_FP32
103
+ dims: [ 1 ]
104
+ optional: true
105
+ },
106
+ {
107
+ name: "repetition_penalty"
108
+ data_type: TYPE_FP32
109
+ dims: [ 1 ]
110
+ optional: true
111
+ },
112
+ {
113
+ name: "min_length"
114
+ data_type: TYPE_INT32
115
+ dims: [ 1 ]
116
+ optional: true
117
+ },
118
+ {
119
+ name: "presence_penalty"
120
+ data_type: TYPE_FP32
121
+ dims: [ 1 ]
122
+ optional: true
123
+ },
124
+ {
125
+ name: "frequency_penalty"
126
+ data_type: TYPE_FP32
127
+ dims: [ 1 ]
128
+ optional: true
129
+ },
130
+ {
131
+ name: "random_seed"
132
+ data_type: TYPE_UINT64
133
+ dims: [ 1 ]
134
+ optional: true
135
+ },
136
+ {
137
+ name: "return_log_probs"
138
+ data_type: TYPE_BOOL
139
+ dims: [ 1 ]
140
+ reshape: { shape: [ ] }
141
+ optional: true
142
+ },
143
+ {
144
+ name: "return_context_logits"
145
+ data_type: TYPE_BOOL
146
+ dims: [ 1 ]
147
+ reshape: { shape: [ ] }
148
+ optional: true
149
+ },
150
+ {
151
+ name: "return_generation_logits"
152
+ data_type: TYPE_BOOL
153
+ dims: [ 1 ]
154
+ reshape: { shape: [ ] }
155
+ optional: true
156
+ },
157
+ {
158
+ name: "beam_width"
159
+ data_type: TYPE_INT32
160
+ dims: [ 1 ]
161
+ optional: true
162
+ },
163
+ {
164
+ name: "stream"
165
+ data_type: TYPE_BOOL
166
+ dims: [ 1 ]
167
+ optional: true
168
+ },
169
+ {
170
+ name: "prompt_embedding_table"
171
+ data_type: TYPE_FP16
172
+ dims: [ -1, -1 ]
173
+ optional: true
174
+ },
175
+ {
176
+ name: "prompt_vocab_size"
177
+ data_type: TYPE_INT32
178
+ dims: [ 1 ]
179
+ optional: true
180
+ },
181
+ {
182
+ name: "embedding_bias_words"
183
+ data_type: TYPE_STRING
184
+ dims: [ -1 ]
185
+ optional: true
186
+ },
187
+ {
188
+ name: "embedding_bias_weights"
189
+ data_type: TYPE_FP32
190
+ dims: [ -1 ]
191
+ optional: true
192
+ },
193
+ {
194
+ name: "num_draft_tokens",
195
+ data_type: TYPE_INT32,
196
+ dims: [ 1 ]
197
+ optional: true
198
+ },
199
+ {
200
+ name: "use_draft_logits",
201
+ data_type: TYPE_BOOL,
202
+ dims: [ 1 ]
203
+ reshape: { shape: [ ] }
204
+ optional: true
205
+ }
206
+ ]
207
+ output [
208
+ {
209
+ name: "text_output"
210
+ data_type: TYPE_STRING
211
+ dims: [ -1 ]
212
+ },
213
+ {
214
+ name: "cum_log_probs"
215
+ data_type: TYPE_FP32
216
+ dims: [ -1 ]
217
+ },
218
+ {
219
+ name: "output_log_probs"
220
+ data_type: TYPE_FP32
221
+ dims: [ -1, -1 ]
222
+ },
223
+ {
224
+ name: "context_logits"
225
+ data_type: TYPE_FP32
226
+ dims: [ -1, -1 ]
227
+ },
228
+ {
229
+ name: "generation_logits"
230
+ data_type: TYPE_FP32
231
+ dims: [ -1, -1, -1 ]
232
+ },
233
+ {
234
+ name: "batch_index"
235
+ data_type: TYPE_INT32
236
+ dims: [ 1 ]
237
+ }
238
+ ]
239
+
240
+ parameters: {
241
+ key: "accumulate_tokens"
242
+ value: {
243
+ string_value: "${accumulate_tokens}"
244
+ }
245
+ }
246
+ parameters: {
247
+ key: "tensorrt_llm_model_name"
248
+ value: {
249
+ string_value: "tensorrt_llm"
250
+ }
251
+ }
252
+ parameters: {
253
+ key: "tensorrt_llm_draft_model_name"
254
+ value: {
255
+ string_value: "${tensorrt_llm_draft_model_name}"
256
+ }
257
+ }
258
+ parameters: {
259
+ key: "multimodal_encoders_name"
260
+ value: {
261
+ string_value: "${multimodal_encoders_name}"
262
+ }
263
+ }
264
+
265
+ instance_group [
266
+ {
267
+ count: 1
268
+ kind : KIND_CPU
269
+ }
270
+ ]
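A closing note that is not part of the committed config: the ${...} placeholders above (accumulate_tokens, tensorrt_llm_draft_model_name, multimodal_encoders_name) are treated as unset by get_valid_param_value in model.py until they are substituted at deployment time, so speculative decoding and multimodal input stay disabled by default. A hypothetical request body for speculative decoding, assuming a draft model name has been filled in; field names mirror the inputs declared above:

# Illustrative payload only.
payload = {
    "text_input": "Summarize the plot of Hamlet.",
    "max_tokens": 128,
    "num_draft_tokens": 5,      # > 0 routes the request through the draft/target loop
    "use_draft_logits": False,  # compare draft and target outputs by token ids only
    "stream": False,            # Request.validate() rejects streaming with speculative decoding
}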