otherhalf-dev committed
Commit a2eca76 · verified · 1 Parent(s): 1f1bf26

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tensorrt_llm/1/rank0.engine filter=lfs diff=lfs merge=lfs -text
37
+ tensorrt_llm/1/rank1.engine filter=lfs diff=lfs merge=lfs -text
ensemble/1/.tmp ADDED
File without changes
ensemble/config.pbtxt ADDED
@@ -0,0 +1,606 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "ensemble"
28
+ platform: "ensemble"
29
+ max_batch_size: 32
30
+ input [
31
+ {
32
+ name: "text_input"
33
+ data_type: TYPE_STRING
34
+ dims: [ 1 ]
35
+ },
36
+ {
37
+ name: "decoder_text_input"
38
+ data_type: TYPE_STRING
39
+ dims: [ 1 ]
40
+ optional: true
41
+ },
42
+ {
43
+ name: "max_tokens"
44
+ data_type: TYPE_INT32
45
+ dims: [ 1 ]
46
+ },
47
+ {
48
+ name: "num_return_sequences"
49
+ data_type: TYPE_INT32
50
+ dims: [ 1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "bad_words"
55
+ data_type: TYPE_STRING
56
+ dims: [ -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "stop_words"
61
+ data_type: TYPE_STRING
62
+ dims: [ -1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "exclude_input_in_output"
67
+ data_type: TYPE_BOOL
68
+ dims: [ 1 ]
69
+ optional: true
70
+ },
71
+ {
72
+ name: "end_id"
73
+ data_type: TYPE_INT32
74
+ dims: [ 1 ]
75
+ optional: true
76
+ },
77
+ {
78
+ name: "pad_id"
79
+ data_type: TYPE_INT32
80
+ dims: [ 1 ]
81
+ optional: true
82
+ },
83
+ {
84
+ name: "top_k"
85
+ data_type: TYPE_INT32
86
+ dims: [ 1 ]
87
+ optional: true
88
+ },
89
+ {
90
+ name: "top_p"
91
+ data_type: TYPE_FP32
92
+ dims: [ 1 ]
93
+ optional: true
94
+ },
95
+ {
96
+ name: "temperature"
97
+ data_type: TYPE_FP32
98
+ dims: [ 1 ]
99
+ optional: true
100
+ },
101
+ {
102
+ name: "length_penalty"
103
+ data_type: TYPE_FP32
104
+ dims: [ 1 ]
105
+ optional: true
106
+ },
107
+ {
108
+ name: "repetition_penalty"
109
+ data_type: TYPE_FP32
110
+ dims: [ 1 ]
111
+ optional: true
112
+ },
113
+ {
114
+ name: "min_length"
115
+ data_type: TYPE_INT32
116
+ dims: [ 1 ]
117
+ optional: true
118
+ },
119
+ {
120
+ name: "presence_penalty"
121
+ data_type: TYPE_FP32
122
+ dims: [ 1 ]
123
+ optional: true
124
+ },
125
+ {
126
+ name: "frequency_penalty"
127
+ data_type: TYPE_FP32
128
+ dims: [ 1 ]
129
+ optional: true
130
+ },
131
+ {
132
+ name: "random_seed"
133
+ data_type: TYPE_UINT64
134
+ dims: [ 1 ]
135
+ optional: true
136
+ },
137
+ {
138
+ name: "return_log_probs"
139
+ data_type: TYPE_BOOL
140
+ dims: [ 1 ]
141
+ optional: true
142
+ },
143
+ {
144
+ name: "return_context_logits"
145
+ data_type: TYPE_BOOL
146
+ dims: [ 1 ]
147
+ optional: true
148
+ },
149
+ {
150
+ name: "return_generation_logits"
151
+ data_type: TYPE_BOOL
152
+ dims: [ 1 ]
153
+ optional: true
154
+ },
155
+ {
156
+ name: "return_kv_cache_reuse_stats"
157
+ data_type: TYPE_BOOL
158
+ dims: [ 1 ]
159
+ optional: true
160
+ },
161
+ {
162
+ name: "beam_width"
163
+ data_type: TYPE_INT32
164
+ dims: [ 1 ]
165
+ optional: true
166
+ },
167
+ {
168
+ name: "stream"
169
+ data_type: TYPE_BOOL
170
+ dims: [ 1 ]
171
+ optional: true
172
+ },
173
+ {
174
+ name: "prompt_embedding_table"
175
+ data_type: TYPE_FP16
176
+ dims: [ -1, -1 ]
177
+ optional: true
178
+ },
179
+ {
180
+ name: "prompt_table_extra_id"
181
+ data_type: TYPE_UINT64
182
+ dims: [ 1 ]
183
+ optional: true
184
+ },
185
+ {
186
+ name: "prompt_vocab_size"
187
+ data_type: TYPE_INT32
188
+ dims: [ 1 ]
189
+ optional: true
190
+ },
191
+ {
192
+ name: "embedding_bias_words"
193
+ data_type: TYPE_STRING
194
+ dims: [ -1 ]
195
+ optional: true
196
+ },
197
+ {
198
+ name: "embedding_bias_weights"
199
+ data_type: TYPE_FP32
200
+ dims: [ -1 ]
201
+ optional: true
202
+ },
203
+ # the unique task ID for the given LoRA.
204
+ # To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights` and `lora_config` must all be given.
205
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
206
+ # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
207
+ {
208
+ name: "lora_task_id"
209
+ data_type: TYPE_UINT64
210
+ dims: [ 1 ]
211
+ optional: true
212
+ },
213
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
214
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
215
+ # each of the in / out tensors are first flattened and then concatenated together in the format above.
216
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
217
+ {
218
+ name: "lora_weights"
219
+ data_type: TYPE_FP16
220
+ dims: [ -1, -1 ]
221
+ optional: true
222
+ allow_ragged_batch: true
223
+ },
224
+ # module identifier (same size as first dimension of lora_weights)
225
+ # See LoraModule::ModuleType for model id mapping
226
+ #
227
+ # "attn_qkv": 0 # compbined qkv adapter
228
+ # "attn_q": 1 # q adapter
229
+ # "attn_k": 2 # k adapter
230
+ # "attn_v": 3 # v adapter
231
+ # "attn_dense": 4 # adapter for the dense layer in attention
232
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
233
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
234
+ # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
235
+ #
236
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
237
+ {
238
+ name: "lora_config"
239
+ data_type: TYPE_INT32
240
+ dims: [ -1, 3 ]
241
+ optional: true
242
+ allow_ragged_batch: true
243
+ },
244
+ {
245
+ name: "guided_decoding_guide_type"
246
+ data_type: TYPE_STRING
247
+ dims: [ 1 ]
248
+ optional: true
249
+ allow_ragged_batch: true
250
+ },
251
+ {
252
+ name: "guided_decoding_guide"
253
+ data_type: TYPE_STRING
254
+ dims: [ 1 ]
255
+ optional: true
256
+ allow_ragged_batch: true
257
+ }
258
+ ]
259
+ output [
260
+ {
261
+ name: "text_output"
262
+ data_type: TYPE_STRING
263
+ dims: [ -1 ]
264
+ },
265
+ {
266
+ name: "cum_log_probs"
267
+ data_type: TYPE_FP32
268
+ dims: [ -1 ]
269
+ },
270
+ {
271
+ name: "output_log_probs"
272
+ data_type: TYPE_FP32
273
+ dims: [ -1, -1 ]
274
+ },
275
+ {
276
+ name: "context_logits"
277
+ data_type: TYPE_FP16
278
+ dims: [ -1, -1 ]
279
+ },
280
+ {
281
+ name: "generation_logits"
282
+ data_type: TYPE_FP16
283
+ dims: [ -1, -1, -1 ]
284
+ },
285
+ {
286
+ name: "batch_index"
287
+ data_type: TYPE_INT32
288
+ dims: [ 1 ]
289
+ },
290
+ {
291
+ name: "sequence_index"
292
+ data_type: TYPE_INT32
293
+ dims: [ 1 ]
294
+ },
295
+ {
296
+ name: "kv_cache_alloc_new_blocks"
297
+ data_type: TYPE_INT32
298
+ dims: [ 1 ]
299
+ },
300
+ {
301
+ name: "kv_cache_reused_blocks"
302
+ data_type: TYPE_INT32
303
+ dims: [ 1 ]
304
+ },
305
+ {
306
+ name: "kv_cache_alloc_total_blocks"
307
+ data_type: TYPE_INT32
308
+ dims: [ 1 ]
309
+ }
310
+ ]
311
+ ensemble_scheduling {
312
+ step [
313
+ {
314
+ model_name: "preprocessing"
315
+ model_version: -1
316
+ input_map {
317
+ key: "QUERY"
318
+ value: "text_input"
319
+ }
320
+ input_map {
321
+ key: "DECODER_QUERY"
322
+ value: "decoder_text_input"
323
+ }
324
+ input_map {
325
+ key: "REQUEST_OUTPUT_LEN"
326
+ value: "max_tokens"
327
+ }
328
+ input_map {
329
+ key: "BAD_WORDS_DICT"
330
+ value: "bad_words"
331
+ }
332
+ input_map {
333
+ key: "STOP_WORDS_DICT"
334
+ value: "stop_words"
335
+ }
336
+ input_map {
337
+ key: "EMBEDDING_BIAS_WORDS"
338
+ value: "embedding_bias_words"
339
+ }
340
+ input_map {
341
+ key: "EMBEDDING_BIAS_WEIGHTS"
342
+ value: "embedding_bias_weights"
343
+ }
344
+ input_map {
345
+ key: "END_ID"
346
+ value: "end_id"
347
+ }
348
+ input_map {
349
+ key: "PAD_ID"
350
+ value: "pad_id"
351
+ }
352
+ input_map {
353
+ key: "PROMPT_TABLE_EXTRA_ID"
354
+ value: "prompt_table_extra_id"
355
+ }
356
+ output_map {
357
+ key: "REQUEST_INPUT_LEN"
358
+ value: "_REQUEST_INPUT_LEN"
359
+ }
360
+ output_map {
361
+ key: "INPUT_ID"
362
+ value: "_INPUT_ID"
363
+ }
364
+ output_map {
365
+ key: "REQUEST_DECODER_INPUT_LEN"
366
+ value: "_REQUEST_DECODER_INPUT_LEN"
367
+ }
368
+ output_map {
369
+ key: "DECODER_INPUT_ID"
370
+ value: "_DECODER_INPUT_ID"
371
+ }
372
+ output_map {
373
+ key: "REQUEST_OUTPUT_LEN"
374
+ value: "_REQUEST_OUTPUT_LEN"
375
+ }
376
+ output_map {
377
+ key: "STOP_WORDS_IDS"
378
+ value: "_STOP_WORDS_IDS"
379
+ }
380
+ output_map {
381
+ key: "BAD_WORDS_IDS"
382
+ value: "_BAD_WORDS_IDS"
383
+ }
384
+ output_map {
385
+ key: "EMBEDDING_BIAS"
386
+ value: "_EMBEDDING_BIAS"
387
+ }
388
+ output_map {
389
+ key: "OUT_END_ID"
390
+ value: "_PREPROCESSOR_END_ID"
391
+ }
392
+ output_map {
393
+ key: "OUT_PAD_ID"
394
+ value: "_PREPROCESSOR_PAD_ID"
395
+ }
396
+ output_map {
397
+ key: "OUT_PROMPT_TABLE_EXTRA_IDS"
398
+ value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
399
+ }
400
+ },
401
+ {
402
+ model_name: "tensorrt_llm"
403
+ model_version: -1
404
+ input_map {
405
+ key: "input_ids"
406
+ value: "_INPUT_ID"
407
+ }
408
+ input_map {
409
+ key: "decoder_input_ids"
410
+ value: "_DECODER_INPUT_ID"
411
+ }
412
+ input_map {
413
+ key: "input_lengths"
414
+ value: "_REQUEST_INPUT_LEN"
415
+ }
416
+ input_map {
417
+ key: "decoder_input_lengths"
418
+ value: "_REQUEST_DECODER_INPUT_LEN"
419
+ }
420
+ input_map {
421
+ key: "exclude_input_in_output"
422
+ value: "exclude_input_in_output"
423
+ }
424
+ input_map {
425
+ key: "request_output_len"
426
+ value: "_REQUEST_OUTPUT_LEN"
427
+ }
428
+ input_map {
429
+ key: "end_id"
430
+ value: "_PREPROCESSOR_END_ID"
431
+ }
432
+ input_map {
433
+ key: "pad_id"
434
+ value: "_PREPROCESSOR_PAD_ID"
435
+ }
436
+ input_map {
437
+ key: "embedding_bias"
438
+ value: "_EMBEDDING_BIAS"
439
+ }
440
+ input_map {
441
+ key: "runtime_top_k"
442
+ value: "top_k"
443
+ }
444
+ input_map {
445
+ key: "runtime_top_p"
446
+ value: "top_p"
447
+ }
448
+ input_map {
449
+ key: "temperature"
450
+ value: "temperature"
451
+ }
452
+ input_map {
453
+ key: "len_penalty"
454
+ value: "length_penalty"
455
+ }
456
+ input_map {
457
+ key: "repetition_penalty"
458
+ value: "repetition_penalty"
459
+ }
460
+ input_map {
461
+ key: "min_length"
462
+ value: "min_length"
463
+ }
464
+ input_map {
465
+ key: "presence_penalty"
466
+ value: "presence_penalty"
467
+ }
468
+ input_map {
469
+ key: "frequency_penalty"
470
+ value: "frequency_penalty"
471
+ }
472
+ input_map {
473
+ key: "random_seed"
474
+ value: "random_seed"
475
+ }
476
+ input_map {
477
+ key: "return_log_probs"
478
+ value: "return_log_probs"
479
+ }
480
+ input_map {
481
+ key: "return_context_logits"
482
+ value: "return_context_logits"
483
+ }
484
+ input_map {
485
+ key: "return_generation_logits"
486
+ value: "return_generation_logits"
487
+ }
488
+ input_map {
489
+ key: "return_kv_cache_reuse_stats"
490
+ value: "return_kv_cache_reuse_stats"
491
+ }
492
+ input_map {
493
+ key: "num_return_sequences"
494
+ value: "num_return_sequences"
495
+ }
496
+ input_map {
497
+ key: "beam_width"
498
+ value: "beam_width"
499
+ }
500
+ input_map {
501
+ key: "streaming"
502
+ value: "stream"
503
+ }
504
+ input_map {
505
+ key: "prompt_embedding_table"
506
+ value: "prompt_embedding_table"
507
+ }
508
+ input_map {
509
+ key: "prompt_vocab_size"
510
+ value: "prompt_vocab_size"
511
+ }
512
+ input_map {
513
+ key: "stop_words_list"
514
+ value: "_STOP_WORDS_IDS"
515
+ }
516
+ input_map {
517
+ key: "bad_words_list"
518
+ value: "_BAD_WORDS_IDS"
519
+ }
520
+ input_map {
521
+ key: "prompt_table_extra_ids"
522
+ value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
523
+ },
524
+ input_map {
525
+ key: "lora_task_id",
526
+ value: "lora_task_id"
527
+ },
528
+ input_map {
529
+ key: "lora_weights",
530
+ value: "lora_weights"
531
+ },
532
+ input_map {
533
+ key: "lora_config",
534
+ value: "lora_config"
535
+ },
536
+ input_map {
537
+ key: "guided_decoding_guide_type",
538
+ value: "guided_decoding_guide_type"
539
+ },
540
+ input_map {
541
+ key: "guided_decoding_guide",
542
+ value: "guided_decoding_guide"
543
+ }
544
+ output_map {
545
+ key: "output_ids"
546
+ value: "_TOKENS_BATCH"
547
+ }
548
+ output_map {
549
+ key: "sequence_length"
550
+ value: "_SEQUENCE_LENGTH"
551
+ },
552
+ output_map {
553
+ key: "cum_log_probs"
554
+ value: "cum_log_probs"
555
+ }
556
+ output_map {
557
+ key: "output_log_probs"
558
+ value: "output_log_probs"
559
+ },
560
+ output_map {
561
+ key: "context_logits"
562
+ value: "context_logits"
563
+ },
564
+ output_map {
565
+ key: "generation_logits"
566
+ value: "generation_logits"
567
+ },
568
+ output_map {
569
+ key: "batch_index"
570
+ value: "batch_index"
571
+ },
572
+ output_map {
573
+ key: "sequence_index"
574
+ value: "sequence_index"
575
+ },
576
+ output_map {
577
+ key: "kv_cache_alloc_new_blocks"
578
+ value: "kv_cache_alloc_new_blocks"
579
+ },
580
+ output_map {
581
+ key: "kv_cache_reused_blocks"
582
+ value: "kv_cache_reused_blocks"
583
+ },
584
+ output_map {
585
+ key: "kv_cache_alloc_total_blocks"
586
+ value: "kv_cache_alloc_total_blocks"
587
+ }
588
+ },
589
+ {
590
+ model_name: "postprocessing"
591
+ model_version: -1
592
+ input_map {
593
+ key: "TOKENS_BATCH"
594
+ value: "_TOKENS_BATCH"
595
+ }
596
+ input_map {
597
+ key: "SEQUENCE_LENGTH"
598
+ value: "_SEQUENCE_LENGTH"
599
+ }
600
+ output_map {
601
+ key: "OUTPUT"
602
+ value: "text_output"
603
+ }
604
+ }
605
+ ]
606
+ }
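The ensemble config above is the public inference API of this repository (text_input and max_tokens in, text_output out). As a rough illustration only — the server URL, prompt, and token budget below are placeholders and the Triton HTTP client is an assumed choice, not part of this commit; only the model name "ensemble" and the tensor names come from the config — a request against it could look like:

# Hypothetical client sketch (not part of this commit).
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# max_batch_size is 32 and both inputs have dims [ 1 ], so a single request is shape [1, 1].
text = np.array([["Hello, world"]], dtype=object)
max_tokens = np.array([[64]], dtype=np.int32)

inputs = [
    httpclient.InferInput("text_input", list(text.shape), "BYTES"),
    httpclient.InferInput("max_tokens", list(max_tokens.shape), "INT32"),
]
inputs[0].set_data_from_numpy(text)
inputs[1].set_data_from_numpy(max_tokens)

result = client.infer(
    model_name="ensemble",
    inputs=inputs,
    outputs=[httpclient.InferRequestedOutput("text_output")],
)
print(result.as_numpy("text_output"))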
postprocessing/1/__pycache__/model.cpython-312.pyc ADDED
Binary file (6.52 kB)
 
postprocessing/1/model.py ADDED
@@ -0,0 +1,177 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+
29
+ import numpy as np
30
+ import triton_python_backend_utils as pb_utils
31
+ from transformers import AutoTokenizer
32
+
33
+
34
+ class TritonPythonModel:
35
+ """Your Python model must use the same class name. Every Python model
36
+ that is created must have "TritonPythonModel" as the class name.
37
+ """
38
+
39
+ def initialize(self, args):
40
+ """`initialize` is called only once when the model is being loaded.
41
+ Implementing `initialize` function is optional. This function allows
42
+ the model to initialize any state associated with this model.
43
+ Parameters
44
+ ----------
45
+ args : dict
46
+ Both keys and values are strings. The dictionary keys and values are:
47
+ * model_config: A JSON string containing the model configuration
48
+ * model_instance_kind: A string containing model instance kind
49
+ * model_instance_device_id: A string containing model instance device ID
50
+ * model_repository: Model repository path
51
+ * model_version: Model version
52
+ * model_name: Model name
53
+ """
54
+ # Parse model configs
55
+ model_config = json.loads(args['model_config'])
56
+ tokenizer_dir = model_config['parameters']['tokenizer_dir'][
57
+ 'string_value']
58
+
59
+ skip_special_tokens = model_config['parameters'].get(
60
+ 'skip_special_tokens')
61
+ if skip_special_tokens is not None:
62
+ skip_special_tokens_str = skip_special_tokens[
63
+ 'string_value'].lower()
64
+ if skip_special_tokens_str in [
65
+ 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
66
+ ]:
67
+ self.skip_special_tokens = skip_special_tokens_str in [
68
+ 'true', '1', 't', 'y', 'yes'
69
+ ]
70
+ else:
71
+ print(
72
+ f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default."
73
+ )
74
+ self.skip_special_tokens = True
75
+ else:
76
+ print(
77
+ f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
78
+ )
79
+ self.skip_special_tokens = True
80
+
81
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
82
+ legacy=False,
83
+ padding_side='left',
84
+ trust_remote_code=True)
85
+ if not self.tokenizer.pad_token:
86
+ self.tokenizer.pad_token = self.tokenizer.eos_token
87
+
88
+ # Parse model output configs
89
+ output_config = pb_utils.get_output_config_by_name(
90
+ model_config, "OUTPUT")
91
+
92
+ # Convert Triton types to numpy types
93
+ self.output_dtype = pb_utils.triton_string_to_numpy(
94
+ output_config['data_type'])
95
+
96
+ def execute(self, requests):
97
+ """`execute` must be implemented in every Python model. `execute`
98
+ function receives a list of pb_utils.InferenceRequest as the only
99
+ argument. This function is called when an inference is requested
100
+ for this model. Depending on the batching configuration (e.g. Dynamic
101
+ Batching) used, `requests` may contain multiple requests. Every
102
+ Python model must create one pb_utils.InferenceResponse for every
103
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
104
+ set the error argument when creating a pb_utils.InferenceResponse.
105
+ Parameters
106
+ ----------
107
+ requests : list
108
+ A list of pb_utils.InferenceRequest
109
+ Returns
110
+ -------
111
+ list
112
+ A list of pb_utils.InferenceResponse. The length of this list must
113
+ be the same as `requests`
114
+ """
115
+
116
+ tokens_batch = []
117
+ sequence_lengths = []
118
+ for idx, request in enumerate(requests):
119
+ for input_tensor in request.inputs():
120
+ if input_tensor.name() == "TOKENS_BATCH":
121
+ tokens_batch.append(input_tensor.as_numpy())
122
+ elif input_tensor.name() == "SEQUENCE_LENGTH":
123
+ sequence_lengths.append(input_tensor.as_numpy())
124
+ else:
125
+ raise ValueError(f"unknown input {input_tensor.name}")
126
+
127
+ # batch decode
128
+ list_of_tokens = []
129
+ req_idx_offset = 0
130
+ req_idx_offsets = [req_idx_offset]
131
+ for idx, token_batch in enumerate(tokens_batch):
132
+ for batch_idx, beam_tokens in enumerate(token_batch):
133
+ for beam_idx, tokens in enumerate(beam_tokens):
134
+ seq_len = sequence_lengths[idx][batch_idx][beam_idx]
135
+ list_of_tokens.append(tokens[:seq_len])
136
+ req_idx_offset += 1
137
+
138
+ req_idx_offsets.append(req_idx_offset)
139
+
140
+ all_outputs = self.tokenizer.batch_decode(
141
+ list_of_tokens, skip_special_tokens=self.skip_special_tokens)
142
+
143
+ # construct responses
144
+ responses = []
145
+ for idx, request in enumerate(requests):
146
+ req_outputs = [
147
+ x.encode('utf8')
148
+ for x in all_outputs[req_idx_offsets[idx]:req_idx_offsets[idx +
149
+ 1]]
150
+ ]
151
+
152
+ output_tensor = pb_utils.Tensor(
153
+ 'OUTPUT',
154
+ np.array(req_outputs).astype(self.output_dtype))
155
+
156
+ outputs = [output_tensor]
157
+
158
+ # Create InferenceResponse. You can set an error here in case
159
+ # there was a problem with handling this inference request.
160
+ # Below is an example of how you can set errors in inference
161
+ # response:
162
+ #
163
+ # pb_utils.InferenceResponse(
164
+ # output_tensors=..., TritonError("An error occurred"))
165
+ inference_response = pb_utils.InferenceResponse(
166
+ output_tensors=outputs)
167
+ responses.append(inference_response)
168
+ # You should return a list of pb_utils.InferenceResponse. Length
169
+ # of this list must match the length of `requests` list.
170
+ return responses
171
+
172
+ def finalize(self):
173
+ """`finalize` is called only once when the model is being unloaded.
174
+ Implementing `finalize` function is optional. This function allows
175
+ the model to perform any necessary clean ups before exit.
176
+ """
177
+ print('Cleaning up...')
postprocessing/config.pbtxt ADDED
@@ -0,0 +1,70 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "postprocessing"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+ dynamic_batching {}
31
+ input [
32
+ {
33
+ name: "TOKENS_BATCH"
34
+ data_type: TYPE_INT32
35
+ dims: [ -1, -1 ]
36
+ },
37
+ {
38
+ name: "SEQUENCE_LENGTH"
39
+ data_type: TYPE_INT32
40
+ dims: [ -1 ]
41
+ }
42
+ ]
43
+ output [
44
+ {
45
+ name: "OUTPUT"
46
+ data_type: TYPE_STRING
47
+ dims: [ -1 ]
48
+ }
49
+ ]
50
+
51
+ parameters {
52
+ key: "tokenizer_dir"
53
+ value: {
54
+ string_value: "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
55
+ }
56
+ }
57
+
58
+ parameters {
59
+ key: "skip_special_tokens"
60
+ value: {
61
+ string_value: "True"
62
+ }
63
+ }
64
+
65
+ instance_group [
66
+ {
67
+ count: 1
68
+ kind: KIND_CPU
69
+ }
70
+ ]
preprocessing/1/__pycache__/model.cpython-312.pyc ADDED
Binary file (39.3 kB)
 
preprocessing/1/model.py ADDED
@@ -0,0 +1,908 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import base64
28
+ import io
29
+ import json
30
+ import os
31
+ from typing import List
32
+
33
+ import numpy as np
34
+ import requests
35
+ import triton_python_backend_utils as pb_utils
36
+ from PIL import Image
37
+ from transformers import AutoProcessor, AutoTokenizer, T5Tokenizer
38
+
39
+
40
+ class TritonPythonModel:
41
+ """Your Python model must use the same class name. Every Python model
42
+ that is created must have "TritonPythonModel" as the class name.
43
+ """
44
+
45
+ def initialize(self, args):
46
+ """`initialize` is called only once when the model is being loaded.
47
+ Implementing `initialize` function is optional. This function allows
48
+ the model to initialize any state associated with this model.
49
+ Parameters
50
+ ----------
51
+ args : dict
52
+ Both keys and values are strings. The dictionary keys and values are:
53
+ * model_config: A JSON string containing the model configuration
54
+ * model_instance_kind: A string containing model instance kind
55
+ * model_instance_device_id: A string containing model instance device ID
56
+ * model_repository: Model repository path
57
+ * model_version: Model version
58
+ * model_name: Model name
59
+ """
60
+ # Parse model configs
61
+ model_config = json.loads(args['model_config'])
62
+ tokenizer_dir = model_config['parameters']['tokenizer_dir'][
63
+ 'string_value']
64
+
65
+ add_special_tokens = model_config['parameters'].get(
66
+ 'add_special_tokens')
67
+ visual_model_path = model_config['parameters']['visual_model_path'][
68
+ 'string_value']
69
+ max_num_images = model_config['parameters'].get('max_num_images')
70
+
71
+ if max_num_images is not None:
72
+ max_num_images_str = max_num_images['string_value']
73
+ if max_num_images_str.isdigit():
74
+ self.max_num_images = int(max_num_images_str)
75
+ else:
76
+ print(
77
+ f"[TensorRT-LLM][WARNING] 'max_num_images' parameter is not set correctly (value is {max_num_images_str}). Will be set to None"
78
+ )
79
+ self.max_num_images = None
80
+ else:
81
+ print(
82
+ f"[TensorRT-LLM][WARNING] Don't setup 'max_num_images'. Set it as None by default."
83
+ )
84
+ self.max_num_images = None
85
+ if visual_model_path == "${visual_model_path}" or visual_model_path == "":
86
+ visual_model_path = None
87
+
88
+ if add_special_tokens is not None:
89
+ add_special_tokens_str = add_special_tokens['string_value'].lower()
90
+ if add_special_tokens_str in [
91
+ 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
92
+ ]:
93
+ self.add_special_tokens = add_special_tokens_str in [
94
+ 'true', '1', 't', 'y', 'yes'
95
+ ]
96
+ else:
97
+ print(
98
+ f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default."
99
+ )
100
+ self.add_special_tokens = True
101
+ else:
102
+ print(
103
+ f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default."
104
+ )
105
+ self.add_special_tokens = True
106
+
107
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
108
+ legacy=False,
109
+ padding_side='left',
110
+ trust_remote_code=True)
111
+
112
+ if isinstance(self.tokenizer, T5Tokenizer):
113
+ self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
114
+
115
+ if not self.tokenizer.pad_token:
116
+ self.tokenizer.pad_token = self.tokenizer.eos_token
117
+
118
+ self.tokenizer_end_id = self.tokenizer.encode(
119
+ self.tokenizer.eos_token, add_special_tokens=False)[0]
120
+ self.tokenizer_pad_id = self.tokenizer.encode(
121
+ self.tokenizer.pad_token, add_special_tokens=False)[0]
122
+ self.vocab_size = self.tokenizer.vocab_size
123
+
124
+ self.is_multimodal = False
125
+ self.model_type = None
126
+ self.vision_preprocessor = None
127
+
128
+ if visual_model_path is not None:
129
+ self.is_multimodal = True
130
+ visual_model_path = os.path.join(visual_model_path, 'config.json')
131
+ with open(visual_model_path, 'r') as f:
132
+ visual_model_config = json.load(f)
133
+ self.model_type = visual_model_config['builder_config'][
134
+ 'model_type']
135
+
136
+ assert self.model_type in [
137
+ 'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision'
138
+ ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila, mllama and llava_onevision. Got {self.model_type}."
139
+
140
+ assert self.model_type != 'llava_onevision' or self.max_num_images is None or self.max_num_images <= 1, "LLaVA-OneVision does not currently support multi-image inference."
141
+
142
+ llm_model_path = model_config['parameters']['gpt_model_path'][
143
+ 'string_value']
144
+ llm_model_path = os.path.join(llm_model_path, 'config.json')
145
+ with open(llm_model_path, 'r') as f:
146
+ llm_model_config = json.load(f)
147
+ self.vocab_size = int(
148
+ llm_model_config["pretrained_config"]["vocab_size"])
149
+ self._setup_ptable_shape(llm_model_config)
150
+
151
+ if self.model_type == 'mllama' or self.model_type == 'llava_onevision':
152
+ self.vision_preprocessor = VisionPreProcessor(
153
+ self.model_type,
154
+ AutoProcessor.from_pretrained(tokenizer_dir), model_config)
155
+
156
+ # Parse model output configs and convert Triton types to numpy types
157
+ output_names = [
158
+ "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
159
+ "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
160
+ "OUT_END_ID", "OUT_PAD_ID", "OUT_PROMPT_TABLE_EXTRA_IDS",
161
+ "PIXEL_VALUES", "IMAGE_SIZES"
162
+ ]
163
+ input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
164
+ for input_name in input_names:
165
+ setattr(
166
+ self,
167
+ input_name.lower() + "_dtype",
168
+ pb_utils.triton_string_to_numpy(
169
+ pb_utils.get_input_config_by_name(
170
+ model_config, input_name)['data_type']))
171
+
172
+ for output_name in output_names:
173
+ setattr(
174
+ self,
175
+ output_name.lower() + "_dtype",
176
+ pb_utils.triton_string_to_numpy(
177
+ pb_utils.get_output_config_by_name(
178
+ model_config, output_name)['data_type']))
179
+
180
+ def _setup_ptable_shape(self, llm_model_config):
181
+ max_prompt_embedding_table_size = llm_model_config['build_config'][
182
+ 'max_prompt_embedding_table_size']
183
+ max_batch_size = llm_model_config['build_config']['max_batch_size']
184
+
185
+ num_visual_features = max_prompt_embedding_table_size // max_batch_size
186
+ hidden_size = llm_model_config['pretrained_config']['hidden_size']
187
+ if self.max_num_images is not None:
188
+ num_visual_features = num_visual_features // self.max_num_images
189
+
190
+ self.ptable_shape = (-1, num_visual_features, hidden_size)
191
+
192
+ def execute(self, requests):
193
+ """`execute` must be implemented in every Python model. `execute`
194
+ function receives a list of pb_utils.InferenceRequest as the only
195
+ argument. This function is called when an inference is requested
196
+ for this model. Depending on the batching configuration (e.g. Dynamic
197
+ Batching) used, `requests` may contain multiple requests. Every
198
+ Python model must create one pb_utils.InferenceResponse for every
199
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
200
+ set the error argument when creating a pb_utils.InferenceResponse.
201
+ Parameters
202
+ ----------
203
+ requests : list
204
+ A list of pb_utils.InferenceRequest
205
+ Returns
206
+ -------
207
+ list
208
+ A list of pb_utils.InferenceResponse. The length of this list must
209
+ be the same as `requests`
210
+ """
211
+
212
+ responses = []
213
+
214
+ # Every Python backend must iterate over every one of the requests
215
+ # and create a pb_utils.InferenceResponse for each of them.
216
+ for idx, request in enumerate(requests):
217
+ # Get input tensors
218
+ query = pb_utils.get_input_tensor_by_name(request,
219
+ 'QUERY').as_numpy()
220
+ batch_size = query.shape[0]
221
+
222
+ decoder_query = pb_utils.get_input_tensor_by_name(
223
+ request, 'DECODER_QUERY')
224
+ if decoder_query is not None:
225
+ decoder_query = decoder_query.as_numpy()
226
+
227
+ request_output_len = pb_utils.get_input_tensor_by_name(
228
+ request, 'REQUEST_OUTPUT_LEN').as_numpy()
229
+
230
+ bad_words_dict = pb_utils.get_input_tensor_by_name(
231
+ request, 'BAD_WORDS_DICT')
232
+ if bad_words_dict is not None:
233
+ bad_words_dict = bad_words_dict.as_numpy()
234
+
235
+ stop_words_dict = pb_utils.get_input_tensor_by_name(
236
+ request, 'STOP_WORDS_DICT')
237
+ if stop_words_dict is not None:
238
+ stop_words_dict = stop_words_dict.as_numpy()
239
+
240
+ embedding_bias_words = pb_utils.get_input_tensor_by_name(
241
+ request, 'EMBEDDING_BIAS_WORDS')
242
+ if embedding_bias_words is not None:
243
+ embedding_bias_words = embedding_bias_words.as_numpy()
244
+
245
+ embedding_bias_weights = pb_utils.get_input_tensor_by_name(
246
+ request, 'EMBEDDING_BIAS_WEIGHTS')
247
+ if embedding_bias_weights is not None:
248
+ embedding_bias_weights = embedding_bias_weights.as_numpy()
249
+
250
+ # Take the end_id from the input tensors
251
+ # If not specified, use tokenizer to get end_id
252
+ end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID')
253
+ if end_id is not None:
254
+ end_id = end_id.as_numpy()
255
+ else:
256
+ end_id = [[self.tokenizer_end_id]] * batch_size
257
+
258
+ # Take the pad_id from the input tensors
259
+ # If not specified, use tokenizer to get pad_id
260
+ pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID')
261
+ if pad_id is not None:
262
+ pad_id = pad_id.as_numpy()
263
+ else:
264
+ pad_id = [[self.tokenizer_pad_id]] * batch_size
265
+
266
+ # Take the extra_id from the input tensors
267
+ # Extra id is used in kv cache reuse for p-tuning
268
+ prompt_table_extra_id = pb_utils.get_input_tensor_by_name(
269
+ request, 'PROMPT_TABLE_EXTRA_ID')
270
+ if prompt_table_extra_id is not None:
271
+ prompt_table_extra_id = prompt_table_extra_id.as_numpy()
272
+ assert prompt_table_extra_id.shape[
273
+ 0] == batch_size, "Prompt table extra id must have the same batch size as Query"
274
+ assert prompt_table_extra_id.shape[
275
+ 1] == 1, "Multiple IDs cannot be provided for a single image"
276
+
277
+ # Preprocessing vision input passed as a url or bytes tensor
278
+ img_urls = pb_utils.get_input_tensor_by_name(request, 'IMAGE_URL')
279
+ image_bytes = pb_utils.get_input_tensor_by_name(
280
+ request, 'IMAGE_BYTES')
281
+ video_bytes = pb_utils.get_input_tensor_by_name(
282
+ request, 'VIDEO_BYTES')
283
+ vision_processed_tensors = []
284
+ visual_tokens = []
285
+ if self.is_multimodal and (img_urls or image_bytes or video_bytes):
286
+ assert self.vision_preprocessor != None, "Vision preprocessor for preparing images before encoding is None"
287
+ processed_tensors = {}
288
+ if self.model_type == 'mllama':
289
+ processed_tensors = self.vision_preprocessor.mllama_process(
290
+ queries=query.astype(str).tolist(),
291
+ img_urls=img_urls,
292
+ image_bytes=image_bytes,
293
+ )
294
+ elif self.model_type == 'llava_onevision':
295
+ if video_bytes is None:
296
+ processed_tensors, visual_tokens = self.vision_preprocessor.llava_onevision_process_image(
297
+ queries=query.astype(str).tolist(),
298
+ img_urls=img_urls,
299
+ image_bytes=image_bytes,
300
+ )
301
+ else:
302
+ processed_tensors, visual_tokens = self.vision_preprocessor.llava_onevision_process_video(
303
+ queries=query.astype(str).tolist(),
304
+ video_bytes=video_bytes,
305
+ )
306
+ else:
307
+ raise ValueError(
308
+ "Unsupported model type for IMAGE_BYTES or IMAGE_URL inputs"
309
+ )
310
+ vision_processed_tensors = [
311
+ pb_utils.Tensor.from_dlpack(k, v)
312
+ for k, v in processed_tensors.items()
313
+ ]
314
+ else:
315
+ assert self.model_type != "llava_onevision", "Image processing requires IMAGE_BYTES or IMAGE_URL to be provided"
316
+
317
+ # Preprocessing input data.
318
+ # For the LLaVA_OneVision model, num_visual_features is not a fixed value
319
+ input_id, request_input_len = self._create_request(
320
+ query, visual_tokens)
321
+ if decoder_query is not None:
322
+ decoder_input_id, request_decoder_input_len = self._create_request(
323
+ decoder_query)
324
+ else:
325
+ decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32)
326
+ request_decoder_input_len = 1 * np.ones(
327
+ (batch_size, 1), np.int32)
328
+
329
+ bad_words = self._to_word_list_format(bad_words_dict, batch_size)
330
+ stop_words = self._to_word_list_format(stop_words_dict, batch_size)
331
+
332
+ embedding_bias = self._get_embedding_bias(
333
+ embedding_bias_words, embedding_bias_weights,
334
+ self.embedding_bias_weights_dtype, batch_size)
335
+
336
+ if prompt_table_extra_id is not None:
337
+ prompt_table_extra_ids = np.zeros_like(input_id)
338
+ for i in range(batch_size):
339
+ prompt_table_extra_ids[i] = np.where(
340
+ input_id[i] >= self.vocab_size,
341
+ prompt_table_extra_id[i], 0)
342
+
343
+ # Create output tensors. You need pb_utils.Tensor
344
+ # objects to create pb_utils.InferenceResponse.
345
+ input_id_tensor = pb_utils.Tensor(
346
+ 'INPUT_ID', input_id.astype(self.input_id_dtype))
347
+ request_input_len_tensor = pb_utils.Tensor(
348
+ 'REQUEST_INPUT_LEN',
349
+ request_input_len.astype(self.request_input_len_dtype))
350
+ decoder_input_id_tensor = pb_utils.Tensor(
351
+ 'DECODER_INPUT_ID',
352
+ decoder_input_id.astype(self.decoder_input_id_dtype))
353
+ request_decoder_input_len_tensor = pb_utils.Tensor(
354
+ 'REQUEST_DECODER_INPUT_LEN',
355
+ request_decoder_input_len.astype(
356
+ self.request_decoder_input_len_dtype))
357
+ request_output_len_tensor = pb_utils.Tensor(
358
+ 'REQUEST_OUTPUT_LEN', request_output_len)
359
+ bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
360
+ stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
361
+ stop_words)
362
+ embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
363
+ embedding_bias)
364
+ end_id_tensor = pb_utils.Tensor('OUT_END_ID',
365
+ np.array(end_id, dtype=np.int32))
366
+ pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
367
+ np.array(pad_id, dtype=np.int32))
368
+
369
+ if prompt_table_extra_id is not None:
370
+ prompt_table_extra_ids_tensor = pb_utils.Tensor(
371
+ 'OUT_PROMPT_TABLE_EXTRA_IDS',
372
+ np.array(prompt_table_extra_ids,
373
+ dtype=self.out_prompt_table_extra_ids_dtype))
374
+ inference_response = pb_utils.InferenceResponse(output_tensors=[
375
+ input_id_tensor, decoder_input_id_tensor,
376
+ bad_words_ids_tensor, stop_words_ids_tensor,
377
+ request_input_len_tensor, request_decoder_input_len_tensor,
378
+ request_output_len_tensor, embedding_bias_tensor,
379
+ end_id_tensor, pad_id_tensor, prompt_table_extra_ids_tensor
380
+ ] + vision_processed_tensors)
381
+ else:
382
+ inference_response = pb_utils.InferenceResponse(
383
+ output_tensors=[
384
+ input_id_tensor, decoder_input_id_tensor,
385
+ bad_words_ids_tensor, stop_words_ids_tensor,
386
+ request_input_len_tensor,
387
+ request_decoder_input_len_tensor,
388
+ request_output_len_tensor, embedding_bias_tensor,
389
+ end_id_tensor, pad_id_tensor
390
+ ] + vision_processed_tensors)
391
+ responses.append(inference_response)
392
+
393
+ # You should return a list of pb_utils.InferenceResponse. Length
394
+ # of this list must match the length of `requests` list.
395
+ return responses
396
+
397
+ def finalize(self):
398
+ """`finalize` is called only once when the model is being unloaded.
399
+ Implementing `finalize` function is optional. This function allows
400
+ the model to perform any necessary clean ups before exit.
401
+ """
402
+ print('Cleaning up...')
403
+
404
+ def _split_prompt_by_images(self,
405
+ concatenated_ids,
406
+ image_token_index=-200):
407
+ """
408
+ Splits tokenized prompts by image placeholders for each sample in the batch.
409
+
410
+ Args:
411
+ concatenated_ids (np.ndarray): A batch of concatenated token IDs, where image placeholders are indicated by `image_token_index`.
412
+
413
+ Returns:
414
+ List[List[np.ndarray]]: A list containing lists of token ID arrays for each prompt segment, per batch sample.
415
+ """
416
+ batch_splits = []
417
+ for batch in concatenated_ids:
418
+ zero_indices = np.where(batch == image_token_index)[0]
419
+ start_idx = 0
420
+ splits = []
421
+ for idx in zero_indices:
422
+ if start_idx != idx:
423
+ splits.append(batch[start_idx:idx].reshape(1, -1))
424
+ start_idx = idx + 1
425
+ if start_idx < len(batch):
426
+ splits.append(batch[start_idx:].reshape(1, -1))
427
+
428
+ splits = [split for split in splits if split.size > 0]
429
+ batch_splits.append(splits)
430
+
431
+ return batch_splits
432
+
433
+ def _setup_fake_prompts(self, batch_size, batch_split_prompts):
434
+ """
435
+ Replaces image placeholders with unique fake prompt IDs for multi-image inputs.
436
+
437
+ Args:
438
+ batch_size (int): The number of samples in the batch.
439
+ batch_split_prompts (List[List[np.ndarray]]): Tokenized prompt segments for each batch sample.
440
+
441
+ Returns:
442
+ np.ndarray: An array of input IDs with image placeholders replaced by fake prompt IDs.
443
+ """
444
+
445
+ num_visual_features = self.ptable_shape[1]
446
+ input_ids_list = []
447
+
448
+ for batch_idx in range(batch_size):
449
+ splits = batch_split_prompts[batch_idx]
450
+ sample_input_ids = [splits[0]]
451
+ sample_fake_prompt_counter = self.vocab_size
452
+
453
+ for split_idx in range(len(splits) - 1):
454
+ fake_prompt_id = np.arange(
455
+ sample_fake_prompt_counter,
456
+ sample_fake_prompt_counter + num_visual_features)
457
+ sample_fake_prompt_counter += num_visual_features
458
+ fake_prompt_id = np.expand_dims(fake_prompt_id, axis=0)
459
+ sample_input_ids.append(fake_prompt_id)
460
+ sample_input_ids.append(splits[split_idx + 1])
461
+
462
+ sample_input_ids = np.concatenate(sample_input_ids, axis=1)
463
+ input_ids_list.append(sample_input_ids)
464
+
465
+ # Pad the input_ids to the same length for bs > 1
466
+ max_seq_len = max(
467
+ [sample_input_ids.shape[1] for sample_input_ids in input_ids_list])
468
+ input_ids_padded = []
469
+ for sample_input_ids in input_ids_list:
470
+ seq_len = sample_input_ids.shape[1]
471
+ pad_width = max_seq_len - seq_len
472
+ if pad_width > 0:
473
+ sample_input_ids_padded = np.pad(
474
+ sample_input_ids, ((0, 0), (0, pad_width)),
475
+ 'constant',
476
+ constant_values=self.tokenizer_pad_id)
477
+ else:
478
+ sample_input_ids_padded = sample_input_ids
479
+ input_ids_padded.append(sample_input_ids_padded)
480
+
481
+ input_ids = np.stack(input_ids_padded)
482
+ input_ids = input_ids.reshape(batch_size, -1).astype(np.int32)
483
+
484
+ return input_ids
485
+
486
+ def _process_multi_image_inputs(self, query, image_token_index=-200):
487
+ """
488
+ Processes input queries that contain multiple images by tokenizing the input strings and inserting image_token_index between the parts.
489
+
490
+ Args:
491
+ query (np.ndarray): Batch of input strings.
492
+
493
+ Returns:
494
+ List[np.ndarray]: List of tokenized input IDs for each sample.
495
+ """
496
+ start_ids = []
497
+ for s in query:
498
+ parts = s[0].decode().split('<image>')
499
+ num_images = len(parts) - 1
500
+ if num_images > self.max_num_images:
501
+ raise ValueError(
502
+ f"The number of images in the request ({num_images}) exceeds the maximum allowed ({self.max_num_images})."
503
+ )
504
+ tokenized_parts = [
505
+ self.tokenizer.encode(part, add_special_tokens=False)
506
+ for part in parts
507
+ ]
508
+
509
+ # Insert `image_token_index` between the parts to represent <image>
510
+ final_ids = []
511
+ for i, part in enumerate(tokenized_parts):
512
+ final_ids.extend(part)
513
+ if i < len(tokenized_parts) - 1:
514
+ final_ids.append(image_token_index)
515
+
516
+ start_ids.append(np.array(final_ids).astype(int))
517
+
518
+ return start_ids
519
+
520
+ def _create_request(self, query, visual_tokens=None):
521
+ """
522
+ query : batch string (2D numpy array)
523
+ """
524
+ if isinstance(self.tokenizer, T5Tokenizer):
525
+ start_ids = [
526
+ np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
527
+ s[0].decode(), add_special_tokens=self.add_special_tokens)
528
+ ).astype(int) for s in query
529
+ ]
530
+ else:
531
+ if self.is_multimodal and self.max_num_images and self.max_num_images > 1:
532
+ start_ids = self._process_multi_image_inputs(query)
533
+
534
+ else:
535
+ start_ids = [
536
+ np.array(
537
+ self.tokenizer.encode(s[0].decode(),
538
+ add_special_tokens=self.
539
+ add_special_tokens)).astype(int)
540
+ for s in query
541
+ ]
542
+
543
+ if self.is_multimodal:
544
+ if 'blip2' in self.model_type or 'mllama' == self.model_type:
545
+ pre_prompt = None
546
+ post_prompt = None
547
+ elif 'llava' == self.model_type:
548
+ pre_prompt = "USER:\n"
549
+ post_prompt = " ASSISTANT:"
550
+ elif 'vila' == self.model_type:
551
+ pre_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: "
552
+ post_prompt = " ASSISTANT:"
553
+ elif 'llava_onevision' == self.model_type:
554
+ pre_prompt = "<|im_start|>user "
555
+ post_prompt = "<|im_end|><|im_start|>assistant\n"
556
+
557
+ pre_prompt_id = np.array(
558
+ self.tokenizer.encode(
559
+ pre_prompt,
560
+ add_special_tokens=self.add_special_tokens,
561
+ padding=True)) if pre_prompt is not None else np.array(
562
+ [], dtype=int)
563
+
564
+ post_prompt_id = np.array(
565
+ self.tokenizer.encode(
566
+ post_prompt,
567
+ add_special_tokens=self.add_special_tokens,
568
+ padding=True)) if post_prompt is not None else np.array(
569
+ [], dtype=int)
570
+
571
+ if self.max_num_images and self.max_num_images > 1:
572
+ concatenated_ids = [
573
+ np.concatenate((pre_prompt_id, ids, post_prompt_id),
574
+ axis=0) for ids in start_ids
575
+ ]
576
+ batch_split_prompts = self._split_prompt_by_images(
577
+ concatenated_ids)
578
+ start_ids = self._setup_fake_prompts(query.shape[0],
579
+ batch_split_prompts)
580
+ elif self.model_type == 'llava_onevision':
581
+ fake_prompt_ids = []
582
+ extra_id = np.array(
583
+ self.tokenizer.encode(
584
+ '\n',
585
+ add_special_tokens=self.add_special_tokens,
586
+ padding=True))
587
+ for tokens in visual_tokens:
588
+ prompt_id = np.arange(self.vocab_size,
589
+ self.vocab_size + tokens)
590
+ fake_prompt_ids.append(prompt_id)
591
+ start_ids = [
592
+ np.concatenate((pre_prompt_id, prompt_id, extra_id, ids,
593
+ post_prompt_id),
594
+ axis=0)
595
+ for prompt_id, ids in zip(fake_prompt_ids, start_ids)
596
+ ]
597
+ else:
598
+ fake_prompt_id = np.arange(
599
+ self.vocab_size, self.vocab_size + self.ptable_shape[1])
600
+ start_ids = [
601
+ np.concatenate(
602
+ (pre_prompt_id, fake_prompt_id, ids, post_prompt_id),
603
+ axis=0) for ids in start_ids
604
+ ]
605
+
606
+ start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
607
+
608
+ max_len = 0
609
+ for seq in start_ids:
610
+ max_len = max(max_len, seq.shape[0])
611
+ start_ids = np.stack([
612
+ np.pad(seq, (0, max_len - seq.shape[0]),
613
+ 'constant',
614
+ constant_values=(0, self.tokenizer_pad_id))
615
+ for seq in start_ids
616
+ ])
617
+
618
+ return start_ids, start_lengths
619
+
620
+ def _to_word_list_format(self, word_lists: List[List[str | bytes]],
621
+ batch_size):
622
+ '''
623
+ word_lists format:
624
+ len(word_lists) == batch_size
625
+ word_lists[i] means the words associated with batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
626
+ '''
627
+ assert self.tokenizer != None, "need to set tokenizer"
628
+
629
+ if word_lists is None:
630
+ # Return an empty array of shape (1,2,0)
631
+ return np.empty([batch_size, 2, 0], dtype="int32")
632
+
633
+ flat_ids = []
634
+ offsets = []
635
+ for word_list in word_lists:
636
+ item_flat_ids = []
637
+ item_offsets = []
638
+
639
+ for word in word_list:
640
+ if isinstance(word, bytes):
641
+ word = word.decode()
642
+
643
+ ids = self.tokenizer.encode(word, add_special_tokens=False)
644
+ if len(ids) == 0:
645
+ continue
646
+
647
+ item_flat_ids += ids
648
+ item_offsets.append(len(ids))
649
+
650
+ flat_ids.append(np.array(item_flat_ids))
651
+ offsets.append(np.cumsum(np.array(item_offsets)))
652
+
653
+ pad_to = max(1, max(len(ids) for ids in flat_ids))
654
+
655
+ for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
656
+ flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
657
+ constant_values=0)
658
+ offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
659
+ constant_values=-1)
660
+
661
+ return np.array([flat_ids, offsets], dtype="int32").transpose(
662
+ (1, 0, 2))
663
+
664
+ def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
665
+ bias_dtype, batch_size):
666
+
667
+ assert self.tokenizer != None, "need to set tokenizer"
668
+
669
+ if embedding_bias_words is None or embedding_bias_weights is None:
670
+ return np.empty([batch_size, 0],
671
+ dtype=self.embedding_bias_weights_dtype)
672
+
673
+ batch_embedding_bias = []
674
+ for words, weights in zip(embedding_bias_words,
675
+ embedding_bias_weights):
676
+
677
+ vocab_size = len(self.tokenizer.vocab)
678
+ embedding_bias = [0.] * vocab_size
679
+
680
+ assert len(words) == len(
681
+ weights
682
+ ), "Embedding bias words must have same dimension as embedding bias weights"
683
+
684
+ for word, weight in zip(words, weights):
685
+ if isinstance(word, bytes):
686
+ word = word.decode()
687
+ ids = self.tokenizer.encode(word)
688
+
689
+ if len(ids) == 0:
690
+ continue
691
+
692
+ for id in ids:
693
+ embedding_bias[id] += weight
694
+
695
+ batch_embedding_bias.append(np.array(embedding_bias))
696
+
697
+ return np.array(batch_embedding_bias, dtype=bias_dtype)
698
+
699
+
700
+ class VisionPreProcessor:
701
+ """ A class that can load images from url requests, and process them via a vision model processor,
702
+ in preparation for the vision encoder.
703
+ """
704
+
705
+ def __init__(self,
706
+ vision_model_type,
707
+ vision_model_processor,
708
+ preprocessor_model_config={}):
709
+ # import libraries that are only relevant for multimodal models
710
+ import torch
711
+ from torch.utils.dlpack import from_dlpack
712
+
713
+ # NOTE: Due to the behavior of MPI initialization, it is recommended to avoid using import tensorrt_llm
714
+ # except for the specific modules tensorrt_llm and multimodal_encoders.
715
+ # As a result, the function str_dtype_to_torch has been copied directly from tensorrt_llm._utils.
716
+ _str_to_torch_dtype_dict = dict(
717
+ bfloat16=torch.bfloat16,
718
+ float16=torch.float16,
719
+ float32=torch.float32,
720
+ int64=torch.int64,
721
+ int32=torch.int32,
722
+ int8=torch.int8,
723
+ bool=torch.bool,
724
+ fp8=torch.float8_e4m3fn,
725
+ )
726
+
727
+ def str_dtype_to_torch(dtype):
728
+ ret = _str_to_torch_dtype_dict.get(dtype)
729
+ assert ret is not None, f'Unsupported dtype: {dtype}'
730
+ return ret
731
+
732
+ self.load_images_tensor = lambda tensor: tensor if not hasattr(
733
+ tensor, 'to_dlpack') else from_dlpack(tensor.to_dlpack())
734
+
735
+ # extract expected output tensor dtype
736
+ self.output_str_dtypes = {}
737
+ for properties in preprocessor_model_config.get('output', []):
738
+ dtype = properties['data_type']
739
+ self.output_str_dtypes[properties['name']] = np.dtype(
740
+ pb_utils.triton_string_to_numpy(dtype)).name
741
+
742
+ # create method for converting output tensors batch to the expected type
743
+ self.convert_tensor_list_to_tensor = lambda tensor_list: torch.concat(
744
+ [
745
+ torch.from_numpy(x) if isinstance(x, np.ndarray) else x
746
+ for x in tensor_list
747
+ ],
748
+ dim=0)
749
+ self.convert_tensor_to_str_dtype = lambda tensor, dtype: tensor.to(
750
+ str_dtype_to_torch(dtype))
751
+
752
+ # create model-specific processor
753
+ self.vision_model_processor = vision_model_processor
754
+ self.vision_model_type = vision_model_type
755
+
756
+ def load_images_from_urls(self, img_urls):
757
+ images = []
758
+ for img_url in img_urls:
759
+ img_url = img_url.decode()
760
+ if img_url.startswith("data:image/jpeg;base64,"):
761
+ image_base64 = img_url.split(",")[1]
762
+ # Decode the base64 string
763
+ image_data = base64.b64decode(image_base64)
764
+ # Create a BytesIO object from the decoded data
765
+ image_buffer = io.BytesIO(image_data)
766
+ images.append(Image.open(image_buffer))
767
+ else:
768
+ images.append(
769
+ Image.open(requests.get(img_url, stream=True).raw))
770
+ return images
771
+
772
+ def mllama_process(self, queries, img_urls=None, image_bytes=None):
773
+ vision_processed_tensors = {}
774
+ if img_urls is not None or image_bytes is not None:
775
+ if img_urls is not None:
776
+ # download and read images
777
+ images = [
778
+ self.load_images_from_urls(urls)
779
+ for urls in img_urls.as_numpy()
780
+ ]
781
+ else:
782
+ images = [
783
+ img for img_list in self.load_images_tensor(image_bytes)
784
+ for img in img_list
785
+ ]
786
+
787
+ batch_size = len(images)
788
+
789
+ preprocessor_outputs = {}
790
+ possible_output_names = [
791
+ 'PIXEL_VALUES', 'ASPECT_RATIO_IDS', 'ASPECT_RATIO_MASK',
792
+ 'CROSS_ATTENTION_MASK'
793
+ ]
794
+ for batch_id in range(batch_size):
795
+ # Preprocess images and query
796
+ processed_vision_data = self.vision_model_processor(
797
+ images=images[batch_id],
798
+ text=queries[batch_id],
799
+ return_tensors="pt")
800
+
801
+ # Reshape pixel_values to [num_images, *HWC/CHW]
802
+ val = processed_vision_data["pixel_values"]
803
+
804
+ val = val.reshape(1, -1, *(val.shape[-3:]))
805
+ processed_vision_data["pixel_values"] = val
806
+ # Create vision output tensors
807
+ for key in possible_output_names:
808
+ val = processed_vision_data.get(key.lower())
809
+ if val is not None:
810
+ if key not in preprocessor_outputs:
811
+ preprocessor_outputs[key] = []
812
+ preprocessor_outputs[key].append(val)
813
+
814
+ for key, tensor_list in preprocessor_outputs.items():
815
+ val = self.convert_tensor_list_to_tensor(tensor_list)
816
+ if key in self.output_str_dtypes:
817
+ val = self.convert_tensor_to_str_dtype(
818
+ val, self.output_str_dtypes[key])
819
+ vision_processed_tensors[key] = val
820
+ return vision_processed_tensors
821
+
822
+ def llava_onevision_process_image(self,
823
+ queries,
824
+ img_urls=None,
825
+ image_bytes=None):
826
+
827
+ import torch
828
+ vision_processed_tensors = {}
829
+ if img_urls is not None:
830
+ # download and read images
831
+ images = [
832
+ self.load_images_from_urls(urls)
833
+ for urls in img_urls.as_numpy()
834
+ ]
835
+ else:
836
+ images = [
837
+ img for img_list in self.load_images_tensor(image_bytes)
838
+ for img in img_list
839
+ ]
840
+
841
+ batch_size = len(images)
842
+ assert len(
843
+ queries
844
+ ) == batch_size, f"Image must have the same batch size as Query."
845
+ preprocessor_outputs = {}
846
+ possible_output_names = ['PIXEL_VALUES', 'IMAGE_SIZES']
847
+ visual_tokens = []
848
+ for batch_id in range(batch_size):
849
+ # Preprocess images and query
850
+ processed_vision_data = self.vision_model_processor(
851
+ images=images[batch_id], text='<image>', return_tensors="pt")
852
+ visual_tokens.append(processed_vision_data['input_ids'].shape[1])
853
+
854
+ # Create vision output tensors
855
+ for key in possible_output_names:
856
+ val = processed_vision_data.get(key.lower())
857
+ if val is not None:
858
+ if key not in preprocessor_outputs:
859
+ preprocessor_outputs[key] = []
860
+ preprocessor_outputs[key].append(val)
861
+
862
+ max_patch = max(x.shape[1]
863
+ for x in preprocessor_outputs['PIXEL_VALUES'])
864
+ preprocessor_outputs['PIXEL_VALUES'] = [
865
+ torch.nn.functional.pad(
866
+ image, (0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[1], 0, 0),
867
+ mode='constant')
868
+ for image in preprocessor_outputs['PIXEL_VALUES']
869
+ ]
870
+ for key, tensor_list in preprocessor_outputs.items():
871
+ val = self.convert_tensor_list_to_tensor(tensor_list)
872
+ if key in self.output_str_dtypes:
873
+ val = self.convert_tensor_to_str_dtype(
874
+ val, self.output_str_dtypes[key])
875
+ vision_processed_tensors[key] = val
876
+ return vision_processed_tensors, visual_tokens
877
+
878
+ def llava_onevision_process_video(self, queries, video_bytes=None):
879
+ import torch
880
+ vision_processed_tensors = {}
881
+ videos = [video for video in self.load_images_tensor(video_bytes)]
882
+
883
+ batch_size = len(videos)
884
+ assert len(
885
+ queries
886
+ ) == batch_size, f"Video must have the same batch size as Query."
887
+ preprocessor_outputs = {}
888
+ preprocessor_outputs['PIXEL_VALUES'] = []
889
+ preprocessor_outputs['IS_VIDEO_INPUT'] = []
890
+ visual_tokens = []
891
+ for batch_id in range(len(queries)):
892
+ processed_vision_data = self.vision_model_processor(
893
+ videos=list(videos[batch_id]),
894
+ text='<video>',
895
+ return_tensors="pt")
896
+ visual_tokens.append(processed_vision_data['input_ids'].shape[1])
897
+ preprocessor_outputs['PIXEL_VALUES'].append(
898
+ processed_vision_data['pixel_values_videos'])
899
+ preprocessor_outputs['IS_VIDEO_INPUT'].append(
900
+ torch.ones((1, 1), dtype=torch.bool))
901
+
902
+ for key, tensor_list in preprocessor_outputs.items():
903
+ val = self.convert_tensor_list_to_tensor(tensor_list)
904
+ if key in self.output_str_dtypes:
905
+ val = self.convert_tensor_to_str_dtype(
906
+ val, self.output_str_dtypes[key])
907
+ vision_processed_tensors[key] = val
908
+ return vision_processed_tensors, visual_tokens
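Note: _to_word_list_format above packs each batch item's words into the [batch_size, 2, max_len] layout used by the BAD_WORDS_IDS and STOP_WORDS_IDS tensors declared in the preprocessing config below: row 0 holds the flattened token ids, row 1 the cumulative end offsets (ids are padded with 0, offsets with -1). The sketch below is illustrative only and is not part of the uploaded files; convert_word_list in tensorrt_llm/1/model.py performs the equivalent unpacking on the server side.

    # Illustrative sketch: unpack one [2, max_len] word-list slice.
    import numpy as np

    def unpack_word_list(word_list_slice):
        ids, offsets = word_list_slice[0], word_list_slice[1]
        words, start = [], 0
        for end in offsets:
            if end < 0:  # -1 padding marks the end of the valid offsets
                break
            words.append(ids[start:end].tolist())
            start = int(end)
        return words

    # Two "words" of 2 and 1 token ids -> [[11, 12], [13]]
    print(unpack_word_list(np.array([[11, 12, 13], [2, 3, -1]], dtype=np.int32)))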
preprocessing/config.pbtxt ADDED
@@ -0,0 +1,240 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "preprocessing"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+
31
+
32
+ input [
33
+ {
34
+ name: "QUERY"
35
+ data_type: TYPE_STRING
36
+ dims: [ 1 ]
37
+ },
38
+ {
39
+ name: "DECODER_QUERY"
40
+ data_type: TYPE_STRING
41
+ dims: [ 1 ]
42
+ optional: true
43
+ },
44
+ {
45
+ name: "IMAGE_BYTES"
46
+ data_type: TYPE_UINT8
47
+ dims: [ -1, -1, -1, -1 ]
48
+ optional: true
49
+ },
50
+ {
51
+ name: "IMAGE_URL"
52
+ data_type: TYPE_STRING
53
+ dims: [ 1 ]
54
+ optional: true
55
+ },
56
+ {
57
+ name: "VIDEO_BYTES"
58
+ data_type: TYPE_UINT8
59
+ dims: [ -1, -1, -1, -1 ]
60
+ optional: true
61
+ },
62
+ {
63
+ name: "REQUEST_OUTPUT_LEN"
64
+ data_type: TYPE_INT32
65
+ dims: [ 1 ]
66
+ },
67
+ {
68
+ name: "BAD_WORDS_DICT"
69
+ data_type: TYPE_STRING
70
+ dims: [ -1 ]
71
+ optional: true
72
+ },
73
+ {
74
+ name: "STOP_WORDS_DICT"
75
+ data_type: TYPE_STRING
76
+ dims: [ -1 ]
77
+ optional: true
78
+ },
79
+ {
80
+ name: "EMBEDDING_BIAS_WORDS"
81
+ data_type: TYPE_STRING
82
+ dims: [ -1 ]
83
+ optional: true
84
+ },
85
+ {
86
+ name: "EMBEDDING_BIAS_WEIGHTS"
87
+ data_type: TYPE_FP32
88
+ dims: [ -1 ]
89
+ optional: true
90
+ },
91
+ {
92
+ name: "END_ID"
93
+ data_type: TYPE_INT32
94
+ dims: [ 1 ]
95
+ optional: true
96
+ },
97
+ {
98
+ name: "PAD_ID"
99
+ data_type: TYPE_INT32
100
+ dims: [ 1 ]
101
+ optional: true
102
+ },
103
+ {
104
+ name: "PROMPT_TABLE_EXTRA_ID"
105
+ data_type: TYPE_UINT64
106
+ dims: [ 1 ]
107
+ optional: true
108
+ }
109
+ ]
110
+ output [
111
+ {
112
+ name: "INPUT_ID"
113
+ data_type: TYPE_INT32
114
+ dims: [ -1 ]
115
+ },
116
+ {
117
+ name: "REQUEST_INPUT_LEN"
118
+ data_type: TYPE_INT32
119
+ dims: [ 1 ]
120
+ },
121
+ {
122
+ name: "DECODER_INPUT_ID"
123
+ data_type: TYPE_INT32
124
+ dims: [ -1 ]
125
+ },
126
+ {
127
+ name: "REQUEST_DECODER_INPUT_LEN"
128
+ data_type: TYPE_INT32
129
+ dims: [ 1 ]
130
+ },
131
+ {
132
+ name: "BAD_WORDS_IDS"
133
+ data_type: TYPE_INT32
134
+ dims: [ 2, -1 ]
135
+ },
136
+ {
137
+ name: "STOP_WORDS_IDS"
138
+ data_type: TYPE_INT32
139
+ dims: [ 2, -1 ]
140
+ },
141
+ {
142
+ name: "EMBEDDING_BIAS"
143
+ data_type: TYPE_FP32
144
+ dims: [ -1 ]
145
+ },
146
+ {
147
+ name: "REQUEST_OUTPUT_LEN"
148
+ data_type: TYPE_INT32
149
+ dims: [ -1 ]
150
+ },
151
+ {
152
+ name: "OUT_END_ID"
153
+ data_type: TYPE_INT32
154
+ dims: [ 1 ]
155
+ },
156
+ {
157
+ name: "OUT_PAD_ID"
158
+ data_type: TYPE_INT32
159
+ dims: [ 1 ]
160
+ },
161
+ {
162
+ name: "OUT_PROMPT_TABLE_EXTRA_IDS"
163
+ data_type: TYPE_UINT64
164
+ dims: [ -1 ]
165
+ },
166
+ {
167
+ name: "PIXEL_VALUES"
168
+ data_type: TYPE_FP16
169
+ dims: [ -1, -1, -1, -1 ]
170
+ },
171
+ {
172
+ name: "ASPECT_RATIO_IDS"
173
+ data_type: TYPE_INT64
174
+ dims: [ -1 ]
175
+ },
176
+ {
177
+ name: "ASPECT_RATIO_MASK"
178
+ data_type: TYPE_INT64
179
+ dims: [ -1, -1 ]
180
+ },
181
+ {
182
+ name: "CROSS_ATTENTION_MASK"
183
+ data_type: TYPE_INT64
184
+ dims: [ -1, -1, -1 ]
185
+ },
186
+ # Required for image postprocessing in the llava_onevision model
187
+ {
188
+ name: "IMAGE_SIZES"
189
+ data_type: TYPE_INT64
190
+ dims: [ 2 ]
191
+ },
192
+ # Indicates if the input is video in the llava_onevision model
193
+ {
194
+ name: "IS_VIDEO_INPUT"
195
+ data_type: TYPE_BOOL
196
+ dims: [ 1 ]
197
+ }
198
+ ]
199
+
200
+ parameters {
201
+ key: "tokenizer_dir"
202
+ value: {
203
+ string_value: "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
204
+ }
205
+ }
206
+
207
+ parameters {
208
+ key: "add_special_tokens"
209
+ value: {
210
+ string_value: "False"
211
+ }
212
+ }
213
+
214
+ parameters {
215
+ key: "visual_model_path"
216
+ value: {
217
+ string_value: "${visual_model_path}"
218
+ }
219
+ }
220
+
221
+ parameters: {
222
+ key: "gpt_model_path"
223
+ value: {
224
+ string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
225
+ }
226
+ }
227
+
228
+ parameters: {
229
+ key: "max_num_images"
230
+ value: {
231
+ string_value: "${max_num_images}"
232
+ }
233
+ }
234
+
235
+ instance_group [
236
+ {
237
+ count: 1
238
+ kind: KIND_CPU
239
+ }
240
+ ]
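For reference, a minimal client-side sketch for exercising the preprocessing model defined above. The server URL and the example prompt are assumptions (a Triton server hosting this repository on localhost:8000 with tritonclient installed); only the two required inputs, QUERY and REQUEST_OUTPUT_LEN, are set.

    # Minimal sketch of a request to the "preprocessing" model (assumed server at localhost:8000).
    import numpy as np
    import tritonclient.http as httpclient

    client = httpclient.InferenceServerClient(url="localhost:8000")

    query = np.array([["What is the capital of France?"]], dtype=object)  # QUERY, TYPE_STRING, dims [1]
    output_len = np.array([[64]], dtype=np.int32)                         # REQUEST_OUTPUT_LEN, dims [1]

    inputs = [
        httpclient.InferInput("QUERY", list(query.shape), "BYTES"),
        httpclient.InferInput("REQUEST_OUTPUT_LEN", list(output_len.shape), "INT32"),
    ]
    inputs[0].set_data_from_numpy(query)
    inputs[1].set_data_from_numpy(output_len)

    result = client.infer("preprocessing", inputs)
    print(result.as_numpy("INPUT_ID"))           # tokenized prompt ids
    print(result.as_numpy("REQUEST_INPUT_LEN"))  # number of prompt tokens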
tensorrt_llm/1/.gitkeep ADDED
File without changes
tensorrt_llm/1/config.json ADDED
@@ -0,0 +1,362 @@
1
+ {
2
+ "version": "0.18.0.dev2025020400",
3
+ "pretrained_config": {
4
+ "mlp_bias": false,
5
+ "attn_bias": false,
6
+ "rotary_base": 500000.0,
7
+ "rotary_scaling": {
8
+ "factor": 8.0,
9
+ "high_freq_factor": 4.0,
10
+ "low_freq_factor": 1.0,
11
+ "original_max_position_embeddings": 8192,
12
+ "rope_type": "llama3"
13
+ },
14
+ "residual_mlp": false,
15
+ "disable_weight_only_quant_plugin": false,
16
+ "moe": {
17
+ "num_experts": 0,
18
+ "shared_expert_intermediate_size": 0,
19
+ "top_k": 0,
20
+ "normalization_mode": null,
21
+ "sparse_mixer_epsilon": 0.01,
22
+ "tp_mode": 0,
23
+ "device_limited_n_group": 0,
24
+ "device_limited_topk_group": 0,
25
+ "device_limited_routed_scaling_factor": 1.0
26
+ },
27
+ "remove_duplicated_kv_heads": false,
28
+ "fc_after_embed": false,
29
+ "use_input_layernorm_in_first_layer": true,
30
+ "use_last_layernorm": true,
31
+ "layer_idx_offset": 0,
32
+ "embedding_multiplier": 1.0,
33
+ "attention_multiplier": 1.0,
34
+ "residual_multiplier": 1.0,
35
+ "output_multiplier_scale": 1.0,
36
+ "has_partial_lora_mask": false,
37
+ "architecture": "LlamaForCausalLM",
38
+ "dtype": "float16",
39
+ "vocab_size": 128256,
40
+ "hidden_size": 8192,
41
+ "num_hidden_layers": 80,
42
+ "num_attention_heads": 64,
43
+ "hidden_act": "silu",
44
+ "logits_dtype": "float16",
45
+ "norm_epsilon": 1e-05,
46
+ "runtime_defaults": null,
47
+ "position_embedding_type": "rope_gpt_neox",
48
+ "num_key_value_heads": 8,
49
+ "intermediate_size": 28672,
50
+ "max_position_embeddings": 131072,
51
+ "mapping": {
52
+ "world_size": 2,
53
+ "gpus_per_node": 8,
54
+ "cp_size": 1,
55
+ "tp_size": 2,
56
+ "pp_size": 1,
57
+ "moe_tp_size": 2,
58
+ "moe_ep_size": 1,
59
+ "auto_parallel": false
60
+ },
61
+ "quantization": {
62
+ "quant_algo": "FP8",
63
+ "kv_cache_quant_algo": "FP8",
64
+ "group_size": 128,
65
+ "smoothquant_val": 0.5,
66
+ "clamp_val": null,
67
+ "use_meta_recipe": false,
68
+ "has_zero_point": false,
69
+ "pre_quant_scale": false,
70
+ "exclude_modules": [
71
+ "transformer.layers.33.input_layernorm",
72
+ "transformer.layers.58.post_layernorm",
73
+ "transformer.layers.43.post_layernorm",
74
+ "transformer.layers.45.input_layernorm",
75
+ "transformer.layers.8.post_layernorm",
76
+ "transformer.layers.79.input_layernorm",
77
+ "transformer.layers.70.post_layernorm",
78
+ "transformer.layers.73.input_layernorm",
79
+ "transformer.layers.19.input_layernorm",
80
+ "transformer.layers.46.input_layernorm",
81
+ "transformer.layers.48.input_layernorm",
82
+ "transformer.layers.67.post_layernorm",
83
+ "transformer.layers.12.input_layernorm",
84
+ "transformer.layers.60.post_layernorm",
85
+ "transformer.layers.17.post_layernorm",
86
+ "transformer.layers.57.input_layernorm",
87
+ "transformer.layers.0.input_layernorm",
88
+ "transformer.layers.49.input_layernorm",
89
+ "transformer.layers.4.post_layernorm",
90
+ "transformer.layers.39.post_layernorm",
91
+ "transformer.layers.73.post_layernorm",
92
+ "transformer.layers.44.post_layernorm",
93
+ "transformer.layers.13.input_layernorm",
94
+ "transformer.layers.56.post_layernorm",
95
+ "transformer.layers.62.post_layernorm",
96
+ "transformer.layers.42.post_layernorm",
97
+ "transformer.layers.27.input_layernorm",
98
+ "transformer.layers.22.post_layernorm",
99
+ "transformer.layers.77.input_layernorm",
100
+ "transformer.layers.51.input_layernorm",
101
+ "transformer.layers.21.post_layernorm",
102
+ "transformer.layers.54.post_layernorm",
103
+ "transformer.layers.22.input_layernorm",
104
+ "transformer.layers.47.input_layernorm",
105
+ "transformer.layers.15.input_layernorm",
106
+ "transformer.layers.7.input_layernorm",
107
+ "transformer.layers.63.input_layernorm",
108
+ "transformer.layers.70.input_layernorm",
109
+ "transformer.layers.5.input_layernorm",
110
+ "transformer.layers.29.post_layernorm",
111
+ "transformer.vocab_embedding",
112
+ "transformer.layers.2.post_layernorm",
113
+ "transformer.layers.11.post_layernorm",
114
+ "transformer.layers.54.input_layernorm",
115
+ "transformer.layers.45.post_layernorm",
116
+ "transformer.layers.78.post_layernorm",
117
+ "transformer.layers.23.post_layernorm",
118
+ "transformer.layers.30.input_layernorm",
119
+ "transformer.layers.58.input_layernorm",
120
+ "transformer.layers.18.input_layernorm",
121
+ "transformer.layers.3.input_layernorm",
122
+ "transformer.layers.7.post_layernorm",
123
+ "transformer.layers.77.post_layernorm",
124
+ "transformer.layers.47.post_layernorm",
125
+ "transformer.layers.38.input_layernorm",
126
+ "transformer.layers.41.post_layernorm",
127
+ "transformer.layers.55.post_layernorm",
128
+ "transformer.layers.64.post_layernorm",
129
+ "transformer.layers.57.post_layernorm",
130
+ "transformer.layers.29.input_layernorm",
131
+ "transformer.layers.28.input_layernorm",
132
+ "transformer.layers.9.input_layernorm",
133
+ "transformer.layers.43.input_layernorm",
134
+ "transformer.layers.28.post_layernorm",
135
+ "transformer.layers.52.post_layernorm",
136
+ "transformer.layers.17.input_layernorm",
137
+ "transformer.layers.19.post_layernorm",
138
+ "transformer.layers.15.post_layernorm",
139
+ "transformer.layers.25.post_layernorm",
140
+ "transformer.layers.32.input_layernorm",
141
+ "transformer.layers.76.post_layernorm",
142
+ "transformer.layers.16.input_layernorm",
143
+ "transformer.layers.75.post_layernorm",
144
+ "transformer.layers.62.input_layernorm",
145
+ "transformer.layers.50.input_layernorm",
146
+ "transformer.layers.35.input_layernorm",
147
+ "transformer.layers.59.input_layernorm",
148
+ "transformer.layers.68.post_layernorm",
149
+ "transformer.layers.40.post_layernorm",
150
+ "transformer.layers.10.post_layernorm",
151
+ "transformer.layers.50.post_layernorm",
152
+ "transformer.layers.14.input_layernorm",
153
+ "transformer.layers.61.post_layernorm",
154
+ "transformer.layers.41.input_layernorm",
155
+ "transformer.layers.3.post_layernorm",
156
+ "transformer.layers.69.input_layernorm",
157
+ "transformer.layers.2.input_layernorm",
158
+ "transformer.layers.1.post_layernorm",
159
+ "transformer.layers.14.post_layernorm",
160
+ "transformer.layers.1.input_layernorm",
161
+ "transformer.layers.53.input_layernorm",
162
+ "transformer.layers.65.input_layernorm",
163
+ "lm_head",
164
+ "transformer.layers.32.post_layernorm",
165
+ "transformer.layers.11.input_layernorm",
166
+ "transformer.layers.59.post_layernorm",
167
+ "transformer.layers.37.input_layernorm",
168
+ "transformer.ln_f",
169
+ "transformer.layers.4.input_layernorm",
170
+ "transformer.layers.34.post_layernorm",
171
+ "transformer.layers.78.input_layernorm",
172
+ "transformer.layers.44.input_layernorm",
173
+ "transformer.layers.48.post_layernorm",
174
+ "transformer.layers.20.post_layernorm",
175
+ "transformer.layers.49.post_layernorm",
176
+ "transformer.layers.42.input_layernorm",
177
+ "transformer.layers.66.post_layernorm",
178
+ "transformer.layers.74.input_layernorm",
179
+ "transformer.layers.20.input_layernorm",
180
+ "transformer.layers.5.post_layernorm",
181
+ "transformer.layers.69.post_layernorm",
182
+ "transformer.layers.35.post_layernorm",
183
+ "transformer.layers.56.input_layernorm",
184
+ "transformer.layers.79.post_layernorm",
185
+ "transformer.layers.31.post_layernorm",
186
+ "transformer.layers.60.input_layernorm",
187
+ "transformer.layers.36.post_layernorm",
188
+ "transformer.layers.23.input_layernorm",
189
+ "transformer.layers.26.post_layernorm",
190
+ "transformer.layers.66.input_layernorm",
191
+ "transformer.layers.68.input_layernorm",
192
+ "transformer.layers.52.input_layernorm",
193
+ "transformer.layers.72.input_layernorm",
194
+ "transformer.layers.26.input_layernorm",
195
+ "transformer.layers.9.post_layernorm",
196
+ "transformer.layers.71.post_layernorm",
197
+ "transformer.layers.72.post_layernorm",
198
+ "transformer.layers.18.post_layernorm",
199
+ "transformer.layers.6.input_layernorm",
200
+ "transformer.layers.33.post_layernorm",
201
+ "transformer.layers.51.post_layernorm",
202
+ "transformer.layers.76.input_layernorm",
203
+ "transformer.layers.64.input_layernorm",
204
+ "transformer.layers.16.post_layernorm",
205
+ "transformer.layers.25.input_layernorm",
206
+ "transformer.layers.0.post_layernorm",
207
+ "transformer.layers.38.post_layernorm",
208
+ "transformer.layers.63.post_layernorm",
209
+ "transformer.layers.12.post_layernorm",
210
+ "transformer.layers.30.post_layernorm",
211
+ "transformer.layers.67.input_layernorm",
212
+ "transformer.layers.46.post_layernorm",
213
+ "transformer.layers.24.input_layernorm",
214
+ "transformer.layers.53.post_layernorm",
215
+ "transformer.layers.74.post_layernorm",
216
+ "transformer.layers.71.input_layernorm",
217
+ "transformer.layers.55.input_layernorm",
218
+ "transformer.layers.6.post_layernorm",
219
+ "transformer.layers.40.input_layernorm",
220
+ "transformer.layers.13.post_layernorm",
221
+ "transformer.layers.27.post_layernorm",
222
+ "transformer.layers.8.input_layernorm",
223
+ "transformer.layers.24.post_layernorm",
224
+ "transformer.layers.37.post_layernorm",
225
+ "transformer.layers.61.input_layernorm",
226
+ "transformer.layers.34.input_layernorm",
227
+ "transformer.layers.36.input_layernorm",
228
+ "transformer.layers.31.input_layernorm",
229
+ "transformer.layers.65.post_layernorm",
230
+ "transformer.layers.21.input_layernorm",
231
+ "transformer.layers.39.input_layernorm",
232
+ "transformer.layers.10.input_layernorm",
233
+ "transformer.layers.75.input_layernorm"
234
+ ]
235
+ },
236
+ "use_parallel_embedding": true,
237
+ "embedding_sharding_dim": 0,
238
+ "head_size": 128,
239
+ "qk_layernorm": false,
240
+ "rotary_embedding_dim": 128,
241
+ "producer": {
242
+ "name": "modelopt",
243
+ "version": "0.23.0"
244
+ },
245
+ "share_embedding_table": false,
246
+ "bias": false,
247
+ "rotary_pct": 1.0,
248
+ "rank": 1,
249
+ "decoder": "llama",
250
+ "rmsnorm": true,
251
+ "lm_head_bias": false,
252
+ "tie_word_embeddings": false,
253
+ "model_type": "llama"
254
+ },
255
+ "build_config": {
256
+ "max_input_len": 124000,
257
+ "max_seq_len": 131072,
258
+ "opt_batch_size": 8,
259
+ "max_batch_size": 32,
260
+ "max_beam_width": 1,
261
+ "max_num_tokens": 128000,
262
+ "opt_num_tokens": null,
263
+ "max_prompt_embedding_table_size": 0,
264
+ "kv_cache_type": "PAGED",
265
+ "gather_context_logits": false,
266
+ "gather_generation_logits": false,
267
+ "strongly_typed": true,
268
+ "force_num_profiles": null,
269
+ "profiling_verbosity": "layer_names_only",
270
+ "enable_debug_output": false,
271
+ "max_draft_len": 0,
272
+ "speculative_decoding_mode": 1,
273
+ "use_refit": false,
274
+ "input_timing_cache": null,
275
+ "output_timing_cache": "model.cache",
276
+ "lora_config": {
277
+ "lora_dir": [],
278
+ "lora_ckpt_source": "hf",
279
+ "max_lora_rank": 64,
280
+ "lora_target_modules": [],
281
+ "trtllm_modules_to_hf_modules": {}
282
+ },
283
+ "auto_parallel_config": {
284
+ "world_size": 1,
285
+ "gpus_per_node": 8,
286
+ "cluster_key": "H100-PCIe",
287
+ "cluster_info": null,
288
+ "sharding_cost_model": "alpha_beta",
289
+ "comm_cost_model": "alpha_beta",
290
+ "enable_pipeline_parallelism": false,
291
+ "enable_shard_unbalanced_shape": false,
292
+ "enable_shard_dynamic_shape": false,
293
+ "enable_reduce_scatter": true,
294
+ "builder_flags": null,
295
+ "debug_mode": false,
296
+ "infer_shape": true,
297
+ "validation_mode": false,
298
+ "same_buffer_io": {
299
+ "past_key_value_(\\d+)": "present_key_value_\\1"
300
+ },
301
+ "same_spec_io": {},
302
+ "sharded_io_allowlist": [
303
+ "past_key_value_\\d+",
304
+ "present_key_value_\\d*"
305
+ ],
306
+ "fill_weights": false,
307
+ "parallel_config_cache": null,
308
+ "profile_cache": null,
309
+ "dump_path": null,
310
+ "debug_outputs": []
311
+ },
312
+ "weight_sparsity": false,
313
+ "weight_streaming": false,
314
+ "plugin_config": {
315
+ "dtype": "float16",
316
+ "bert_attention_plugin": "auto",
317
+ "gpt_attention_plugin": "auto",
318
+ "gemm_plugin": "fp8",
319
+ "explicitly_disable_gemm_plugin": false,
320
+ "gemm_swiglu_plugin": null,
321
+ "fp8_rowwise_gemm_plugin": null,
322
+ "qserve_gemm_plugin": null,
323
+ "identity_plugin": null,
324
+ "nccl_plugin": "float16",
325
+ "lora_plugin": null,
326
+ "dora_plugin": false,
327
+ "weight_only_groupwise_quant_matmul_plugin": null,
328
+ "weight_only_quant_matmul_plugin": null,
329
+ "smooth_quant_plugins": true,
330
+ "smooth_quant_gemm_plugin": null,
331
+ "layernorm_quantization_plugin": null,
332
+ "rmsnorm_quantization_plugin": null,
333
+ "quantize_per_token_plugin": false,
334
+ "quantize_tensor_plugin": false,
335
+ "moe_plugin": "auto",
336
+ "mamba_conv1d_plugin": "auto",
337
+ "low_latency_gemm_plugin": null,
338
+ "low_latency_gemm_swiglu_plugin": null,
339
+ "gemm_allreduce_plugin": null,
340
+ "context_fmha": true,
341
+ "bert_context_fmha_fp32_acc": false,
342
+ "paged_kv_cache": true,
343
+ "remove_input_padding": true,
344
+ "reduce_fusion": false,
345
+ "user_buffer": false,
346
+ "tokens_per_block": 32,
347
+ "use_paged_context_fmha": true,
348
+ "use_fp8_context_fmha": true,
349
+ "fuse_fp4_quant": false,
350
+ "multiple_profiles": true,
351
+ "paged_state": false,
352
+ "streamingllm": false,
353
+ "manage_weights": false,
354
+ "use_fused_mlp": true,
355
+ "pp_reduce_scatter": false
356
+ },
357
+ "use_strip_plan": false,
358
+ "max_encoder_input_len": 1024,
359
+ "monitor_memory": false,
360
+ "use_mrope": false
361
+ }
362
+ }
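In short, config.json above records a LlamaForCausalLM engine built with TensorRT-LLM 0.18.0.dev2025020400 using 2-way tensor parallelism, FP8 weight and KV-cache quantization, a paged KV cache, and a 131072-token sequence limit. A small sketch for reading these fields back out (the path assumes this repository's layout):

    # Sketch: inspect the key build parameters recorded in config.json.
    import json

    with open("tensorrt_llm/1/config.json") as f:
        cfg = json.load(f)

    pretrained = cfg["pretrained_config"]
    build = cfg["build_config"]
    print("tp_size:", pretrained["mapping"]["tp_size"])                          # 2
    print("quant_algo:", pretrained["quantization"]["quant_algo"])               # FP8
    print("kv_cache_quant:", pretrained["quantization"]["kv_cache_quant_algo"])  # FP8
    print("max_seq_len:", build["max_seq_len"])                                  # 131072
    print("max_batch_size:", build["max_batch_size"])                            # 32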
tensorrt_llm/1/model.py ADDED
@@ -0,0 +1,1386 @@
1
+ import datetime
2
+ import json
3
+ import os
4
+ import sys
5
+ import time
6
+ from dataclasses import dataclass
7
+ from random import randint
8
+ from threading import Lock, Thread
9
+ from typing import Any, List
10
+
11
+ import numpy as np
12
+ import torch
13
+ import triton_python_backend_utils as pb_utils
14
+ from torch import from_numpy
15
+ from torch.utils.dlpack import from_dlpack
16
+
17
+ import tensorrt_llm.bindings.executor as trtllm
18
+ from tensorrt_llm.llmapi.tokenizer import _xgrammar_tokenizer_info
19
+
20
+ METRIC_TOTAL_OUTPUT_TOKENS = "total_output_tokens"
21
+ METRIC_TOTAL_INPUT_TOKENS = "total_input_tokens"
22
+ import tensorrt_llm.logger as logger
23
+
24
+ # From https://github.com/pytorch/pytorch/blob/39425feac799905402abe4d15667fa47c344f2d7/torch/testing/_internal/common_utils.py#L1761
25
+ # Dict of NumPy dtype -> torch dtype (when the correspondence exists)
26
+ numpy_to_torch_dtype_dict = {
27
+ np.bool_: torch.bool,
28
+ np.uint8: torch.uint8,
29
+ np.uint16: torch.uint16,
30
+ np.uint32: torch.uint32,
31
+ np.uint64: torch.uint64,
32
+ np.int8: torch.int8,
33
+ np.int16: torch.int16,
34
+ np.int32: torch.int32,
35
+ np.int64: torch.int64,
36
+ np.float16: torch.float16,
37
+ np.float32: torch.float32,
38
+ np.float64: torch.float64,
39
+ np.complex64: torch.complex64,
40
+ np.complex128: torch.complex128
41
+ }
42
+
43
+ # Dict of torch dtype -> NumPy dtype
44
+ torch_to_numpy_dtype_dict = {
45
+ value: key
46
+ for (key, value) in numpy_to_torch_dtype_dict.items()
47
+ }
48
+ torch_to_numpy_dtype_dict.update({
49
+ torch.bfloat16: np.float32,
50
+ torch.complex32: np.complex64
51
+ })
52
+
53
+
54
+ @dataclass
55
+ class RequestData:
56
+ triton_req_id: int
57
+ triton_user_id: str
58
+ batch_index: int
59
+ batch_size: int
60
+ num_return_sequences: int
61
+ num_input_tokens: int
62
+ num_output_tokens: int
63
+ response_sender: Any
64
+
65
+
66
+ def mpi_comm():
67
+ from mpi4py import MPI
68
+ return MPI.COMM_WORLD
69
+
70
+
71
+ def mpi_rank():
72
+ return mpi_comm().Get_rank()
73
+
74
+
75
+ def get_input_tensor_by_name(request,
76
+ name,
77
+ expected_batch_size=None,
78
+ batch_index=None,
79
+ force_on_torch=False):
80
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
81
+ if tensor is None:
82
+ return None
83
+
84
+ if tensor.is_cpu() and not force_on_torch:
85
+ tensor = tensor.as_numpy()
86
+ else:
87
+ tensor = from_dlpack(tensor.to_dlpack())
88
+
89
+ if expected_batch_size is not None and tensor.shape[
90
+ 0] != expected_batch_size:
91
+ raise pb_utils.TritonModelException(
92
+ f"Expected batch size doesn't match batch size for tensor {name}. Expected {expected_batch_size} got {tensor.shape[0]}"
93
+ )
94
+
95
+ if batch_index is not None and expected_batch_size is not None and batch_index >= expected_batch_size:
96
+ raise pb_utils.TritonModelException(
97
+ f"Invalid batch index in get_input_tensor_by_name for {name}")
98
+
99
+ if batch_index is not None:
100
+ # Add leading 1 batch dimension
101
+ if isinstance(tensor, np.ndarray):
102
+ return np.expand_dims(tensor[batch_index], axis=0)
103
+ elif isinstance(tensor, torch.Tensor):
104
+ return torch.unsqueeze(tensor[batch_index], dim=0)
105
+ else:
106
+ return tensor
107
+
108
+
109
+ def get_input_scalar_by_name(request,
110
+ name,
111
+ expected_batch_size=1,
112
+ batch_index=0):
113
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
114
+ if tensor is None:
115
+ return None
116
+ tensor = tensor.as_numpy()
117
+
118
+ if tensor.size != expected_batch_size:
119
+ raise pb_utils.TritonModelException(
120
+ f"Expected a scalar tensor for tensor {name}")
121
+
122
+ return tensor.item(batch_index)
123
+
124
+
125
+ def read_parameter_as_type(value, name, pytype=str):
126
+ if value == "":
127
+ return None
128
+ if value.startswith("${") and value.endswith("}"):
129
+ return None
130
+ if pytype is bool:
131
+ return value.lower() in ["1", "true"]
132
+ try:
133
+ result = pytype(value)
134
+ return result
135
+ except Exception:
136
+ pb_utils.Logger.log_warning(
137
+ f"Could not read parameter '{name}' with value '{value}', will use default."
138
+ )
139
+ return None
140
+
141
+
142
+ def get_parameter(model_config, name, pytype=str):
143
+ if name not in model_config['parameters']:
144
+ return None
145
+ return read_parameter_as_type(
146
+ model_config['parameters'][name]['string_value'], name, pytype)
147
+
148
+
149
+ def convert_word_list(word_list):
150
+ if word_list is None:
151
+ return None
152
+ word_list = word_list.tolist()
153
+ if len(word_list) == 0 or len(word_list[0]) != 2:
154
+ raise pb_utils.TritonModelException(f"Invalid format for word list.")
155
+ words, indices = word_list[0]
156
+ result = []
157
+ current_index = 0
158
+ for i in indices:
159
+ if i == -1:
160
+ continue
161
+ if i > len(words):
162
+ raise pb_utils.TritonModelException(
163
+ f"Invalid format for word list.")
164
+ current_word = []
165
+ while current_index < i:
166
+ current_word.append(words[current_index])
167
+ current_index += 1
168
+ result.append(current_word)
169
+ return result
170
+
171
+
172
+ def parse_medusa_choices(medusa_choices):
173
+ if medusa_choices is None:
174
+ return None
175
+ try:
176
+ result = json.loads(
177
+ "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
178
+ assert isinstance(result, list) and len(result) > 0
179
+ assert all([isinstance(x, list) for x in result])
180
+ assert all([isinstance(y, int) for x in result for y in x])
181
+ except Exception:
182
+ raise pb_utils.TritonModelException(
183
+ "Invalid format for medusa_choices")
184
+ return result
185
+
186
+
187
+ def parse_eagle_choices(eagle_choices):
188
+ return parse_medusa_choices(eagle_choices)
189
+
190
+
191
+ def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
192
+ kwargs = {}
193
+ kwargs['beam_width'] = get_input_scalar_by_name(
194
+ request, 'beam_width', batch_size, batch_index) or 1
195
+ kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k',
196
+ batch_size, batch_index)
197
+ kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p',
198
+ batch_size, batch_index)
199
+ kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[
200
+ 'top_p'] <= 0 else kwargs['top_p']
201
+ kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed',
202
+ batch_size, batch_index)
203
+ kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature',
204
+ batch_size, batch_index)
205
+ kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length',
206
+ batch_size, batch_index)
207
+ kwargs['repetition_penalty'] = get_input_scalar_by_name(
208
+ request, 'repetition_penalty', batch_size, batch_index)
209
+ kwargs['presence_penalty'] = get_input_scalar_by_name(
210
+ request, 'presence_penalty', batch_size, batch_index)
211
+ kwargs['frequency_penalty'] = get_input_scalar_by_name(
212
+ request, 'frequency_penalty', batch_size, batch_index)
213
+ kwargs['length_penalty'] = get_input_scalar_by_name(
214
+ request, 'len_penalty', batch_size, batch_index)
215
+ kwargs['top_p_min'] = get_input_scalar_by_name(request,
216
+ 'runtime_top_p_min',
217
+ batch_size, batch_index)
218
+ kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
219
+ request, 'runtime_top_p_reset_ids', batch_size, batch_index)
220
+ kwargs['top_p_decay'] = get_input_scalar_by_name(request,
221
+ 'runtime_top_p_decay',
222
+ batch_size, batch_index)
223
+ kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
224
+ request, 'beam_search_diversity_rate', batch_size, batch_index)
225
+ kwargs['early_stopping'] = get_input_scalar_by_name(
226
+ request, 'early_stopping', batch_size, batch_index)
227
+ kwargs['num_return_sequences'] = get_input_scalar_by_name(
228
+ request, 'num_return_sequences', batch_size, batch_index) or 1
229
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
230
+ return trtllm.SamplingConfig(**kwargs)
231
+
232
+
233
+ def get_output_config_from_request(request, batch_size=1, batch_index=0):
234
+ kwargs = {}
235
+ kwargs["return_log_probs"] = get_input_scalar_by_name(
236
+ request, 'return_log_probs', batch_size, batch_index)
237
+ kwargs["return_context_logits"] = get_input_scalar_by_name(
238
+ request, 'return_context_logits', batch_size, batch_index)
239
+ kwargs["return_generation_logits"] = get_input_scalar_by_name(
240
+ request, 'return_generation_logits', batch_size, batch_index)
241
+ kwargs["return_perf_metrics"] = get_input_scalar_by_name(
242
+ request, 'return_kv_cache_reuse_stats', batch_size, batch_index)
243
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
244
+ return trtllm.OutputConfig(**kwargs)
245
+
246
+
247
+ def get_external_draft_tokens_config_from_request(request,
248
+ batch_size=1,
249
+ batch_index=0):
250
+ kwargs = {}
251
+ draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids',
252
+ batch_size, batch_index)
253
+ if draft_input_ids is not None:
254
+ kwargs['tokens'] = draft_input_ids[0].tolist()
255
+ draft_logits = get_input_tensor_by_name(request, 'draft_logits',
256
+ batch_size, batch_index)
257
+ if draft_logits is not None:
258
+ kwargs['logits'] = from_numpy(draft_logits).squeeze(dim=0)
259
+ kwargs['acceptance_threshold'] = get_input_scalar_by_name(
260
+ request, 'draft_acceptance_threshold', batch_size, batch_index)
261
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
262
+ if len(kwargs) > 0:
263
+ return trtllm.ExternalDraftTokensConfig(**kwargs)
264
+ return None
265
+
266
+
267
+ def get_prompt_tuning_config_from_request(request,
268
+ batch_size=1,
269
+ batch_index=0,
270
+ input_length=0):
271
+ # prompt_vocab_size is unused by executor.
272
+ kwargs = {}
273
+ prompt_embedding_table = get_input_tensor_by_name(
274
+ request, 'prompt_embedding_table', batch_size, batch_index)
275
+ prompt_table_extra_ids = get_input_tensor_by_name(
276
+ request, 'prompt_table_extra_ids', batch_size, batch_index)
277
+ if prompt_embedding_table is not None:
278
+ if isinstance(prompt_embedding_table, np.ndarray):
279
+ kwargs["embedding_table"] = from_numpy(
280
+ prompt_embedding_table).squeeze(dim=0)
281
+ elif isinstance(prompt_embedding_table, torch.Tensor):
282
+ kwargs["embedding_table"] = prompt_embedding_table.squeeze(dim=0)
283
+
284
+ if prompt_table_extra_ids is not None:
285
+ prompt_table_extra_ids = prompt_table_extra_ids[0].tolist()
286
+ if len(prompt_table_extra_ids) != 0:
287
+ kwargs["input_token_extra_ids"] = prompt_table_extra_ids[
288
+ 0:input_length]
289
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
290
+ if len(kwargs) > 0:
291
+ return trtllm.PromptTuningConfig(**kwargs)
292
+ return None
293
+
294
+
295
+ def get_lora_config_from_request(request, batch_size=1, batch_index=0):
296
+ kwargs = {}
297
+ kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id',
298
+ batch_size, batch_index)
299
+ lora_weights = get_input_tensor_by_name(request, 'lora_weights',
300
+ batch_size, batch_index)
301
+ if lora_weights is not None:
302
+ kwargs["weights"] = from_numpy(lora_weights).squeeze(dim=0)
303
+ lora_config = get_input_tensor_by_name(request, 'lora_config', batch_size,
304
+ batch_index)
305
+ if lora_config is not None:
306
+ kwargs["config"] = from_numpy(lora_config).squeeze(dim=0)
307
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
308
+ if len(kwargs) > 0:
309
+ return trtllm.LoraConfig(**kwargs)
310
+ return None
311
+
312
+
313
+ def get_guided_decoding_params_from_request(request,
314
+ batch_size=1,
315
+ batch_index=0):
316
+ kwargs = {}
317
+ guided_decoding_guide_type = get_input_tensor_by_name(
318
+ request, 'guided_decoding_guide_type', batch_size, batch_index)
319
+ if guided_decoding_guide_type is not None:
320
+ guided_decoding_guide_type = guided_decoding_guide_type.squeeze(
321
+ axis=0)[0].decode()
322
+ guided_decoding_guide_type_mapping = {
323
+ "json": trtllm.GuidedDecodingParams.GuideType.JSON,
324
+ "json_schema": trtllm.GuidedDecodingParams.GuideType.JSON_SCHEMA,
325
+ "regex": trtllm.GuidedDecodingParams.GuideType.REGEX,
326
+ "ebnf_grammar": trtllm.GuidedDecodingParams.GuideType.EBNF_GRAMMAR
327
+ }
328
+ guided_decoding_guide_type = guided_decoding_guide_type_mapping.get(
329
+ guided_decoding_guide_type)
330
+ kwargs['guide_type'] = guided_decoding_guide_type
331
+
332
+ guided_decoding_guide = get_input_tensor_by_name(request,
333
+ 'guided_decoding_guide',
334
+ batch_size, batch_index)
335
+ if guided_decoding_guide is not None:
336
+ kwargs['guide'] = guided_decoding_guide.squeeze(axis=0)[0].decode()
337
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
338
+ if len(kwargs) > 0:
339
+ return trtllm.GuidedDecodingParams(**kwargs)
340
+ return None
341
+
342
+
343
+ def get_kv_cache_retention_config_from_request(request,
344
+ batch_size=1,
345
+ batch_index=0):
346
+
347
+ def get_tensor_and_check_length(name: str, expected_length: int):
348
+ tensor = get_input_tensor_by_name(request, name, batch_size,
349
+ batch_index)
350
+
351
+ if tensor is None:
352
+ raise RuntimeError(f"{name} must be provided.")
353
+
354
+ tensor = np.squeeze(tensor, axis=0)
355
+
356
+ if len(tensor) != expected_length:
357
+ raise RuntimeError(
358
+ f"Invalid {name} length. Expected length {expected_length}, got length {len(tensor)}"
359
+ )
360
+
361
+ return tensor
362
+
363
+ token_range_starts = get_input_tensor_by_name(
364
+ request, "retention_token_range_starts", batch_size, batch_index)
365
+
366
+ if token_range_starts is not None:
367
+ token_range_starts = np.squeeze(token_range_starts, axis=0)
368
+
369
+ token_range_ends = get_tensor_and_check_length(
370
+ "retention_token_range_ends", len(token_range_starts))
371
+ token_range_ends = [
372
+ None if end == -1 else end for end in token_range_ends
373
+ ]
374
+
375
+ token_range_priorities = get_tensor_and_check_length(
376
+ "retention_token_range_priorities", len(token_range_starts))
377
+
378
+ token_range_durations_ms = get_input_tensor_by_name(
379
+ request, "retention_token_range_durations_ms", batch_size,
380
+ batch_index)
381
+
382
+ if token_range_durations_ms is None:
383
+ token_range_durations_ms = [None] * len(token_range_starts)
384
+ else:
385
+ token_range_durations_ms = np.squeeze(token_range_durations_ms,
386
+ axis=0)
387
+ token_range_durations_ms = [
388
+ None if duration == -1 else duration
389
+ for duration in token_range_durations_ms
390
+ ]
391
+
392
+ if len(token_range_durations_ms) != len(token_range_starts):
393
+ raise RuntimeError(
394
+ f"Invalid retention_token_range_durations length. Expected length {len(token_range_starts)}, got length {len(token_range_durations_ms)}"
395
+ )
396
+
397
+ ranges = []
398
+
399
+ for start, end, priority, duration_ms in zip(token_range_starts,
400
+ token_range_ends,
401
+ token_range_priorities,
402
+ token_range_durations_ms):
403
+ ranges.append(
404
+ trtllm.KvCacheRetentionConfig.TokenRangeRetentionConfig(
405
+ token_start=start,
406
+ token_end=end,
407
+ priority=priority.item(),
408
+ duration_ms=None if duration_ms is None else
409
+ datetime.timedelta(milliseconds=duration_ms.item())))
410
+
411
+ decode_args = {}
412
+
413
+ decode_priority = get_input_scalar_by_name(
414
+ request, "retention_decode_priority", batch_size, batch_index)
415
+ if decode_priority is not None:
416
+ decode_args['decode_retention_priority'] = decode_priority
417
+
418
+ decode_duration_ms = get_input_scalar_by_name(
419
+ request, "retention_decode_duration_ms", batch_size, batch_index)
420
+ if decode_duration_ms is not None:
421
+ decode_args[
422
+ 'decode_duration_ms'] = decode_duration_ms if decode_duration_ms != -1 else None
423
+
424
+ return trtllm.KvCacheRetentionConfig(
425
+ token_range_retention_configs=ranges, **decode_args)
426
+
427
+ return None
428
+
429
+
430
+ def build_1_2_5_buckets(max_value: int) -> List[int]:
431
+ """
432
+ Builds a list of buckets with increasing powers of 10 multiplied by
433
+ mantissa values (1, 5), starting from 10 until the value exceeds
434
+ the specified maximum.
435
+
436
+ Example:
437
+ >>> build_1_2_5_buckets(1000)
438
+ [10, 50, 100, 500, 1000]
439
+ """
440
+ mantissa_lst = [1, 5]
441
+ exponent = 1 # Start from exponent 1 instead of 0
442
+ buckets: List[int] = []
443
+ while True:
444
+ for m in mantissa_lst:
445
+ value = m * 10**exponent
446
+ if value <= max_value:
447
+ buckets.append(value)
448
+ else:
449
+ return buckets
450
+ exponent += 1
451
+
452
+
453
+ def convert_request(request, exclude_input_from_output, decoupled):
454
+ inputs = {}
455
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids')
456
+ if input_token_ids is None:
457
+ raise pb_utils.TritonModelException(
458
+ "A value is required for input_ids")
459
+ if len(input_token_ids.shape) != 2:
460
+ raise pb_utils.TritonModelException(f"Invalid format for input_ids")
461
+ batch_size = input_token_ids.shape[0]
462
+ requests = []
463
+ for batch_index in range(0, batch_size):
464
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids',
465
+ batch_size, batch_index)[0]
466
+ if input_token_ids is None:
467
+ raise pb_utils.TritonModelException(
468
+ "A value is required for input_ids")
469
+ input_token_ids = input_token_ids.tolist()
470
+ if len(input_token_ids) == 0:
471
+ raise pb_utils.TritonModelException(
472
+ f"Invalid format for input_ids")
473
+
474
+ input_length = get_input_scalar_by_name(request, 'input_lengths',
475
+ batch_size, batch_index)
476
+ if input_length is None:
477
+ input_length = len(input_token_ids)
478
+ # Trim input token ids with input_lengths
479
+ inputs['input_token_ids'] = input_token_ids[0:input_length]
480
+ inputs['max_new_tokens'] = get_input_scalar_by_name(
481
+ request, 'request_output_len', batch_size, batch_index)
482
+ if inputs['max_new_tokens'] is None:
483
+ raise pb_utils.TritonModelException(
484
+ "A value is required for request_output_len")
485
+ inputs['streaming'] = get_input_scalar_by_name(request, 'streaming',
486
+ batch_size, batch_index)
487
+ if inputs['streaming'] and not decoupled:
488
+ raise pb_utils.TritonModelException(
489
+ "Streaming is only supported in decoupled mode.")
490
+
491
+ inputs['end_id'] = get_input_scalar_by_name(request, 'end_id',
492
+ batch_size, batch_index)
493
+ inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id',
494
+ batch_size, batch_index)
495
+ inputs['stop_words'] = convert_word_list(
496
+ get_input_tensor_by_name(request, 'stop_words_list', batch_size,
497
+ batch_index))
498
+ inputs['bad_words'] = convert_word_list(
499
+ get_input_tensor_by_name(request, 'bad_words_list', batch_size,
500
+ batch_index))
501
+ embedding_bias = get_input_tensor_by_name(request, 'embedding_bias',
502
+ batch_size, batch_index)
503
+ if embedding_bias is not None and embedding_bias.size != 0:
504
+ inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze(
505
+ dim=0)
506
+
507
+ sampling_config = get_sampling_config_from_request(
508
+ request, batch_size, batch_index)
509
+ output_config = get_output_config_from_request(request, batch_size,
510
+ batch_index)
511
+ req_exclude_input_from_output = get_input_scalar_by_name(
512
+ request, 'exclude_input_in_output', batch_size, batch_index)
513
+ if req_exclude_input_from_output is None:
514
+ # if request doesn't specify exclude_input_from_output, try to use the parameter
515
+ output_config.exclude_input_from_output = (
516
+ exclude_input_from_output
517
+ if exclude_input_from_output is not None else False)
518
+ else:
519
+ output_config.exclude_input_from_output = req_exclude_input_from_output
520
+
521
+ external_draft_tokens_config = get_external_draft_tokens_config_from_request(
522
+ request, batch_size, batch_index)
523
+ prompt_tuning_config = get_prompt_tuning_config_from_request(
524
+ request, batch_size, batch_index, input_length)
525
+ lora_config = get_lora_config_from_request(request, batch_size,
526
+ batch_index)
527
+ kv_cache_retention_config = get_kv_cache_retention_config_from_request(
528
+ request, batch_size, batch_index)
529
+
530
+ # Inputs for mllama support
531
+ encoder_input_features = get_input_tensor_by_name(
532
+ request, 'encoder_input_features', batch_size, batch_index)
533
+ if encoder_input_features is not None:
534
+ if isinstance(encoder_input_features, np.ndarray):
535
+ encoder_input_features = from_numpy(
536
+ encoder_input_features).squeeze(dim=0)
537
+ elif isinstance(encoder_input_features, torch.Tensor):
538
+ encoder_input_features = encoder_input_features.squeeze(dim=0)
539
+ inputs['encoder_input_features'] = encoder_input_features
540
+ logger.debug(
541
+ f"inputs to llm: encoder_input_features ({encoder_input_features.shape}"
542
+ )
543
+
544
+ encoder_output_length = get_input_tensor_by_name(
545
+ request, 'encoder_output_lengths', batch_size, batch_index)
546
+ if encoder_output_length is not None:
547
+ inputs['encoder_output_length'] = np.squeeze(
548
+ encoder_output_length, axis=0)
549
+
550
+ cross_attention_mask = get_input_tensor_by_name(
551
+ request, 'cross_attention_mask', batch_size, batch_index)
552
+ if cross_attention_mask is not None:
553
+ inputs['cross_attention_mask'] = cross_attention_mask[0]
554
+ logger.debug(
555
+ f"inputs to llm: cross_attention_mask ({ cross_attention_mask.shape})"
556
+ )
557
+
558
+ skip_cross_attn_blocks = get_input_tensor_by_name(
559
+ request,
560
+ 'skip_cross_attn_blocks',
561
+ batch_size,
562
+ batch_index,
563
+ force_on_torch=True)
564
+ if skip_cross_attn_blocks is not None:
565
+ inputs['skip_cross_attn_blocks'] = skip_cross_attn_blocks[0]
566
+ logger.debug(
567
+ f"inputs to llm: skip_cross_attn_blocks ({ skip_cross_attn_blocks.shape})"
568
+ )
569
+
570
+ guided_decoding_params = get_guided_decoding_params_from_request(
571
+ request, batch_size, batch_index)
572
+
573
+ requests.append(
574
+ trtllm.Request(
575
+ **inputs,
576
+ sampling_config=sampling_config,
577
+ output_config=output_config,
578
+ external_draft_tokens_config=external_draft_tokens_config,
579
+ prompt_tuning_config=prompt_tuning_config,
580
+ lora_config=lora_config,
581
+ guided_decoding_params=guided_decoding_params,
582
+ kv_cache_retention_config=kv_cache_retention_config))
583
+ return requests
584
+
585
+
586
+ def convert_response(response,
587
+ batch_index,
588
+ batch_size,
589
+ num_return_sequences,
590
+ expected_logits_dtype=torch.float32):
591
+
592
+ if response.has_error():
593
+ return pb_utils.InferenceResponse(output_tensors=[],
594
+ error=pb_utils.TritonError(
595
+ response.error_msg)), True, 0
596
+ result = response.result
597
+ beam_lengths = np.expand_dims(
598
+ np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
599
+ max_beam_length = max([len(beam) for beam in result.output_token_ids])
600
+ output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
601
+ -1, np.int32)
602
+ for idx, beam in enumerate(result.output_token_ids):
603
+ output_ids[0, idx, :len(beam)] = beam
604
+
605
+ output_lengths = output_ids.size
606
+ output_tensors = [
607
+ pb_utils.Tensor("output_ids", output_ids),
608
+ pb_utils.Tensor("sequence_length", beam_lengths),
609
+ ]
610
+
611
+ if result.cum_log_probs is not None:
612
+ output_tensors.append(
613
+ pb_utils.Tensor(
614
+ "cum_log_probs",
615
+ np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)))
616
+
617
+ if result.log_probs is not None:
618
+ output_tensors.append(
619
+ pb_utils.Tensor(
620
+ "output_log_probs",
621
+ np.expand_dims(np.array(result.log_probs, np.float32), 0)))
622
+
623
+ if result.context_logits is not None:
624
+ assert (result.context_logits.dtype is expected_logits_dtype)
625
+ output_tensors.append(
626
+ pb_utils.Tensor(
627
+ "context_logits",
628
+ np.expand_dims(
629
+ np.array(
630
+ result.context_logits, torch_to_numpy_dtype_dict[
631
+ result.context_logits.dtype]), 0)))
632
+
633
+ if result.generation_logits is not None:
634
+ assert (result.generation_logits.dtype is expected_logits_dtype)
635
+ output_tensors.append(
636
+ pb_utils.Tensor(
637
+ "generation_logits",
638
+ np.expand_dims(
639
+ np.array(
640
+ result.generation_logits, torch_to_numpy_dtype_dict[
641
+ result.generation_logits.dtype]), 0)))
642
+
643
+ if batch_size > 1:
644
+ output_tensors.append(
645
+ pb_utils.Tensor(
646
+ "batch_index",
647
+ np.expand_dims(np.array([batch_index], np.int32), 0)))
648
+
649
+ if num_return_sequences > 1:
650
+ output_tensors.append(
651
+ pb_utils.Tensor(
652
+ "sequence_index",
653
+ np.expand_dims(np.array([result.sequence_index], np.int32),
654
+ 0)))
655
+
656
+ if result.request_perf_metrics is not None:
657
+ kv_cache_metrics = result.request_perf_metrics.kv_cache_metrics
658
+ output_tensors.append(
659
+ pb_utils.Tensor(
660
+ "kv_cache_alloc_new_blocks",
661
+ np.expand_dims(
662
+ np.array([kv_cache_metrics.num_new_allocated_blocks],
663
+ np.int32), 0)))
664
+ output_tensors.append(
665
+ pb_utils.Tensor(
666
+ "kv_cache_reused_blocks",
667
+ np.expand_dims(
668
+ np.array([kv_cache_metrics.num_reused_blocks], np.int32),
669
+ 0)))
670
+ output_tensors.append(
671
+ pb_utils.Tensor(
672
+ "kv_cache_alloc_total_blocks",
673
+ np.expand_dims(
674
+ np.array([kv_cache_metrics.num_total_allocated_blocks],
675
+ np.int32), 0)))
676
+
677
+ return pb_utils.InferenceResponse(
678
+ output_tensors), result.is_final, output_lengths
679
+
680
+
681
+ def convert_scheduler_policy(batch_scheduler_policy: str):
682
+ if batch_scheduler_policy.lower() == "max_utilization":
683
+ return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
684
+ elif batch_scheduler_policy.lower() == "guaranteed_no_evict":
685
+ return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
686
+ raise pb_utils.TritonModelException(
687
+ f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
688
+ )
689
+
690
+
691
+ def convert_batching_type(gpt_model_type: str):
692
+ if gpt_model_type is None:
693
+ return None
694
+ if gpt_model_type.lower(
695
+ ) == "inflight_fused_batching" or gpt_model_type.lower(
696
+ ) == "inflight_batching":
697
+ return trtllm.BatchingType.INFLIGHT
698
+ elif gpt_model_type.lower() == "v1":
699
+ return trtllm.BatchingType.STATIC
700
+ raise pb_utils.TritonModelException(
701
+ f"gpt_model_type value of '{gpt_model_type}' is not supported.")
702
+
703
+
704
+ def convert_decoding_mode(decoding_mode: str):
705
+ if decoding_mode is None:
706
+ return None
707
+ elif decoding_mode == "auto":
708
+ return trtllm.DecodingMode.Auto()
709
+ elif decoding_mode == "top_k":
710
+ return trtllm.DecodingMode.TopK()
711
+ elif decoding_mode == "top_p":
712
+ return trtllm.DecodingMode.TopP()
713
+ elif decoding_mode == "top_k_top_p":
714
+ return trtllm.DecodingMode.TopKTopP()
715
+ elif decoding_mode == "beam_search":
716
+ return trtllm.DecodingMode.BeamSearch()
717
+ elif decoding_mode == "medusa":
718
+ return trtllm.DecodingMode.Medusa()
719
+ elif decoding_mode == "redrafter":
720
+ return trtllm.DecodingMode.ExplicitDraftTokens()
721
+ elif decoding_mode == "lookahead":
722
+ return trtllm.DecodingMode.Lookahead()
723
+ elif decoding_mode == "eagle":
724
+ return trtllm.DecodingMode.Eagle()
725
+ raise pb_utils.TritonModelException(
726
+ f"decoding_mode value of '{decoding_mode}' is not supported.")
727
+
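A quick illustration (a sketch, not part of the model file): the strings handled by the three converters above are the same parameter values that appear in the tensorrt_llm config.pbtxt later in this commit, and each maps onto a trtllm executor enum. Assuming the same `trtllm` bindings imported at the top of this file:

    # Illustrative only; the argument strings mirror the config.pbtxt parameters below.
    assert convert_scheduler_policy("guaranteed_no_evict") == trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
    assert convert_batching_type("inflight_fused_batching") == trtllm.BatchingType.INFLIGHT
    mode = convert_decoding_mode("top_k_top_p")  # -> trtllm.DecodingMode.TopKTopP()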
728
+
729
+ def convert_timestamp_to_seconds(timestamp: str):
730
+ return int(
731
+ datetime.datetime.strptime(timestamp,
732
+ "%m-%d-%Y %H:%M:%S.%f").timestamp())
733
+
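For reference, a minimal sketch of the timestamp layout this helper expects (hypothetical value; the result is epoch seconds in the server's local timezone):

    import datetime

    ts = "01-15-2025 12:30:45.123456"  # matches "%m-%d-%Y %H:%M:%S.%f"
    print(int(datetime.datetime.strptime(ts, "%m-%d-%Y %H:%M:%S.%f").timestamp()))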
734
+
735
+ def triton_string_to_torch(dtype):
736
+ type_map = {
737
+ "TYPE_BOOL": torch.bool,
738
+ "TYPE_UINT8": torch.uint8,
739
+ "TYPE_INT8": torch.int8,
740
+ "TYPE_INT16": torch.int16,
741
+ "TYPE_INT32": torch.int32,
742
+ "TYPE_INT64": torch.int64,
743
+ "TYPE_FP16": torch.float16,
744
+ "TYPE_FP32": torch.float32,
745
+ "TYPE_FP64": torch.float64,
746
+ "TYPE_BF16": torch.bfloat16
747
+ }
748
+ return type_map[dtype]
749
+
750
+
751
+ class TritonPythonModel:
752
+ """Your Python model must use the same class name. Every Python model
753
+ that is created must have "TritonPythonModel" as the class name.
754
+ """
755
+
756
+ def get_scheduler_config(self, model_config):
757
+ batch_scheduler_policy = get_parameter(model_config,
758
+ "batch_scheduler_policy")
759
+ if batch_scheduler_policy is None:
760
+ return trtllm.SchedulerConfig()
761
+ return trtllm.SchedulerConfig(
762
+ convert_scheduler_policy(batch_scheduler_policy))
763
+
764
+ def get_kv_cache_config(self, model_config):
765
+ kwargs = {
766
+ "enable_block_reuse":
767
+ get_parameter(model_config, "enable_kv_cache_reuse", bool),
768
+ "max_tokens":
769
+ get_parameter(model_config, "max_tokens_in_paged_kv_cache", int),
770
+ "sink_token_length":
771
+ get_parameter(model_config, "sink_token_length", int),
772
+ "free_gpu_memory_fraction":
773
+ get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
774
+ float),
775
+ "cross_kv_cache_fraction":
776
+ get_parameter(model_config, "cross_kv_cache_fraction", float),
777
+ "host_cache_size":
778
+ get_parameter(model_config, "kv_cache_host_memory_bytes", int),
779
+ "onboard_blocks":
780
+ get_parameter(model_config, "kv_cache_onboard_blocks", bool),
781
+ }
782
+ max_attention_window_size = get_parameter(model_config,
783
+ "max_attention_window_size")
784
+ if max_attention_window_size:
785
+ kwargs["max_attention_window"] = [
786
+ int(x) for x in max_attention_window_size.split(",")
787
+ ]
788
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
789
+ return trtllm.KvCacheConfig(**kwargs)
790
+
791
+ def get_parallel_config(self, model_config):
792
+ kwargs = {}
793
+ gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
794
+ if gpu_device_ids:
795
+ kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
796
+ self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
797
+ "0") == "1"
798
+ if self.use_orchestrator_mode:
799
+ kwargs[
800
+ "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
801
+ worker_path = get_parameter(model_config, "worker_path")
802
+ spawn_processes = os.environ.get(
803
+ "TRTLLM_ORCHESTRATOR_SPAWN_PROCESSES", "1") == "1"
804
+ if not spawn_processes:
805
+ raise pb_utils.TritonModelException(
806
+ "Orchestrator mode with --disable-spawn-processes is not supported in the Python backend."
807
+ )
808
+ is_orchestrator = (mpi_rank() == 0) if spawn_processes else True
809
+ if worker_path is not None:
810
+ raise pb_utils.TritonModelException(
811
+ "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
812
+ )
813
+ executor_worker_path = get_parameter(model_config,
814
+ "executor_worker_path")
815
+ kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
816
+ is_orchestrator, executor_worker_path)
817
+ if len(kwargs) > 0:
818
+ return trtllm.ParallelConfig(**kwargs)
819
+ return None
820
+
821
+ def get_peft_cache_config(self, model_config):
822
+ kwargs = {
823
+ "optimal_adapter_size":
824
+ get_parameter(model_config, "lora_cache_optimal_adapter_size",
825
+ int),
826
+ "max_adapter_size":
827
+ get_parameter(model_config, "lora_cache_max_adapter_size", int),
828
+ "device_cache_percent":
829
+ get_parameter(model_config, "lora_cache_gpu_memory_fraction",
830
+ float),
831
+ "host_cache_size":
832
+ get_parameter(model_config, "lora_cache_host_memory_bytes", int),
833
+ }
834
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
835
+ return trtllm.PeftCacheConfig(**kwargs)
836
+
837
+ def get_decoding_config(self, model_config):
838
+ eagle_choices = parse_eagle_choices(
839
+ get_parameter(model_config, "eagle_choices"))
840
+ kwargs = {
841
+ "medusa_choices":
842
+ parse_medusa_choices(get_parameter(model_config,
843
+ "medusa_choices")),
844
+ "eagle_config":
845
+ None
846
+ if eagle_choices is None else trtllm.EagleConfig(eagle_choices),
847
+ "decoding_mode":
848
+ convert_decoding_mode(get_parameter(model_config,
849
+ "decoding_mode")),
850
+ }
851
+         pb_utils.Logger.log_info(f"Decoding config: {kwargs}")
852
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
853
+ return trtllm.DecodingConfig(**kwargs)
854
+
855
+ def get_extended_runtime_perf_knob_config(self, model_config):
856
+ kwargs = {
857
+ "multi_block_mode":
858
+ get_parameter(model_config, "multi_block_mode", bool),
859
+ "enable_context_fmha_fp32_acc":
860
+ get_parameter(model_config, "enable_context_fmha_fp32_acc", bool),
861
+ "cuda_graph_mode":
862
+ get_parameter(model_config, "cuda_graph_mode", bool),
863
+ "cuda_graph_cache_size":
864
+ get_parameter(model_config, "cuda_graph_cache_size", int),
865
+ }
866
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
867
+ return trtllm.ExtendedRuntimePerfKnobConfig(**kwargs)
868
+
869
+ def get_guided_decoding_config(self, model_config):
870
+
871
+ guided_decoding_backend = get_parameter(model_config,
872
+ "guided_decoding_backend", str)
873
+
874
+ tokenizer_dir = get_parameter(model_config, "tokenizer_dir", str)
875
+ if guided_decoding_backend not in ['xgrammar']:
876
+ if tokenizer_dir:
877
+ pb_utils.Logger.log_warn(
878
+                 "Guided decoding backend has not been set, but tokenizer_dir is given; tokenizer_dir will be ignored."
879
+ )
880
+ return None
881
+
882
+ if guided_decoding_backend == 'xgrammar':
883
+ guided_decoding_backend = trtllm.GuidedDecodingConfig.GuidedDecodingBackend.XGRAMMAR
884
+
885
+ if not tokenizer_dir:
886
+ raise ValueError(
887
+ "Guided decoding requires tokenizer's information. Please provide 'tokenizer_dir'."
888
+ )
889
+ from transformers import AutoTokenizer
890
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
891
+ pb_utils.Logger.log_info(
892
+ f"Guided decoding has been set with {guided_decoding_backend} backend"
893
+ )
894
+ return trtllm.GuidedDecodingConfig(
895
+ backend=guided_decoding_backend,
896
+ **_xgrammar_tokenizer_info(tokenizer))
897
+
898
+ def get_executor_config(self, model_config):
899
+ kwargs = {
900
+ "max_beam_width":
901
+ get_parameter(model_config, "max_beam_width", int),
902
+ "scheduler_config":
903
+ self.get_scheduler_config(model_config),
904
+ "kv_cache_config":
905
+ self.get_kv_cache_config(model_config),
906
+ "enable_chunked_context":
907
+ get_parameter(model_config, "enable_chunked_context", bool),
908
+ "normalize_log_probs":
909
+ get_parameter(model_config, "normalize_log_probs", bool),
910
+ "batching_type":
911
+ convert_batching_type(get_parameter(model_config,
912
+ "gpt_model_type")),
913
+ "parallel_config":
914
+ self.get_parallel_config(model_config),
915
+ "peft_cache_config":
916
+ self.get_peft_cache_config(model_config),
917
+ "decoding_config":
918
+ self.get_decoding_config(model_config),
919
+ "max_queue_size":
920
+ model_config.get(
921
+ "dynamic_batching",
922
+ {},
923
+ ).get(
924
+ "default_queue_policy",
925
+ {},
926
+ ).get("max_queue_size"),
927
+ "extended_runtime_perf_knob_config":
928
+ self.get_extended_runtime_perf_knob_config(model_config),
929
+ "guided_decoding_config":
930
+ self.get_guided_decoding_config(model_config)
931
+ }
932
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
933
+ return trtllm.ExecutorConfig(**kwargs)
934
+
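Note that `max_queue_size` above is read directly from the Triton model config (the dynamic_batching section of config.pbtxt) rather than via `get_parameter`; a small sketch of that nested lookup with a hypothetical config dict:

    model_config = {
        "dynamic_batching": {"default_queue_policy": {"max_queue_size": 32}}
    }
    max_queue_size = model_config.get("dynamic_batching", {}).get(
        "default_queue_policy", {}).get("max_queue_size")
    print(max_queue_size)  # 32; None when no queue policy is configured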
935
+ def create_metrics(self, model: str, version: str, is_v1_model: bool):
936
+ self.request_metric_family = pb_utils.MetricFamily(
937
+ name="nv_trt_llm_request_metrics",
938
+ description="TRT LLM request metrics",
939
+ kind=pb_utils.MetricFamily.GAUGE,
940
+ )
941
+ self.runtime_memory_metric_family = pb_utils.MetricFamily(
942
+ name="nv_trt_llm_runtime_memory_metrics",
943
+ description="TRT LLM runtime memory metrics",
944
+ kind=pb_utils.MetricFamily.GAUGE,
945
+ )
946
+ self.kv_cache_metric_family = pb_utils.MetricFamily(
947
+ name="nv_trt_llm_kv_cache_block_metrics",
948
+ description="TRT LLM KV cache block metrics",
949
+ kind=pb_utils.MetricFamily.GAUGE,
950
+ )
951
+ model_type = "v1" if is_v1_model else "inflight_batcher"
952
+ self.model_type_metric_family = pb_utils.MetricFamily(
953
+ name=f"nv_trt_llm_{model_type}_metrics",
954
+ description=f"TRT LLM {model_type}-specific metrics",
955
+ kind=pb_utils.MetricFamily.GAUGE,
956
+ )
957
+ self.general_metric_family = pb_utils.MetricFamily(
958
+ name="nv_trt_llm_general_metrics",
959
+ description="General TRT LLM metrics",
960
+ kind=pb_utils.MetricFamily.GAUGE,
961
+ )
962
+         # The token-count histogram families below are observed per request in update_metrics_per_request().
963
+ self.request_tokens_metric_family = pb_utils.MetricFamily(
964
+ name="nv_llm_input_token_len",
965
+             description="TRT LLM request metrics",
966
+ kind=pb_utils.MetricFamily.HISTOGRAM,
967
+ )
968
+ self.response_tokens_metric_family = pb_utils.MetricFamily(
969
+ name="nv_llm_output_token_len",
970
+ description="TRT LLM response metrics",
971
+ kind=pb_utils.MetricFamily.HISTOGRAM,
972
+ )
973
+ common_labels = {"model": model, "version": version}
974
+ self.all_metrics = {
975
+ # Request metrics
976
+ "num_active_requests":
977
+ self.request_metric_family.Metric(labels={
978
+ "request_type": "active",
979
+ **common_labels
980
+ }),
981
+ "max_num_active_requests":
982
+ self.request_metric_family.Metric(labels={
983
+ "request_type": "max",
984
+ **common_labels
985
+ }),
986
+ "num_scheduled_requests":
987
+ self.request_metric_family.Metric(labels={
988
+ "request_type": "scheduled",
989
+ **common_labels
990
+ }),
991
+ "num_context_requests":
992
+ self.request_metric_family.Metric(labels={
993
+ "request_type": "context",
994
+ **common_labels
995
+ }),
996
+ # Runtime metrics
997
+ "cpu_mem_usage":
998
+ self.runtime_memory_metric_family.Metric(labels={
999
+ "memory_type": "cpu",
1000
+ **common_labels
1001
+ }),
1002
+ "gpu_mem_usage":
1003
+ self.runtime_memory_metric_family.Metric(labels={
1004
+ "memory_type": "gpu",
1005
+ **common_labels
1006
+ }),
1007
+ "pinned_mem_usage":
1008
+ self.runtime_memory_metric_family.Metric(labels={
1009
+ "memory_type": "pinned",
1010
+ **common_labels
1011
+ }),
1012
+ # KV cache metrics
1013
+ "max_num_blocks":
1014
+ self.kv_cache_metric_family.Metric(labels={
1015
+ "kv_cache_block_type": "max",
1016
+ **common_labels
1017
+ }),
1018
+ "free_num_blocks":
1019
+ self.kv_cache_metric_family.Metric(labels={
1020
+ "kv_cache_block_type": "free",
1021
+ **common_labels
1022
+ }),
1023
+ "used_num_blocks":
1024
+ self.kv_cache_metric_family.Metric(labels={
1025
+ "kv_cache_block_type": "used",
1026
+ **common_labels
1027
+ }),
1028
+ "tokens_per_block":
1029
+ self.kv_cache_metric_family.Metric(labels={
1030
+ "kv_cache_block_type": "tokens_per",
1031
+ **common_labels
1032
+ }),
1033
+ # General metrics
1034
+ "timestamp":
1035
+ self.general_metric_family.Metric(labels={
1036
+ "general_type": "timestamp",
1037
+ **common_labels
1038
+ }),
1039
+ "iter":
1040
+ self.general_metric_family.Metric(labels={
1041
+ "general_type": "iteration_counter",
1042
+ **common_labels
1043
+ }),
1044
+ METRIC_TOTAL_OUTPUT_TOKENS:
1045
+ self.response_tokens_metric_family.Metric(
1046
+ labels={
1047
+ "response_metric_type": METRIC_TOTAL_OUTPUT_TOKENS,
1048
+ **common_labels
1049
+ },
1050
+ buckets=build_1_2_5_buckets(1000)),
1051
+ METRIC_TOTAL_INPUT_TOKENS:
1052
+ self.request_tokens_metric_family.Metric(
1053
+ labels={
1054
+ "response_metric_type": METRIC_TOTAL_INPUT_TOKENS,
1055
+ **common_labels
1056
+ },
1057
+ buckets=build_1_2_5_buckets(1000)),
1058
+ }
1059
+ if is_v1_model:
1060
+ self.all_metrics.update({
1061
+ "num_ctx_tokens":
1062
+ self.model_type_metric_family.Metric(labels={
1063
+ "v1_specific_metric": "total_context_tokens",
1064
+ **common_labels
1065
+ }),
1066
+ "num_gen_tokens":
1067
+ self.model_type_metric_family.Metric(
1068
+ labels={
1069
+ "v1_specific_metric": "total_generation_tokens",
1070
+ **common_labels
1071
+ }),
1072
+ "empty_gen_slots":
1073
+ self.model_type_metric_family.Metric(
1074
+ labels={
1075
+ "v1_specific_metric": "empty_generation_slots",
1076
+ **common_labels
1077
+ }),
1078
+ })
1079
+ else:
1080
+ self.all_metrics.update({
1081
+ "num_ctx_tokens":
1082
+ self.model_type_metric_family.Metric(
1083
+ labels={
1084
+ "inflight_batcher_specific_metric":
1085
+ "total_context_tokens",
1086
+ **common_labels
1087
+ }),
1088
+ "num_gen_requests":
1089
+ self.model_type_metric_family.Metric(
1090
+ labels={
1091
+ "inflight_batcher_specific_metric":
1092
+ "generation_requests",
1093
+ **common_labels
1094
+ }),
1095
+ "micro_batch_id":
1096
+ self.model_type_metric_family.Metric(
1097
+ labels={
1098
+ "inflight_batcher_specific_metric": "micro_batch_id",
1099
+ **common_labels
1100
+ }),
1101
+ "num_paused_requests":
1102
+ self.model_type_metric_family.Metric(
1103
+ labels={
1104
+ "inflight_batcher_specific_metric": "paused_requests",
1105
+ **common_labels
1106
+ }),
1107
+ })
1108
+
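Once the server is running, these families are exposed on Triton's Prometheus endpoint; a rough scraping sketch (assumptions: the default metrics port 8002 and the `requests` package, neither of which is dictated by this model file):

    import requests

    text = requests.get("http://localhost:8002/metrics").text
    for line in text.splitlines():
        if line.startswith(("nv_trt_llm_", "nv_llm_")):
            print(line)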
1109
+ def initialize(self, args):
1110
+ """`initialize` is called only once when the model is being loaded.
1111
+ Implementing `initialize` function is optional. This function allows
1112
+ the model to initialize any state associated with this model.
1113
+
1114
+ Parameters
1115
+ ----------
1116
+ args : dict
1117
+ Both keys and values are strings. The dictionary keys and values are:
1118
+ * model_config: A JSON string containing the model configuration
1119
+ * model_instance_kind: A string containing model instance kind
1120
+ * model_instance_device_id: A string containing model instance device ID
1121
+ * model_repository: Model repository path
1122
+ * model_version: Model version
1123
+ * model_name: Model name
1124
+ """
1125
+ model_config = json.loads(args['model_config'])
1126
+ gpt_model_path = get_parameter(model_config, "gpt_model_path")
1127
+ if get_parameter(model_config, "enable_trt_overlap", bool):
1128
+ raise pb_utils.TritonModelException(
1129
+ f"enable_trt_overlap=true is not supported.")
1130
+ self.exclude_input_from_output = get_parameter(
1131
+ model_config, "exclude_input_in_output", bool)
1132
+ executor_config = self.get_executor_config(model_config)
1133
+ self.executor = trtllm.Executor(gpt_model_path,
1134
+ trtllm.ModelType.DECODER_ONLY,
1135
+ executor_config)
1136
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
1137
+ model_config)
1138
+ self.cancellation_check_period_ms = get_parameter(
1139
+ model_config, "cancellation_check_period_ms", int) or 100
1140
+ self.stats_check_period_ms = get_parameter(
1141
+ model_config, "stats_check_period_ms", int) or 100
1142
+
1143
+ self.logits_dtype = None
1144
+ for output in model_config['output']:
1145
+ if output['name'] == 'context_logits' or output[
1146
+ 'name'] == 'generation_logits':
1147
+ self.logits_dtype = triton_string_to_torch(output['data_type'])
1148
+
1149
+ self.create_metrics(args["model_name"],
1150
+ args["model_version"],
1151
+ is_v1_model=executor_config.batching_type ==
1152
+ trtllm.BatchingType.STATIC)
1153
+ self.triton_user_id_to_req_ids = {}
1154
+ self.triton_req_id_to_req_ids = {}
1155
+ self.req_id_to_request_data = {}
1156
+ self.lock = Lock()
1157
+ self.running = False
1158
+ self.awaiter_thread = Thread(target=self.awaiter_loop)
1159
+ self.cancellation_thread = Thread(target=self.cancellation_loop)
1160
+ self.metrics_thread = Thread(target=self.metrics_loop)
1161
+ if self.executor.can_enqueue_requests():
1162
+ self.running = True
1163
+ self.awaiter_thread.start()
1164
+ self.cancellation_thread.start()
1165
+ self.metrics_thread.start()
1166
+ else:
1167
+ # In leader mode, worker ranks will wait here until leader is done.
1168
+ self.executor.shutdown()
1169
+
1170
+ def handle_stop_request(self, triton_user_id, response_sender):
1171
+ if triton_user_id is None or triton_user_id == "":
1172
+ response_sender.send(
1173
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
1174
+ "A request id must be provided for request cancellation")),
1175
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
1176
+ return
1177
+
1178
+ with self.lock:
1179
+ if triton_user_id in self.triton_user_id_to_req_ids:
1180
+ req_ids = self.triton_user_id_to_req_ids[triton_user_id]
1181
+ for req_id in req_ids:
1182
+ self.executor.cancel_request(req_id)
1183
+
1184
+ response_sender.send(
1185
+ pb_utils.InferenceResponse(),
1186
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
1187
+
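Cancellation is triggered by sending a follow-up request that carries the same request id and `stop=True`; a rough client-side sketch (assumptions: the `tritonclient` package, a gRPC endpoint on localhost:8001, and "<original-request-id>" standing in for the id of the in-flight request):

    import numpy as np
    import tritonclient.grpc as grpcclient

    client = grpcclient.InferenceServerClient("localhost:8001")
    client.start_stream(callback=lambda result, error: None)
    stop = grpcclient.InferInput("stop", [1, 1], "BOOL")
    stop.set_data_from_numpy(np.array([[True]], dtype=bool))
    client.async_stream_infer("tensorrt_llm", [stop],
                              request_id="<original-request-id>")
    client.stop_stream()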
1188
+ def execute(self, requests):
1189
+ """`execute` must be implemented in every Python model. `execute`
1190
+ function receives a list of pb_utils.InferenceRequest as the only
1191
+ argument. This function is called when an inference is requested
1192
+ for this model.
1193
+
1194
+ Parameters
1195
+ ----------
1196
+ requests : list
1197
+ A list of pb_utils.InferenceRequest
1198
+
1199
+ Returns
1200
+ -------
1201
+ list
1202
+ A list of pb_utils.InferenceResponse. The length of this list must
1203
+ be the same as `requests`
1204
+ """
1205
+ if not self.executor.can_enqueue_requests():
1206
+ return
1207
+
1208
+ # Convert to executor requests.
1209
+
1210
+ triton_requests = []
1211
+ executor_requests = []
1212
+ batch_indices = []
1213
+ triton_user_ids = []
1214
+ triton_req_ids = []
1215
+
1216
+ for request in requests:
1217
+
1218
+ triton_user_id = request.request_id()
1219
+
1220
+ response_sender = request.get_response_sender()
1221
+ stop = get_input_scalar_by_name(request, 'stop')
1222
+
1223
+ if stop:
1224
+ self.handle_stop_request(triton_user_id, response_sender)
1225
+ else:
1226
+                 # Unique request id used to identify each Triton request
1227
+ triton_req_id = str(randint(0, sys.maxsize))
1228
+ self.triton_req_id_to_req_ids[triton_req_id] = set()
1229
+ if triton_user_id is not None and triton_user_id != "":
1230
+ self.triton_user_id_to_req_ids[triton_user_id] = set()
1231
+
1232
+ try:
1233
+ converted_reqs = convert_request(
1234
+ request, self.exclude_input_from_output,
1235
+ self.decoupled)
1236
+ except Exception as e:
1237
+ response_sender.send(
1238
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
1239
+ f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
1240
+ )),
1241
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
1242
+ else:
1243
+ for batch_index, converted_req in enumerate(
1244
+ converted_reqs):
1245
+ triton_requests.append(request)
1246
+ executor_requests.append(converted_req)
1247
+ triton_user_ids.append(triton_user_id)
1248
+ triton_req_ids.append(triton_req_id)
1249
+ batch_indices.append(batch_index)
1250
+
1251
+ with self.lock:
1252
+ request_ids = self.executor.enqueue_requests(executor_requests)
1253
+ for req_id, triton_req_id, triton_user_id, executor_request, triton_request, batch_index in zip(
1254
+ request_ids, triton_req_ids, triton_user_ids,
1255
+ executor_requests, triton_requests, batch_indices):
1256
+
1257
+ self.req_id_to_request_data[req_id] = RequestData(
1258
+ triton_req_id, triton_user_id, batch_index,
1259
+ len(batch_indices),
1260
+ executor_request.sampling_config.num_return_sequences, 0,
1261
+ 0, triton_request.get_response_sender())
1262
+ self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
1263
+ input_len = len(
1264
+ executor_request.input_token_ids
1265
+ ) if executor_request.input_token_ids is not None else 0
1266
+ self.req_id_to_request_data[
1267
+ req_id].num_input_tokens += input_len
1268
+ # This checks both request level and instance config level
1269
+                 if not executor_request.output_config.exclude_input_from_output and not executor_request.streaming:
1270
+ self.req_id_to_request_data[
1271
+ req_id].num_output_tokens -= self.req_id_to_request_data[
1272
+ req_id].num_input_tokens * executor_request.sampling_config.beam_width
1273
+ if triton_user_id is not None and triton_user_id != "":
1274
+ self.triton_user_id_to_req_ids[triton_user_id].add(req_id)
1275
+
1276
+ return None
1277
+
1278
+ def awaiter_loop(self):
1279
+ """Gets responses from executor and returns the results."""
1280
+ while self.running:
1281
+ for response in self.executor.await_responses(
1282
+ timeout=datetime.timedelta(milliseconds=1)):
1283
+ req_id = response.request_id
1284
+ request_data = None
1285
+ with self.lock:
1286
+ if req_id not in self.req_id_to_request_data:
1287
+ continue
1288
+ request_data = self.req_id_to_request_data[req_id]
1289
+
1290
+ triton_response, is_final, output_length = convert_response(
1291
+ response, request_data.batch_index,
1292
+ request_data.batch_size, request_data.num_return_sequences,
1293
+ self.logits_dtype)
1294
+ with self.lock:
1295
+ self.req_id_to_request_data[
1296
+ req_id].num_output_tokens += output_length
1297
+ triton_request_final = False
1298
+ if is_final:
1299
+ with self.lock:
1300
+ # Check if all executor requests part of that triton request are finished
1301
+ self.triton_req_id_to_req_ids[
1302
+ request_data.triton_req_id].remove(req_id)
1303
+ if len(self.triton_req_id_to_req_ids[
1304
+ request_data.triton_req_id]) == 0:
1305
+ pb_utils.Logger.log_info(
1306
+ f"DELETING Req id {req_id}, triton_req_id {request_data.triton_req_id} "
1307
+ )
1308
+ triton_request_final = True
1309
+ del self.triton_req_id_to_req_ids[
1310
+ request_data.triton_req_id]
1311
+ if request_data.triton_user_id is not None and request_data.triton_user_id != "":
1312
+ del self.triton_user_id_to_req_ids[
1313
+ request_data.triton_user_id]
1314
+ self.update_metrics_per_request(req_id)
1315
+ del self.req_id_to_request_data[req_id]
1316
+
1317
+ request_data.response_sender.send(
1318
+ triton_response,
1319
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
1320
+ if triton_request_final else 0)
1321
+
1322
+ def cancellation_loop(self):
1323
+ """Checks if any pending requests have been cancelled."""
1324
+ while self.running:
1325
+ time.sleep(self.cancellation_check_period_ms / 1000.0)
1326
+ with self.lock:
1327
+ for req_id, request_data in self.req_id_to_request_data.items(
1328
+ ):
1329
+ if request_data.response_sender.is_cancelled():
1330
+ self.executor.cancel_request(req_id)
1331
+
1332
+ def update_metrics_per_request(self, req_id):
1333
+ """Updates triton metrics after completing one request"""
1334
+ output_tokens = self.req_id_to_request_data[req_id].num_output_tokens
1335
+ input_tokens = self.req_id_to_request_data[req_id].num_input_tokens
1336
+
1337
+ self.all_metrics[METRIC_TOTAL_OUTPUT_TOKENS].observe(output_tokens)
1338
+ self.all_metrics[METRIC_TOTAL_INPUT_TOKENS].observe(input_tokens)
1339
+
1340
+ def metrics_loop(self):
1341
+ """Updates triton metrics using stats from the executor."""
1342
+ while self.running:
1343
+ time.sleep(self.stats_check_period_ms / 1000.0)
1344
+ for stat in self.executor.get_latest_iteration_stats():
1345
+ try:
1346
+ for key, metric in self.all_metrics.items():
1347
+ # Skip processing for both histogram metrics
1348
+ if isinstance(key, str) and key in [
1349
+ METRIC_TOTAL_OUTPUT_TOKENS,
1350
+ METRIC_TOTAL_INPUT_TOKENS
1351
+ ]:
1352
+ continue
1353
+ value = None
1354
+ if hasattr(stat, key):
1355
+ value = getattr(stat, key)
1356
+ elif stat.kv_cache_stats is not None and hasattr(
1357
+ stat.kv_cache_stats, key):
1358
+ value = getattr(stat.kv_cache_stats, key)
1359
+ elif stat.static_batching_stats is not None and hasattr(
1360
+ stat.static_batching_stats, key):
1361
+ value = getattr(stat.static_batching_stats, key)
1362
+ elif stat.inflight_batching_stats is not None and hasattr(
1363
+ stat.inflight_batching_stats, key):
1364
+ value = getattr(stat.inflight_batching_stats, key)
1365
+ if value is not None:
1366
+ if key == "timestamp":
1367
+ value = convert_timestamp_to_seconds(value)
1368
+ metric.set(value)
1369
+ else:
1370
+ pb_utils.Logger.log_warn(
1371
+ f"Metric \"{key}\" not found.")
1372
+ except Exception as e:
1373
+ pb_utils.Logger.log_warn(
1374
+ f"Error while processing metrics: {e}")
1375
+
1376
+ def finalize(self):
1377
+ """`finalize` is called only once when the model is being unloaded.
1378
+ Implementing `finalize` function is optional. This function allows
1379
+ the model to perform any necessary clean ups before exit.
1380
+ """
1381
+ if self.executor.can_enqueue_requests():
1382
+ self.running = False
1383
+ self.awaiter_thread.join()
1384
+ self.cancellation_thread.join()
1385
+ self.metrics_thread.join()
1386
+ self.executor.shutdown()
tensorrt_llm/1/rank0.engine ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7595e62baa9d736243148716820f6258fbc253d709a52778771f7593dfde37a6
3
+ size 36509691604
tensorrt_llm/1/rank1.engine ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce45fa6b73f60436052b12754ccf229b02c319b94fecafe52d513b05900cf244
3
+ size 36509692228
tensorrt_llm/config.pbtxt ADDED
@@ -0,0 +1,757 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm"
28
+ backend: "tensorrtllm"
29
+ max_batch_size: 32
30
+
31
+ model_transaction_policy {
32
+ decoupled: True
33
+ }
34
+
35
+ dynamic_batching {
36
+ preferred_batch_size: [ 32 ]
37
+ max_queue_delay_microseconds: 0
38
+ default_queue_policy: { max_queue_size: 32 }
39
+ }
40
+ input [
41
+ {
42
+ name: "input_ids"
43
+ data_type: TYPE_INT32
44
+ dims: [ -1 ]
45
+ allow_ragged_batch: true
46
+ optional: true
47
+ },
48
+ {
49
+ name: "encoder_input_features"
50
+ data_type: TYPE_FP16
51
+ dims: [ -1, -1 ]
52
+ allow_ragged_batch: true
53
+ optional: true
54
+ },
55
+ {
56
+ name: "encoder_output_lengths"
57
+ data_type: TYPE_INT32
58
+ dims: [ 1 ]
59
+ reshape: { shape: [ ] }
60
+ optional: true
61
+ },
62
+ {
63
+ name: "input_lengths"
64
+ data_type: TYPE_INT32
65
+ dims: [ 1 ]
66
+ reshape: { shape: [ ] }
67
+ },
68
+ {
69
+ name: "request_output_len"
70
+ data_type: TYPE_INT32
71
+ dims: [ 1 ]
72
+ reshape: { shape: [ ] }
73
+ },
74
+ {
75
+ name: "num_return_sequences"
76
+ data_type: TYPE_INT32
77
+ dims: [ 1 ]
78
+ reshape: { shape: [ ] }
79
+ optional: true
80
+ },
81
+ {
82
+ name: "draft_input_ids"
83
+ data_type: TYPE_INT32
84
+ dims: [ -1 ]
85
+ optional: true
86
+ allow_ragged_batch: true
87
+ },
88
+ {
89
+ name: "decoder_input_ids"
90
+ data_type: TYPE_INT32
91
+ dims: [ -1 ]
92
+ optional: true
93
+ allow_ragged_batch: true
94
+ },
95
+ {
96
+ name: "decoder_input_lengths"
97
+ data_type: TYPE_INT32
98
+ dims: [ 1 ]
99
+ optional: true
100
+ reshape: { shape: [ ] }
101
+ },
102
+ {
103
+ name: "draft_logits"
104
+ data_type: TYPE_FP16
105
+ dims: [ -1, -1 ]
106
+ optional: true
107
+ allow_ragged_batch: true
108
+ },
109
+ {
110
+ name: "draft_acceptance_threshold"
111
+ data_type: TYPE_FP32
112
+ dims: [ 1 ]
113
+ reshape: { shape: [ ] }
114
+ optional: true
115
+ },
116
+ {
117
+ name: "end_id"
118
+ data_type: TYPE_INT32
119
+ dims: [ 1 ]
120
+ reshape: { shape: [ ] }
121
+ optional: true
122
+ },
123
+ {
124
+ name: "pad_id"
125
+ data_type: TYPE_INT32
126
+ dims: [ 1 ]
127
+ reshape: { shape: [ ] }
128
+ optional: true
129
+ },
130
+ {
131
+ name: "stop_words_list"
132
+ data_type: TYPE_INT32
133
+ dims: [ 2, -1 ]
134
+ optional: true
135
+ allow_ragged_batch: true
136
+ },
137
+ {
138
+ name: "bad_words_list"
139
+ data_type: TYPE_INT32
140
+ dims: [ 2, -1 ]
141
+ optional: true
142
+ allow_ragged_batch: true
143
+ },
144
+ {
145
+ name: "embedding_bias"
146
+ data_type: TYPE_FP32
147
+ dims: [ -1 ]
148
+ optional: true
149
+ allow_ragged_batch: true
150
+ },
151
+ {
152
+ name: "beam_width"
153
+ data_type: TYPE_INT32
154
+ dims: [ 1 ]
155
+ reshape: { shape: [ ] }
156
+ optional: true
157
+ },
158
+ {
159
+ name: "temperature"
160
+ data_type: TYPE_FP32
161
+ dims: [ 1 ]
162
+ reshape: { shape: [ ] }
163
+ optional: true
164
+ },
165
+ {
166
+ name: "runtime_top_k"
167
+ data_type: TYPE_INT32
168
+ dims: [ 1 ]
169
+ reshape: { shape: [ ] }
170
+ optional: true
171
+ },
172
+ {
173
+ name: "runtime_top_p"
174
+ data_type: TYPE_FP32
175
+ dims: [ 1 ]
176
+ reshape: { shape: [ ] }
177
+ optional: true
178
+ },
179
+ {
180
+ name: "runtime_top_p_min"
181
+ data_type: TYPE_FP32
182
+ dims: [ 1 ]
183
+ reshape: { shape: [ ] }
184
+ optional: true
185
+ },
186
+ {
187
+ name: "runtime_top_p_decay"
188
+ data_type: TYPE_FP32
189
+ dims: [ 1 ]
190
+ reshape: { shape: [ ] }
191
+ optional: true
192
+ },
193
+ {
194
+ name: "runtime_top_p_reset_ids"
195
+ data_type: TYPE_INT32
196
+ dims: [ 1 ]
197
+ reshape: { shape: [ ] }
198
+ optional: true
199
+ },
200
+ {
201
+ name: "len_penalty"
202
+ data_type: TYPE_FP32
203
+ dims: [ 1 ]
204
+ reshape: { shape: [ ] }
205
+ optional: true
206
+ },
207
+ {
208
+ name: "early_stopping"
209
+ data_type: TYPE_BOOL
210
+ dims: [ 1 ]
211
+ reshape: { shape: [ ] }
212
+ optional: true
213
+ },
214
+ {
215
+ name: "repetition_penalty"
216
+ data_type: TYPE_FP32
217
+ dims: [ 1 ]
218
+ reshape: { shape: [ ] }
219
+ optional: true
220
+ },
221
+ {
222
+ name: "min_length"
223
+ data_type: TYPE_INT32
224
+ dims: [ 1 ]
225
+ reshape: { shape: [ ] }
226
+ optional: true
227
+ },
228
+ {
229
+ name: "beam_search_diversity_rate"
230
+ data_type: TYPE_FP32
231
+ dims: [ 1 ]
232
+ reshape: { shape: [ ] }
233
+ optional: true
234
+ },
235
+ {
236
+ name: "presence_penalty"
237
+ data_type: TYPE_FP32
238
+ dims: [ 1 ]
239
+ reshape: { shape: [ ] }
240
+ optional: true
241
+ },
242
+ {
243
+ name: "frequency_penalty"
244
+ data_type: TYPE_FP32
245
+ dims: [ 1 ]
246
+ reshape: { shape: [ ] }
247
+ optional: true
248
+ },
249
+ {
250
+ name: "random_seed"
251
+ data_type: TYPE_UINT64
252
+ dims: [ 1 ]
253
+ reshape: { shape: [ ] }
254
+ optional: true
255
+ },
256
+ {
257
+ name: "return_log_probs"
258
+ data_type: TYPE_BOOL
259
+ dims: [ 1 ]
260
+ reshape: { shape: [ ] }
261
+ optional: true
262
+ },
263
+ {
264
+ name: "return_context_logits"
265
+ data_type: TYPE_BOOL
266
+ dims: [ 1 ]
267
+ reshape: { shape: [ ] }
268
+ optional: true
269
+ },
270
+ {
271
+ name: "return_generation_logits"
272
+ data_type: TYPE_BOOL
273
+ dims: [ 1 ]
274
+ reshape: { shape: [ ] }
275
+ optional: true
276
+ },
277
+ {
278
+ name: "return_kv_cache_reuse_stats"
279
+ data_type: TYPE_BOOL
280
+ dims: [ 1 ]
281
+ reshape: { shape: [ ] }
282
+ optional: true
283
+ },
284
+ {
285
+ name: "exclude_input_in_output"
286
+ data_type: TYPE_BOOL
287
+ dims: [ 1 ]
288
+ reshape: { shape: [ ] }
289
+ optional: true
290
+ },
291
+ {
292
+ name: "stop"
293
+ data_type: TYPE_BOOL
294
+ dims: [ 1 ]
295
+ reshape: { shape: [ ] }
296
+ optional: true
297
+ },
298
+ {
299
+ name: "streaming"
300
+ data_type: TYPE_BOOL
301
+ dims: [ 1 ]
302
+ reshape: { shape: [ ] }
303
+ optional: true
304
+ },
305
+ {
306
+ name: "prompt_embedding_table"
307
+ data_type: TYPE_FP16
308
+ dims: [ -1, -1 ]
309
+ optional: true
310
+ allow_ragged_batch: true
311
+ },
312
+ {
313
+ name: "prompt_table_extra_ids"
314
+ data_type: TYPE_UINT64
315
+ dims: [ -1 ]
316
+ optional: true
317
+ allow_ragged_batch: true
318
+ },
319
+ {
320
+ name: "prompt_vocab_size"
321
+ data_type: TYPE_INT32
322
+ dims: [ 1 ]
323
+ reshape: { shape: [ ] }
324
+ optional: true
325
+ },
326
+ # cross_attention_mask shape `[bs, seq_len, num_images*num_tiles]`
327
+ {
328
+ name: "cross_attention_mask"
329
+ data_type: TYPE_BOOL
330
+ dims: [ -1, -1 ]
331
+ optional: true
332
+ allow_ragged_batch: true
333
+ },
334
+ # the unique task ID for the given LoRA.
335
+ # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
336
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
337
+ # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
338
+ {
339
+ name: "lora_task_id"
340
+ data_type: TYPE_UINT64
341
+ dims: [ 1 ]
342
+ reshape: { shape: [ ] }
343
+ optional: true
344
+ },
345
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
346
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
347
+ # each of the in / out tensors are first flattened and then concatenated together in the format above.
348
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
349
+ {
350
+ name: "lora_weights"
351
+ data_type: TYPE_FP16
352
+ dims: [ -1, -1 ]
353
+ optional: true
354
+ allow_ragged_batch: true
355
+ },
356
+ # module identifier (same size as the first dimension of lora_weights)
357
+ # See LoraModule::ModuleType for model id mapping
358
+ #
359
+ #   "attn_qkv": 0     # combined qkv adapter
360
+ # "attn_q": 1 # q adapter
361
+ # "attn_k": 2 # k adapter
362
+ # "attn_v": 3 # v adapter
363
+ # "attn_dense": 4 # adapter for the dense layer in attention
364
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
365
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
366
+ #   "mlp_gate": 7     # for llama2 adapter for gated mlp layer after attention / RMSNorm: gate
367
+ #
368
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]; a small illustrative example follows the input list below
369
+ {
370
+ name: "lora_config"
371
+ data_type: TYPE_INT32
372
+ dims: [ -1, 3 ]
373
+ optional: true
374
+ allow_ragged_batch: true
375
+ },
376
+ {
377
+ name: "context_phase_params"
378
+ data_type: TYPE_UINT8
379
+ dims: [ -1 ]
380
+ optional: true
381
+ allow_ragged_batch: true
382
+ },
383
+ # skip_cross_attn_blocks shape `[bs, 1]`, only used in mllama
384
+ {
385
+ name: "skip_cross_attn_blocks"
386
+ data_type: TYPE_BOOL
387
+ dims: [ 1 ]
388
+ optional: true
389
+ allow_ragged_batch: true
390
+ },
391
+ {
392
+ name: "retention_token_range_starts"
393
+ data_type: TYPE_INT32
394
+ dims: [ -1 ]
395
+ optional: true
396
+ allow_ragged_batch: true
397
+ },
398
+ {
399
+ name: "retention_token_range_ends"
400
+ data_type: TYPE_INT32
401
+ dims: [ -1 ]
402
+ optional: true
403
+ allow_ragged_batch: true
404
+ },
405
+ {
406
+ name: "retention_token_range_priorities"
407
+ data_type: TYPE_INT32
408
+ dims: [ -1 ]
409
+ optional: true
410
+ allow_ragged_batch: true
411
+ },
412
+ {
413
+ name: "retention_token_range_durations_ms"
414
+ data_type: TYPE_INT32
415
+ dims: [ -1 ]
416
+ optional: true
417
+ allow_ragged_batch: true
418
+ },
419
+ {
420
+ name: "retention_decode_priority"
421
+ data_type: TYPE_INT32
422
+ dims: [ 1 ]
423
+ optional: true
424
+ allow_ragged_batch: true
425
+ },
426
+ {
427
+ name: "retention_decode_duration_ms"
428
+ data_type: TYPE_INT32
429
+ dims: [ 1 ]
430
+ optional: true
431
+ allow_ragged_batch: true
432
+ },
433
+ {
434
+ name: "guided_decoding_guide_type"
435
+ data_type: TYPE_STRING
436
+ dims: [ 1 ]
437
+ optional: true
438
+ allow_ragged_batch: true
439
+ },
440
+ {
441
+ name: "guided_decoding_guide"
442
+ data_type: TYPE_STRING
443
+ dims: [ 1 ]
444
+ optional: true
445
+ allow_ragged_batch: true
446
+ }
447
+ ]
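To make the lora_config layout concrete, a small hedged example (hypothetical module ids, layers and adapter size; the module-id mapping is the one listed in the comment above lora_config):

    import numpy as np

    # Each row is [module_id, layer_idx, adapter_size]; 0 = attn_qkv, 4 = attn_dense.
    lora_config = np.array(
        [[0, 0, 8],   # attn_qkv adapter on layer 0, R = 8
         [4, 0, 8]],  # attn_dense adapter on layer 0, R = 8
        dtype=np.int32)
    print(lora_config.shape)  # (2, 3), matching dims [ -1, 3 ] above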
448
+ output [
449
+ {
450
+ name: "output_ids"
451
+ data_type: TYPE_INT32
452
+ dims: [ -1, -1 ]
453
+ },
454
+ {
455
+ name: "sequence_length"
456
+ data_type: TYPE_INT32
457
+ dims: [ -1 ]
458
+ },
459
+ {
460
+ name: "cum_log_probs"
461
+ data_type: TYPE_FP32
462
+ dims: [ -1 ]
463
+ },
464
+ {
465
+ name: "output_log_probs"
466
+ data_type: TYPE_FP32
467
+ dims: [ -1, -1 ]
468
+ },
469
+ {
470
+ name: "context_logits"
471
+ data_type: TYPE_FP16
472
+ dims: [ -1, -1 ]
473
+ },
474
+ {
475
+ name: "generation_logits"
476
+ data_type: TYPE_FP16
477
+ dims: [ -1, -1, -1 ]
478
+ },
479
+ {
480
+ name: "batch_index"
481
+ data_type: TYPE_INT32
482
+ dims: [ 1 ]
483
+ },
484
+ {
485
+ name: "sequence_index"
486
+ data_type: TYPE_INT32
487
+ dims: [ 1 ]
488
+ },
489
+ {
490
+ name: "context_phase_params"
491
+ data_type: TYPE_UINT8
492
+ dims: [ -1 ]
493
+ },
494
+ {
495
+ name: "kv_cache_alloc_new_blocks"
496
+ data_type: TYPE_INT32
497
+ dims: [ 1 ]
498
+ },
499
+ {
500
+ name: "kv_cache_reused_blocks"
501
+ data_type: TYPE_INT32
502
+ dims: [ 1 ]
503
+ },
504
+ {
505
+ name: "kv_cache_alloc_total_blocks"
506
+ data_type: TYPE_INT32
507
+ dims: [ 1 ]
508
+ }
509
+ ]
510
+ instance_group [
511
+ {
512
+ count: 1
513
+ kind : KIND_CPU
514
+ }
515
+ ]
516
+ parameters: {
517
+ key: "max_beam_width"
518
+ value: {
519
+ string_value: "1"
520
+ }
521
+ }
522
+ parameters: {
523
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
524
+ value: {
525
+ string_value: "no"
526
+ }
527
+ }
528
+ parameters: {
529
+ key: "gpt_model_type"
530
+ value: {
531
+ string_value: "inflight_fused_batching"
532
+ }
533
+ }
534
+ parameters: {
535
+ key: "gpt_model_path"
536
+ value: {
537
+ string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
538
+ }
539
+ }
540
+ parameters: {
541
+ key: "encoder_model_path"
542
+ value: {
543
+ string_value: "${encoder_engine_dir}"
544
+ }
545
+ }
546
+ parameters: {
547
+ key: "max_tokens_in_paged_kv_cache"
548
+ value: {
549
+ string_value: "${max_tokens_in_paged_kv_cache}"
550
+ }
551
+ }
552
+ parameters: {
553
+ key: "max_attention_window_size"
554
+ value: {
555
+ string_value: "${max_attention_window_size}"
556
+ }
557
+ }
558
+ parameters: {
559
+ key: "sink_token_length"
560
+ value: {
561
+ string_value: "${sink_token_length}"
562
+ }
563
+ }
564
+ parameters: {
565
+ key: "batch_scheduler_policy"
566
+ value: {
567
+ string_value: "guaranteed_no_evict"
568
+ }
569
+ }
570
+ parameters: {
571
+ key: "kv_cache_free_gpu_mem_fraction"
572
+ value: {
573
+ string_value: "${kv_cache_free_gpu_mem_fraction}"
574
+ }
575
+ }
576
+ parameters: {
577
+ key: "cross_kv_cache_fraction"
578
+ value: {
579
+ string_value: "${cross_kv_cache_fraction}"
580
+ }
581
+ }
582
+ parameters: {
583
+ key: "kv_cache_host_memory_bytes"
584
+ value: {
585
+ string_value: "${kv_cache_host_memory_bytes}"
586
+ }
587
+ }
588
+ # kv_cache_onboard_blocks is for internal implementation.
589
+ parameters: {
590
+ key: "kv_cache_onboard_blocks"
591
+ value: {
592
+ string_value: "${kv_cache_onboard_blocks}"
593
+ }
594
+ }
595
+ # enable_trt_overlap is deprecated and doesn't have any effect on the runtime
596
+ # parameters: {
597
+ # key: "enable_trt_overlap"
598
+ # value: {
599
+ # string_value: "${enable_trt_overlap}"
600
+ # }
601
+ # }
602
+ parameters: {
603
+ key: "exclude_input_in_output"
604
+ value: {
605
+ string_value: "True"
606
+ }
607
+ }
608
+ parameters: {
609
+ key: "cancellation_check_period_ms"
610
+ value: {
611
+ string_value: "${cancellation_check_period_ms}"
612
+ }
613
+ }
614
+ parameters: {
615
+ key: "stats_check_period_ms"
616
+ value: {
617
+ string_value: "${stats_check_period_ms}"
618
+ }
619
+ }
620
+ parameters: {
621
+ key: "iter_stats_max_iterations"
622
+ value: {
623
+ string_value: "${iter_stats_max_iterations}"
624
+ }
625
+ }
626
+ parameters: {
627
+ key: "request_stats_max_iterations"
628
+ value: {
629
+ string_value: "${request_stats_max_iterations}"
630
+ }
631
+ }
632
+ parameters: {
633
+ key: "enable_kv_cache_reuse"
634
+ value: {
635
+ string_value: "True"
636
+ }
637
+ }
638
+ parameters: {
639
+ key: "normalize_log_probs"
640
+ value: {
641
+ string_value: "${normalize_log_probs}"
642
+ }
643
+ }
644
+ parameters: {
645
+ key: "enable_chunked_context"
646
+ value: {
647
+ string_value: "${enable_chunked_context}"
648
+ }
649
+ }
650
+ parameters: {
651
+ key: "gpu_device_ids"
652
+ value: {
653
+ string_value: "${gpu_device_ids}"
654
+ }
655
+ }
656
+ parameters: {
657
+ key: "participant_ids"
658
+ value: {
659
+ string_value: "${participant_ids}"
660
+ }
661
+ }
662
+ parameters: {
663
+ key: "lora_cache_optimal_adapter_size"
664
+ value: {
665
+ string_value: "${lora_cache_optimal_adapter_size}"
666
+ }
667
+ }
668
+ parameters: {
669
+ key: "lora_cache_max_adapter_size"
670
+ value: {
671
+ string_value: "${lora_cache_max_adapter_size}"
672
+ }
673
+ }
674
+ parameters: {
675
+ key: "lora_cache_gpu_memory_fraction"
676
+ value: {
677
+ string_value: "${lora_cache_gpu_memory_fraction}"
678
+ }
679
+ }
680
+ parameters: {
681
+ key: "lora_cache_host_memory_bytes"
682
+ value: {
683
+ string_value: "${lora_cache_host_memory_bytes}"
684
+ }
685
+ }
686
+ parameters: {
687
+ key: "decoding_mode"
688
+ value: {
689
+ string_value: "${decoding_mode}"
690
+ }
691
+ }
692
+ parameters: {
693
+ key: "executor_worker_path"
694
+ value: {
695
+ string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
696
+ }
697
+ }
698
+ parameters: {
699
+ key: "medusa_choices"
700
+ value: {
701
+ string_value: "${medusa_choices}"
702
+ }
703
+ }
704
+ parameters: {
705
+ key: "eagle_choices"
706
+ value: {
707
+ string_value: "${eagle_choices}"
708
+ }
709
+ }
710
+ parameters: {
711
+ key: "gpu_weights_percent"
712
+ value: {
713
+ string_value: "${gpu_weights_percent}"
714
+ }
715
+ }
716
+ parameters: {
717
+ key: "enable_context_fmha_fp32_acc"
718
+ value: {
719
+ string_value: "${enable_context_fmha_fp32_acc}"
720
+ }
721
+ }
722
+ parameters: {
723
+ key: "multi_block_mode"
724
+ value: {
725
+ string_value: "${multi_block_mode}"
726
+ }
727
+ }
728
+ parameters: {
729
+ key: "cuda_graph_mode"
730
+ value: {
731
+ string_value: "${cuda_graph_mode}"
732
+ }
733
+ }
734
+ parameters: {
735
+ key: "cuda_graph_cache_size"
736
+ value: {
737
+ string_value: "${cuda_graph_cache_size}"
738
+ }
739
+ }
740
+ parameters: {
741
+ key: "speculative_decoding_fast_logits"
742
+ value: {
743
+ string_value: "${speculative_decoding_fast_logits}"
744
+ }
745
+ }
746
+ parameters: {
747
+ key: "tokenizer_dir"
748
+ value: {
749
+ string_value: "${tokenizer_dir}"
750
+ }
751
+ }
752
+ parameters: {
753
+ key: "guided_decoding_backend"
754
+ value: {
755
+ string_value: "${guided_decoding_backend}"
756
+ }
757
+ }
tensorrt_llm_bls/1/__pycache__/model.cpython-312.pyc ADDED
Binary file (5.53 kB).
tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-312.pyc ADDED
Binary file (21.7 kB).
tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-312.pyc ADDED
Binary file (19.4 kB).
tensorrt_llm_bls/1/lib/decode.py ADDED
@@ -0,0 +1,428 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ from collections.abc import Generator
28
+ from dataclasses import dataclass, field
29
+ from typing import Optional
30
+
31
+ import numpy as np
32
+ import torch
33
+
34
+
35
+ class RequestValidationError(Exception):
36
+ pass
37
+
38
+
39
+ def _validate_that(condition: bool, msg: str):
40
+ if not condition:
41
+ raise RequestValidationError(msg)
42
+
43
+
44
+ def _validate_non_empty(data, msg: str):
45
+ if isinstance(data, torch.Tensor):
46
+ _validate_that(data is not None and data.numel() > 0, msg)
47
+ else:
48
+ _validate_that(data is not None and data.size > 0, msg)
49
+
50
+
51
+ def _validate_single_gt_0(data, msg: str):
52
+ _validate_non_empty(data, msg)
53
+ _validate_that(data.flatten()[0] > 0, msg)
54
+
55
+
56
+ def _single_value(data: Optional[np.ndarray]):
57
+ if data is None:
58
+ return None
59
+ return data.flatten()[0]
60
+
61
+
62
+ @dataclass
63
+ class Request:
64
+ text_input: np.ndarray = field(default_factory=lambda: np.array([]))
65
+ decoder_text_input: np.ndarray = None
66
+ image_input: Optional[np.ndarray] = None
67
+ image_bytes_input: Optional[np.ndarray] = None
68
+ image_url_input: Optional[np.ndarray] = None
69
+ video_bytes_input: Optional[np.ndarray] = None
70
+ max_tokens: Optional[np.ndarray] = None
71
+ bad_words: Optional[np.ndarray] = None
72
+ stop_words: Optional[np.ndarray] = None
73
+ end_id: Optional[np.ndarray] = None
74
+ pad_id: Optional[np.ndarray] = None
75
+ top_k: Optional[np.ndarray] = None
76
+ top_p: Optional[np.ndarray] = None
77
+ temperature: Optional[np.ndarray] = None
78
+ length_penalty: Optional[np.ndarray] = None
79
+ repetition_penalty: Optional[np.ndarray] = None
80
+ min_length: Optional[np.ndarray] = None
81
+ return_log_probs: Optional[np.ndarray] = None
82
+ prompt_embedding_table: Optional[np.ndarray] = None
83
+ prompt_vocab_size: Optional[np.ndarray] = None
84
+ prompt_table_extra_id: Optional[np.ndarray] = None
85
+ embedding_bias_words: Optional[np.ndarray] = None
86
+ embedding_bias_weights: Optional[np.ndarray] = None
87
+ num_draft_tokens: Optional[np.ndarray] = None
88
+ use_draft_logits: Optional[np.ndarray] = None
89
+ stream: Optional[np.ndarray] = None
90
+ beam_width: Optional[np.ndarray] = None
91
+ return_context_logits: Optional[np.ndarray] = None
92
+ return_generation_logits: Optional[np.ndarray] = None
93
+ random_seed: Optional[np.ndarray] = None
94
+ presence_penalty: Optional[np.ndarray] = None
95
+ frequency_penalty: Optional[np.ndarray] = None
96
+ lora_task_id: Optional[np.ndarray] = None
97
+ lora_weights: Optional[np.ndarray] = None
98
+ lora_config: Optional[np.ndarray] = None
99
+ exclude_input_in_output: Optional[np.ndarray] = None
100
+ return_kv_cache_reuse_stats: Optional[np.ndarray] = None
101
+ guided_decoding_guide_type: Optional[np.ndarray] = None
102
+ guided_decoding_guide: Optional[np.ndarray] = None
103
+
104
+ def validate(self):
105
+ _validate_non_empty(self.text_input, "text_input is required")
106
+ _validate_single_gt_0(self.max_tokens,
107
+ "max_tokens must be a single value > 0")
108
+
109
+ num_draft_tokens = _single_value(self.num_draft_tokens)
110
+ _single_value(self.return_generation_logits)
111
+ context_logits = _single_value(self.return_context_logits)
112
+
113
+ if num_draft_tokens:
114
+ _validate_that(
115
+ not self.stream.any(),
116
+ "streaming is not supported with speculative decoding")
117
+ _validate_that(
118
+ not context_logits,
119
+ "context logits are not supported with speculative decoding")
120
+
121
+
122
+ @dataclass
123
+ class DraftRequest:
124
+ draft_input_ids: Optional[np.ndarray] = None
125
+ draft_logits: Optional[np.ndarray] = None
126
+
127
+
128
+ @dataclass
129
+ class PreprocResponse:
130
+ input_ids: np.ndarray = field(default_factory=lambda: np.array([]))
131
+ decoder_input_ids: np.ndarray = None
132
+ input_lengths: np.ndarray = field(default_factory=lambda: np.array([]))
133
+ decoder_input_lengths: np.ndarray = None
134
+ bad_words_list: Optional[np.ndarray] = None
135
+ stop_words_list: Optional[np.ndarray] = None
136
+ embedding_bias: Optional[np.ndarray] = None
137
+ end_id: Optional[np.ndarray] = None
138
+ pad_id: Optional[np.ndarray] = None
139
+ prompt_table_extra_ids: Optional[np.ndarray] = None
140
+ pixel_values: Optional[np.ndarray] = None
141
+ image_sizes: Optional[np.ndarray] = None
142
+ is_video_input: Optional[np.ndarray] = None
143
+
144
+ @classmethod
145
+ def with_new_inputs(cls,
146
+ other,
147
+ input_ids: Optional[np.ndarray] = None,
148
+ input_lengths: Optional[np.ndarray] = None):
149
+ return cls(input_ids=(input_ids
150
+ if input_ids is not None else other.input_ids),
151
+ input_lengths=(input_lengths if input_lengths is not None
152
+ else other.input_lengths),
153
+ decoder_input_ids=other.decoder_input_ids,
154
+ decoder_input_lengths=other.decoder_input_lengths,
155
+ bad_words_list=other.bad_words_list,
156
+ stop_words_list=other.stop_words_list,
157
+ end_id=other.end_id,
158
+ pad_id=other.pad_id,
159
+ prompt_table_extra_ids=other.prompt_table_extra_ids)
160
+
161
+
162
+ @dataclass
163
+ class MultimodalEncResponse:
164
+ prompt_embedding_table: Optional[torch.Tensor] = None
165
+ prompt_vocab_size: Optional[np.ndarray] = None
166
+
167
+
168
+ @dataclass
169
+ class GenerationResponse:
170
+ output_ids: np.ndarray = field(default_factory=lambda: np.array([]))
171
+ sequence_length: np.ndarray = field(default_factory=lambda: np.array([]))
172
+ cum_log_probs: Optional[np.ndarray] = None
173
+ output_log_probs: Optional[np.ndarray] = None
174
+ context_logits: Optional[np.ndarray] = None
175
+ generation_logits: Optional[np.ndarray] = None
176
+ batch_index: Optional[np.ndarray] = None
177
+ sequence_index: Optional[np.ndarray] = None
178
+ kv_cache_alloc_new_blocks: Optional[np.ndarray] = None
179
+ kv_cache_reused_blocks: Optional[np.ndarray] = None
180
+ kv_cache_alloc_total_blocks: Optional[np.ndarray] = None
181
+
182
+
183
+ @dataclass
184
+ class Response:
185
+ text_output: np.ndarray = field(default_factory=lambda: np.array([]))
186
+ cum_log_probs: Optional[np.ndarray] = None
187
+ output_log_probs: Optional[np.ndarray] = None
188
+ context_logits: Optional[np.ndarray] = None
189
+ generation_logits: Optional[np.ndarray] = None
190
+ batch_index: Optional[np.ndarray] = None
191
+ sequence_index: Optional[np.ndarray] = None
192
+ kv_cache_alloc_new_blocks: Optional[np.ndarray] = None
193
+ kv_cache_reused_blocks: Optional[np.ndarray] = None
194
+ kv_cache_alloc_total_blocks: Optional[np.ndarray] = None
195
+
196
+ def __eq__(self, o) -> bool:
197
+ """Just for testing"""
198
+ if not isinstance(o, Response):
199
+ return False
200
+ return (np.array_equal(self.text_output, o.text_output)
201
+ and np.array_equal(self.cum_log_probs, o.cum_log_probs)
202
+ and np.array_equal(self.output_log_probs, o.output_log_probs)
203
+ and np.array_equal(self.context_logits, o.context_logits)
204
+ and np.array_equal(self.generation_logits, o.generation_logits)
205
+ and np.array_equal(self.batch_index, o.batch_index)
206
+ and np.array_equal(self.sequence_index, o.sequence_index)
208
+ and np.array_equal(self.kv_cache_alloc_new_blocks,
209
+ o.kv_cache_alloc_new_blocks)
210
+ and np.array_equal(self.kv_cache_reused_blocks,
211
+ o.kv_cache_reused_blocks)
212
+ and np.array_equal(self.kv_cache_alloc_total_blocks,
213
+ o.kv_cache_alloc_total_blocks))
214
+
215
+
216
+ class Decoder:
217
+
218
+ def __init__(self, streaming=False, accumulate=False):
219
+ self._streaming = streaming
220
+ self._accumulate = accumulate
221
+
222
+ self._accumulated_tokens = []
223
+
224
+ def decode(self,
225
+ request: Request,
226
+ speculative_decoding=False,
227
+ is_multimodal=False) -> Generator[Response, None, None]:
228
+
229
+ batch_size = request.text_input.shape[0]
230
+ self._accumulated_tokens = [None] * batch_size
231
+ preproc_response = self.preprocess(request)
232
+
233
+ multimodal_enc_response = None
234
+ if is_multimodal:
235
+ multimodal_enc_response = self._multimodal_enc_generate(
236
+ request, preproc_response)
237
+
238
+ if speculative_decoding:
239
+ if batch_size > 1:
240
+ raise Exception(
241
+ "speculative decoding is not supported with batch size > 1"
242
+ )
243
+ for gen_response in self._spec_generate(preproc_response, request):
244
+ yield self.postprocess(gen_response, batch_size)
245
+ else:
246
+ if not self._streaming and batch_size == 1:
247
+ gen_response = self._generate_non_streaming(
248
+ preproc_response,
249
+ request,
250
+ multimodal_enc_response=multimodal_enc_response)
251
+ yield self.postprocess(gen_response, batch_size)
252
+ else:
253
+ for gen_response in self._generate(
254
+ preproc_response,
255
+ request,
256
+ multimodal_enc_response=multimodal_enc_response):
257
+ yield self.postprocess(gen_response, batch_size)
258
+
259
+ def encountered_stop_words(self, input_ids, stop_words_ids):
260
+ for stop_word_ids in stop_words_ids:
261
+ if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids):
262
+ return True
263
+ return False
264
+
265
+ def _spec_generate(
266
+ self, preproc: PreprocResponse,
267
+ request: Request) -> Generator[GenerationResponse, None, None]:
268
+
269
+ if preproc.input_ids.shape[0] > 1:
270
+ raise Exception(
271
+ "Speculative decoding does not support batch size > 1.")
272
+
273
+ prompt_input_ids: np.ndarray = preproc.input_ids[0]
274
+ input_ids: np.ndarray = prompt_input_ids
275
+ output_len: int = request.max_tokens[0][0]
276
+ last_input_ids: np.ndarray = None
277
+ draft_output_ids: np.ndarray = None
278
+ draft_logits: np.ndarray = None
279
+
280
+ target_response: GenerationResponse = None
281
+
282
+ cur_preproc = preproc
283
+
284
+ counter = 0
285
+ while True:
286
+ counter += 1
287
+ num_draft_tokens = min(
288
+ request.num_draft_tokens[0][0],
289
+ len(prompt_input_ids) + output_len - len(input_ids) - 1)
290
+
291
+ draft_request = None
292
+ if num_draft_tokens > 0:
293
+ request.min_length = np.array([num_draft_tokens],
294
+ dtype=np.int32)
295
+ draft_response: GenerationResponse = self._draft_generate_non_streaming(
296
+ cur_preproc, request, num_draft_tokens)
297
+ seq_len: int = draft_response.sequence_length[0][0]
298
+ # [1, beamWidth, outputLength] -> [outputLen]
299
+ draft_output_ids = draft_response.output_ids[0][0]
300
+ # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded]
301
+ if request.use_draft_logits is not None and request.use_draft_logits[
302
+ 0]:
303
+ if draft_response.generation_logits is not None:
304
+ draft_logits = draft_response.generation_logits[0][0]
305
+
306
+ input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
307
+ if len(input_draft_tokens) > 0:
308
+ draft_request = DraftRequest(
309
+ draft_input_ids=np.expand_dims(input_draft_tokens, 0))
310
+ if request.use_draft_logits is not None and request.use_draft_logits[
311
+ 0]:
312
+ draft_request.draft_logits = np.expand_dims(
313
+ draft_logits[-len(input_draft_tokens):], 0)
314
+ else:
315
+ draft_request = DraftRequest()
316
+ request.min_length = None
317
+ else:
318
+ draft_request = DraftRequest()
319
+ target_response = self._generate_non_streaming(
320
+ cur_preproc, request, draft_request)
321
+ last_input_ids = input_ids
322
+ input_ids = target_response.output_ids[0][0]
323
+ cur_preproc = PreprocResponse.with_new_inputs(
324
+ cur_preproc, np.expand_dims(input_ids, 0),
325
+ np.array([[len(input_ids)]], dtype=np.int32))
326
+
327
+ # Evaluate criteria to stop generation loop.
328
+ # If we've hit or exceeded the max output length, should stop
329
+ length_stop = (len(input_ids)
330
+ >= len(prompt_input_ids) + output_len)
331
+ if length_stop:
332
+ break
333
+ # If draft and target have same outputs, should stop. Normally target should return 1 more token.
334
+ # If they are the same length, they should differ at the last token
335
+ target_draft_equal = draft_output_ids is not None and np.array_equal(
336
+ draft_output_ids, input_ids)
337
+ if target_draft_equal:
338
+ break
339
+ # If tokens no longer change, we should stop; this means early stopping was hit
340
+ last_current_equal = np.array_equal(last_input_ids, input_ids)
341
+ if last_current_equal:
342
+ break
343
+ # Need to check if stop words were encountered
344
+ hit_stop_words = self.encountered_stop_words(
345
+ input_ids, preproc.stop_words_list[0])
346
+ if hit_stop_words:
347
+ break
348
+
349
+ yield target_response
350
+
351
+ def _draft_generate_non_streaming(
352
+ self, preproc: PreprocResponse, request: Request,
353
+ num_draft_tokens: int) -> GenerationResponse:
354
+ raise NotImplementedError()
355
+
356
+ def _multimodal_enc_generate(
357
+ self,
358
+ request: Request,
359
+ ) -> MultimodalEncResponse:
360
+ raise NotImplementedError()
361
+
362
+ def _generate(
363
+ self,
364
+ preproc: PreprocResponse,
365
+ request: Request,
366
+ draft_request: Optional[DraftRequest] = None,
367
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None,
368
+ ) -> Generator[GenerationResponse, None, None]:
369
+ raise NotImplementedError()
370
+
371
+ def _generate_non_streaming(
372
+ self,
373
+ preproc: PreprocResponse,
374
+ request: Request,
375
+ draft_request: Optional[DraftRequest] = None,
376
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None,
377
+ ) -> GenerationResponse:
378
+ raise NotImplementedError()
379
+
380
+ def postprocess(self, gen_response: GenerationResponse,
381
+ batch_size) -> Response:
382
+ if self._accumulate and self._streaming:
383
+ new_tokens: np.ndarray = gen_response.output_ids
384
+ if new_tokens.ndim != 3:
385
+ raise Exception("Expected output_ids tensor to have 3 dims.")
386
+ if new_tokens.shape[0] != 1:
387
+ raise Exception("Expected batch size of 1")
388
+ if new_tokens.shape[1] != 1:
389
+ raise Exception(
390
+ "Accumulation of tokens is only implemented for beam width = 1"
391
+ )
392
+
393
+ batch_index = gen_response.batch_index
394
+ if batch_index is not None:
395
+ if batch_index.ndim != 2:
396
+ raise Exception(
397
+ "Expected batch_index tensor to have 2 dims.")
398
+ if batch_index.shape[0] != 1:
399
+ raise Exception("Expected batch size of 1")
400
+ if batch_index.shape[1] != 1:
401
+ raise Exception("Expected only one batch_index")
402
+
403
+ batch_index = batch_index[0][0] if batch_index is not None else 0
404
+
405
+ self._accumulated_tokens[batch_index] = new_tokens if (
406
+ self._accumulated_tokens[batch_index]
407
+ is None) else np.concatenate(
408
+ (self._accumulated_tokens[batch_index], new_tokens),
409
+ axis=2)
410
+ sequence_lengths = np.array(
411
+ [[self._accumulated_tokens[batch_index].shape[2]]],
412
+ dtype=np.int32)
413
+ return self._postprocess(self._accumulated_tokens[batch_index],
414
+ sequence_lengths, gen_response)
415
+ else:
416
+ return self._postprocess(gen_response.output_ids, None,
417
+ gen_response)
418
+
419
+ def _postprocess(self, tokens: np.ndarray,
420
+ sequence_lengths: Optional[np.ndarray],
421
+ gen_response: GenerationResponse) -> Response:
422
+ raise NotImplementedError()
423
+
424
+ def preprocess(self, request: Request) -> PreprocResponse:
425
+ raise NotImplementedError()
426
+
427
+ def reset_decoder(self):
428
+ self._accumulated_tokens = []
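As a reference for the stop conditions evaluated at the end of the generation loop in Decoder._spec_generate above, the following is a minimal, self-contained sketch (toy values only, not part of the uploaded files; every name and number is illustrative) of the same numpy comparisons:

import numpy as np

prompt_input_ids = np.array([1, 2, 3])
output_len = 4                                   # requested max_tokens (example value)
last_input_ids = np.array([1, 2, 3, 7, 8])       # tokens after the previous round
input_ids = np.array([1, 2, 3, 7, 8, 9])         # target output after this round
draft_output_ids = np.array([1, 2, 3, 7, 8, 9])  # draft agreed with the target exactly
stop_words_ids = [np.array([42])]                # one stop-word token sequence

# These four checks mirror the break conditions in _spec_generate.
length_stop = len(input_ids) >= len(prompt_input_ids) + output_len
target_draft_equal = np.array_equal(draft_output_ids, input_ids)
last_current_equal = np.array_equal(last_input_ids, input_ids)
hit_stop_words = any(
    np.array_equal(input_ids[-len(sw):], sw) for sw in stop_words_ids)

print(length_stop, target_draft_equal, last_current_equal, hit_stop_words)
# False True False False -> the loop would break because draft and target agree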
tensorrt_llm_bls/1/lib/triton_decoder.py ADDED
@@ -0,0 +1,542 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ from collections.abc import Callable
28
+ from typing import Dict, Optional
29
+
30
+ import numpy as np
31
+ import triton_python_backend_utils as pb_utils
32
+ from lib.decode import *
33
+ from torch.utils.dlpack import from_dlpack, to_dlpack
34
+ from typing_extensions import override
35
+
36
+
37
+ class TritonDecoder(Decoder):
38
+
39
+ def __init__(self,
40
+ streaming=False,
41
+ accumulate=False,
42
+ preproc_model_name="preprocessing",
43
+ postproc_model_name="postprocessing",
44
+ llm_model_name="tensorrt_llm",
45
+ draft_llm_model_name: Optional[str] = None,
46
+ multimodal_encoders_name: Optional[str] = None):
47
+ super().__init__(streaming=streaming, accumulate=accumulate)
48
+ self.preproc_model_name = preproc_model_name
49
+ self.postproc_model_name = postproc_model_name
50
+ self.llm_model_name = llm_model_name
51
+ self.draft_llm_model_name = draft_llm_model_name
52
+ self.multimodal_encoders_name = multimodal_encoders_name
53
+
54
+ self._preproc_outputs = [
55
+ "INPUT_ID",
56
+ "DECODER_INPUT_ID",
57
+ "REQUEST_INPUT_LEN",
58
+ "REQUEST_DECODER_INPUT_LEN",
59
+ "BAD_WORDS_IDS",
60
+ "STOP_WORDS_IDS",
61
+ "EMBEDDING_BIAS",
62
+ "OUT_PAD_ID",
63
+ "OUT_END_ID",
64
+ "OUT_PROMPT_TABLE_EXTRA_IDS",
65
+ "PIXEL_VALUES",
66
+ "IMAGE_SIZES",
67
+ "IS_VIDEO_INPUT",
68
+ ]
69
+
70
+ self._multimodal_enc_outputs = [
71
+ "OUT_PROMPT_EMBEDDING_TABLE", "OUT_PROMPT_VOCAB_SIZE"
72
+ ]
73
+
74
+ self._llm_outputs = [
75
+ "output_ids", "sequence_length", "cum_log_probs",
76
+ "output_log_probs", "context_logits", "generation_logits",
77
+ "batch_index", "sequence_index", "kv_cache_alloc_new_blocks",
78
+ "kv_cache_reused_blocks", "kv_cache_alloc_total_blocks"
79
+ ]
80
+
81
+ self._postproc_outputs = [
82
+ "OUTPUT",
83
+ ]
84
+
85
+ self.input_names = [
86
+ "text_input", "decoder_text_input", "image_input",
87
+ "image_bytes_input", "image_url_input", "video_bytes_input",
88
+ "max_tokens", "bad_words", "stop_words", "end_id", "pad_id",
89
+ "top_k", "top_p", "temperature", "length_penalty",
90
+ "repetition_penalty", "min_length", "presence_penalty",
91
+ "frequency_penalty", "random_seed", "return_log_probs",
92
+ "return_context_logits", "return_generation_logits", "beam_width",
93
+ "stream", "prompt_embedding_table", "prompt_vocab_size",
94
+ "prompt_table_extra_id", "embedding_bias_words",
95
+ "embedding_bias_weights", "num_draft_tokens", "use_draft_logits",
96
+ "lora_task_id", "lora_weights", "lora_config",
97
+ "exclude_input_in_output", "return_kv_cache_reuse_stats",
98
+ "guided_decoding_guide_type", "guided_decoding_guide"
99
+ ]
100
+
101
+ self.__undo_reshape_whitelist = {
102
+ "max_tokens", "end_id", "pad_id", "top_k", "top_p", "temperature",
103
+ "length_penalty", "repetition_penalty", "min_length",
104
+ "presence_penalty", "frequency_penalty", "random_seed",
105
+ "return_log_probs", "return_context_logits",
106
+ "return_generation_logits", "beam_width", "stream",
107
+ "prompt_vocab_size", "num_draft_tokens", "use_draft_logits",
108
+ "exclude_input_in_output", "return_kv_cache_reuse_stats",
109
+ "lora_weights", "lora_config", "lora_task_id"
110
+ }
111
+
112
+ def _exec_triton_request(self, request):
113
+ responses = request.exec(decoupled=True)
114
+ for r in responses:
115
+ if r.has_error():
116
+ raise pb_utils.TritonModelException(r.error().message())
117
+ yield r
118
+
119
+ def _exec_triton_request_single(self, request):
120
+ responses = request.exec(decoupled=False)
121
+ if responses.has_error():
122
+ raise pb_utils.TritonModelException(responses.error().message())
123
+ return responses
124
+
125
+ def create_triton_response(self, response: Response):
126
+ name_map = {
127
+ "text_output": "text_output",
128
+ "cum_log_probs": "cum_log_probs",
129
+ "output_log_probs": "output_log_probs",
130
+ "context_logits": "context_logits",
131
+ "generation_logits": "generation_logits",
132
+ "batch_index": "batch_index",
133
+ "sequence_index": "sequence_index",
134
+ "kv_cache_alloc_new_blocks": "kv_cache_alloc_new_blocks",
135
+ "kv_cache_reused_blocks": "kv_cache_reused_blocks",
136
+ "kv_cache_alloc_total_blocks": "kv_cache_alloc_total_blocks"
137
+ }
138
+ tensors = self.create_triton_tensors(response, name_map)
139
+ return pb_utils.InferenceResponse(output_tensors=tensors)
140
+
141
+ def convert_triton_request(self, triton_request) -> Request:
142
+ request = Request()
143
+ for triton_name in self.input_names:
144
+ tensor = pb_utils.get_input_tensor_by_name(triton_request,
145
+ triton_name)
146
+ target_name = triton_name
147
+ if tensor is None:
148
+ continue
149
+ if not hasattr(request, target_name):
150
+ raise AttributeError(
151
+ f"Request has no attribute '{target_name}'")
152
+ setattr(request, target_name, tensor.as_numpy())
153
+ return request
154
+
155
+ def convert_triton_response(self,
156
+ triton_response,
157
+ response_factory: Callable,
158
+ name_map=None):
159
+ response = response_factory()
160
+ for tensor in triton_response.output_tensors():
161
+ if tensor is None:
162
+ continue
163
+ triton_name = tensor.name()
164
+ if tensor.is_cpu():
165
+ value = tensor.as_numpy()
166
+ else:
167
+ # If the tensor is in GPU memory make it torch.Tensor type
168
+ value = from_dlpack(tensor.to_dlpack())
169
+ target_name = triton_name
170
+ if name_map and triton_name in name_map:
171
+ target_name = name_map[triton_name]
172
+ if name_map and triton_name not in name_map:
173
+ continue
174
+ if target_name is None:
175
+ # explicitly ignore this triton input
176
+ continue
177
+ if not hasattr(response, target_name):
178
+ raise AttributeError(
179
+ f"response object has no attribute '{target_name}'")
180
+ setattr(response, target_name, value)
181
+ return response
182
+
183
+ def __undo_reshape(self, x, name):
184
+ if name in self.__undo_reshape_whitelist and len(x.shape) == 1:
185
+ # handle reshapes
186
+ return np.expand_dims(x, 0)
187
+ else:
188
+ return x
189
+
190
+ def create_triton_tensors(self, obj, name_map: dict):
191
+ tensors = []
192
+ for name, triton_name in name_map.items():
193
+ if triton_name is None:
194
+ continue
195
+ value = getattr(obj, name)
196
+ if value is None:
197
+ continue
198
+ if isinstance(value, np.ndarray):
199
+ t = pb_utils.Tensor(triton_name,
200
+ self.__undo_reshape(value, name))
201
+ elif isinstance(value, torch.Tensor):
202
+ t = pb_utils.Tensor.from_dlpack(
203
+ triton_name, to_dlpack(self.__undo_reshape(value, name)))
204
+ tensors.append(t)
205
+ return tensors
206
+
207
+ @override
208
+ def preprocess(self, request: Request) -> PreprocResponse:
209
+ input_tensors = self._get_preproc_tensors(request)
210
+ triton_req = pb_utils.InferenceRequest(
211
+ model_name=self.preproc_model_name,
212
+ inputs=input_tensors,
213
+ requested_output_names=self._preproc_outputs)
214
+ triton_output = self._exec_triton_request_single(triton_req)
215
+ return self._get_preproc_response(triton_output)
216
+
217
+ def _get_preproc_tensors(self, request: Request):
218
+ name_map = {
219
+ "text_input": "QUERY",
220
+ "image_bytes_input": "IMAGE_BYTES",
221
+ "image_url_input": "IMAGE_URL",
222
+ "video_bytes_input": "VIDEO_BYTES",
223
+ "decoder_text_input": "DECODER_QUERY",
224
+ "max_tokens": "REQUEST_OUTPUT_LEN",
225
+ "bad_words": "BAD_WORDS_DICT",
226
+ "stop_words": "STOP_WORDS_DICT",
227
+ "embedding_bias_words": "EMBEDDING_BIAS_WORDS",
228
+ "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS",
229
+ "pad_id": "PAD_ID",
230
+ "end_id": "END_ID",
231
+ "prompt_table_extra_id": "PROMPT_TABLE_EXTRA_ID",
232
+ }
233
+ return self.create_triton_tensors(request, name_map)
234
+
235
+ def _get_preproc_response(self, triton_output):
236
+ name_map = {
237
+ "INPUT_ID": "input_ids",
238
+ "DECODER_INPUT_ID": "decoder_input_ids",
239
+ "REQUEST_INPUT_LEN": "input_lengths",
240
+ "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
241
+ "BAD_WORDS_IDS": "bad_words_list",
242
+ "STOP_WORDS_IDS": "stop_words_list",
243
+ "EMBEDDING_BIAS": "embedding_bias",
244
+ "OUT_PAD_ID": "pad_id",
245
+ "OUT_END_ID": "end_id",
246
+ "OUT_PROMPT_TABLE_EXTRA_IDS": "prompt_table_extra_ids",
247
+ "PIXEL_VALUES": "pixel_values",
248
+ "IMAGE_SIZES": "image_sizes",
249
+ "IS_VIDEO_INPUT": "is_video_input",
250
+ }
251
+ return self.convert_triton_response(triton_output, PreprocResponse,
252
+ name_map)
253
+
254
+ @override
255
+ def _multimodal_enc_generate(
256
+ self,
257
+ request: Request,
258
+ preproc: PreprocResponse,
259
+ ) -> MultimodalEncResponse:
260
+ input_tensors = self._get_multimodal_enc_tensors(request, preproc)
261
+ triton_req = pb_utils.InferenceRequest(
262
+ model_name=self.multimodal_encoders_name,
263
+ inputs=input_tensors,
264
+ requested_output_names=self._multimodal_enc_outputs)
265
+ triton_output = self._exec_triton_request_single(triton_req)
266
+ return self._get_multimodal_enc_response(triton_output)
267
+
268
+ def _get_multimodal_enc_tensors(self, request: Request,
269
+ preproc: PreprocResponse):
270
+ name_map_request = {
271
+ "image_input": "IMAGE",
272
+ }
273
+ name_map_preproc = {
274
+ "pixel_values": "pixel_values",
275
+ "image_sizes": "image_sizes",
276
+ "is_video_input": "is_video_input"
277
+ }
278
+ tensors = []
279
+ tensors.extend(self.create_triton_tensors(request, name_map_request))
280
+ tensors.extend(self.create_triton_tensors(preproc, name_map_preproc))
281
+ return tensors
282
+
283
+ def _get_multimodal_enc_response(self, triton_output):
284
+ name_map = {
285
+ "OUT_PROMPT_EMBEDDING_TABLE": "prompt_embedding_table",
286
+ "OUT_PROMPT_VOCAB_SIZE": "prompt_vocab_size",
287
+ }
288
+ return self.convert_triton_response(triton_output,
289
+ MultimodalEncResponse, name_map)
290
+
291
+ @override
292
+ def _draft_generate_non_streaming(
293
+ self, preproc: PreprocResponse, request: Request,
294
+ num_draft_tokens: int) -> GenerationResponse:
295
+ input_tensors = self._get_llm_tensors(preproc, request,
296
+ num_draft_tokens, None, True)
297
+ triton_req = pb_utils.InferenceRequest(
298
+ model_name=self.draft_llm_model_name,
299
+ inputs=input_tensors,
300
+ requested_output_names=self._llm_outputs)
301
+ triton_response = self._exec_triton_request_single(triton_req)
302
+ llm_response = self._get_llm_response(triton_response)
303
+ return llm_response
304
+
305
+ @override
306
+ def _generate(
307
+ self,
308
+ preproc: PreprocResponse,
309
+ request: Request,
310
+ draft_request: Optional[DraftRequest] = None,
311
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None
312
+ ) -> Generator[GenerationResponse, None, None]:
313
+ input_tensors = self._get_llm_tensors(
314
+ preproc,
315
+ request,
316
+ None,
317
+ draft_request,
318
+ multimodal_enc_response=multimodal_enc_response)
319
+ triton_req = pb_utils.InferenceRequest(
320
+ model_name=self.llm_model_name,
321
+ inputs=input_tensors,
322
+ requested_output_names=self._llm_outputs)
323
+ for r in self._exec_triton_request(triton_req):
324
+ yield self._get_llm_response(r)
325
+
326
+ @override
327
+ def _generate_non_streaming(
328
+ self,
329
+ preproc: PreprocResponse,
330
+ request: Request,
331
+ draft_request: Optional[DraftRequest] = None,
332
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None
333
+ ) -> GenerationResponse:
334
+ input_tensors = self._get_llm_tensors(
335
+ preproc,
336
+ request,
337
+ None,
338
+ draft_request,
339
+ multimodal_enc_response=multimodal_enc_response)
340
+ triton_req = pb_utils.InferenceRequest(
341
+ model_name=self.llm_model_name,
342
+ inputs=input_tensors,
343
+ requested_output_names=self._llm_outputs)
344
+ r = self._exec_triton_request_single(triton_req)
345
+ return self._get_llm_response(r)
346
+
347
+ def _get_llm_tensors(
348
+ self,
349
+ preproc: PreprocResponse,
350
+ request: Request,
351
+ num_output_tokens: Optional[int] = None,
352
+ draft_request: Optional[DraftRequest] = None,
353
+ is_draft_model_request: bool = False,
354
+ multimodal_enc_response: MultimodalEncResponse = None):
355
+ tensors = []
356
+ tensors.extend(self._get_tensors_from_preproc(preproc))
357
+ if multimodal_enc_response is not None:
358
+ tensors.extend(
359
+ self._get_tensors_from_multimodal_enc(multimodal_enc_response))
360
+ tensors.extend(
361
+ self._get_llm_tensors_from_request(request, num_output_tokens,
362
+ draft_request,
363
+ is_draft_model_request))
364
+ return tensors
365
+
366
+ def _get_tensors_from_preproc(self, preproc: PreprocResponse):
367
+ name_map = {
368
+ "input_ids": "input_ids",
369
+ "decoder_input_ids": "decoder_input_ids",
370
+ "input_lengths": "input_lengths",
371
+ "bad_words_list": "bad_words_list",
372
+ "stop_words_list": "stop_words_list",
373
+ "embedding_bias": "embedding_bias",
374
+ "pad_id": "pad_id",
375
+ "end_id": "end_id",
376
+ "prompt_table_extra_ids": "prompt_table_extra_ids",
377
+ }
378
+ return self.create_triton_tensors(preproc, name_map)
379
+
380
+ def _get_tensors_from_multimodal_enc(
381
+ self, multimodal_enc_response: MultimodalEncResponse):
382
+ name_map = {
383
+ "prompt_embedding_table": "prompt_embedding_table",
384
+ "prompt_vocab_size": "prompt_vocab_size",
385
+ }
386
+ return self.create_triton_tensors(multimodal_enc_response, name_map)
387
+
388
+ def _get_llm_tensors_from_request(
389
+ self,
390
+ request: Request,
391
+ num_output_tokens: Optional[int] = None,
392
+ draft_request: Optional[DraftRequest] = None,
393
+ is_draft_model_request: bool = False):
394
+ name_map: Dict[str, Optional[str]] = {
395
+ "beam_width": "beam_width",
396
+ "top_k": "runtime_top_k",
397
+ "top_p": "runtime_top_p",
398
+ "temperature": "temperature",
399
+ "length_penalty": "len_penalty",
400
+ "repetition_penalty": "repetition_penalty",
401
+ "min_length": "min_length",
402
+ "presence_penalty": "presence_penalty",
403
+ "frequency_penalty": "frequency_penalty",
404
+ "random_seed": "random_seed",
405
+ "return_log_probs": "return_log_probs",
406
+ "stream": "streaming",
407
+ "prompt_embedding_table": "prompt_embedding_table",
408
+ "prompt_vocab_size": "prompt_vocab_size",
409
+ "lora_task_id": "lora_task_id",
410
+ "lora_weights": "lora_weights",
411
+ "lora_config": "lora_config",
412
+ "exclude_input_in_output": "exclude_input_in_output",
413
+ "return_kv_cache_reuse_stats": "return_kv_cache_reuse_stats",
414
+ "guided_decoding_guide_type": "guided_decoding_guide_type",
415
+ "guided_decoding_guide": "guided_decoding_guide"
416
+ }
417
+ batch_size = request.text_input.shape[0]
418
+ tensors = self.create_triton_tensors(request, name_map)
419
+ out_len_tensor = None
420
+ if request.max_tokens is not None:
421
+ out_len_tensor = request.max_tokens
422
+
423
+ out_len = None
424
+ if num_output_tokens is not None:
425
+ out_len = num_output_tokens
426
+ elif draft_request:
427
+ out_len = len(
428
+ draft_request.draft_input_ids[0]
429
+ ) + 1 if draft_request.draft_input_ids is not None else 1
430
+
431
+ if out_len is not None:
432
+ out_len_tensor = [[out_len]] * batch_size
433
+
434
+ if out_len_tensor is None:
435
+ raise Exception("Could not determine request_output_len")
436
+ else:
437
+ tensors.append(
438
+ pb_utils.Tensor("request_output_len",
439
+ np.array(out_len_tensor, dtype=np.int32)))
440
+
441
+ if draft_request:
442
+ if draft_request.draft_input_ids is not None:
443
+ tensors.append(
444
+ pb_utils.Tensor("draft_input_ids",
445
+ draft_request.draft_input_ids))
446
+ if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[
447
+ 0]:
448
+ tensors.append(
449
+ pb_utils.Tensor("draft_logits",
450
+ draft_request.draft_logits))
451
+
452
+ return_context_logits_data = [False]
453
+ return_generation_logits_data = [False]
454
+ if draft_request is None:
455
+ if is_draft_model_request:
456
+ return_generation_logits_data = request.use_draft_logits if request.use_draft_logits is not None else [
457
+ False
458
+ ]
459
+ else:
460
+ return_context_logits_data = request.return_context_logits if request.return_context_logits is not None else [
461
+ False
462
+ ]
463
+ return_generation_logits_data = request.return_generation_logits if request.return_generation_logits is not None else [
464
+ False
465
+ ]
466
+ return_context_logits = np.array([return_context_logits_data] *
467
+ batch_size,
468
+ dtype=bool)
469
+ return_generation_logits = np.array([return_generation_logits_data] *
470
+ batch_size,
471
+ dtype=bool)
472
+
473
+ assert len(return_context_logits.shape) == 2
474
+ assert len(return_generation_logits.shape) == 2
475
+
476
+ tensors.append(
477
+ pb_utils.Tensor("return_context_logits", return_context_logits))
478
+ tensors.append(
479
+ pb_utils.Tensor("return_generation_logits",
480
+ return_generation_logits))
481
+ return tensors
482
+
483
+ def _get_llm_response(self, triton_output):
484
+ name_map = {
485
+ "output_ids": "output_ids",
486
+ "sequence_length": "sequence_length",
487
+ "cum_log_probs": "cum_log_probs",
488
+ "output_log_probs": "output_log_probs",
489
+ "context_logits": "context_logits",
490
+ "generation_logits": "generation_logits",
491
+ "batch_index": "batch_index",
492
+ "sequence_index": "sequence_index",
493
+ "kv_cache_alloc_new_blocks": "kv_cache_alloc_new_blocks",
494
+ "kv_cache_reused_blocks": "kv_cache_reused_blocks",
495
+ "kv_cache_alloc_total_blocks": "kv_cache_alloc_total_blocks"
496
+ }
497
+ return self.convert_triton_response(triton_output, GenerationResponse,
498
+ name_map)
499
+
500
+ def _postprocess(self, tokens: np.ndarray,
501
+ sequence_lengths: Optional[np.ndarray],
502
+ gen_response: GenerationResponse) -> Response:
503
+ input_tensors = self._get_postproc_tensors(tokens, sequence_lengths,
504
+ gen_response)
505
+ triton_req = pb_utils.InferenceRequest(
506
+ model_name=self.postproc_model_name,
507
+ inputs=input_tensors,
508
+ requested_output_names=self._postproc_outputs)
509
+ r = self._exec_triton_request_single(triton_req)
510
+ response = self._get_response(r, gen_response)
511
+ return response
512
+
513
+ def _get_postproc_tensors(self, tokens: np.ndarray,
514
+ sequence_lengths: Optional[np.ndarray],
515
+ gen_response: GenerationResponse):
516
+ tensors = [
517
+ pb_utils.Tensor("TOKENS_BATCH", tokens),
518
+ pb_utils.Tensor(
519
+ "SEQUENCE_LENGTH", sequence_lengths
520
+ if sequence_lengths else gen_response.sequence_length)
521
+ ]
522
+ return tensors
523
+
524
+ def _get_response(self, triton_output, gen_res: GenerationResponse):
525
+ tensors = triton_output.output_tensors()
526
+ t_map = {}
527
+ for named_t in tensors:
528
+ name = named_t.name()
529
+ t = named_t.as_numpy()
530
+ t_map[name] = t
531
+ response = Response(
532
+ text_output=t_map["OUTPUT"],
533
+ cum_log_probs=gen_res.cum_log_probs,
534
+ output_log_probs=gen_res.output_log_probs,
535
+ context_logits=gen_res.context_logits,
536
+ generation_logits=gen_res.generation_logits,
537
+ batch_index=gen_res.batch_index,
538
+ sequence_index=gen_res.sequence_index,
539
+ kv_cache_alloc_new_blocks=gen_res.kv_cache_alloc_new_blocks,
540
+ kv_cache_reused_blocks=gen_res.kv_cache_reused_blocks,
541
+ kv_cache_alloc_total_blocks=gen_res.kv_cache_alloc_total_blocks)
542
+ return response
tensorrt_llm_bls/1/model.py ADDED
@@ -0,0 +1,146 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+ import traceback
29
+
30
+ import triton_python_backend_utils as pb_utils
31
+ from lib.triton_decoder import TritonDecoder
32
+
33
+
34
+ def get_valid_param_value(param, default_value=''):
35
+ value = param.get('string_value', '')
36
+ return default_value if value.startswith('${') or value == '' else value
37
+
38
+
39
+ class TritonPythonModel:
40
+
41
+ def initialize(self, args):
42
+
43
+ # Parse model configs
44
+ model_config = json.loads(args['model_config'])
45
+
46
+ params = model_config['parameters']
47
+
48
+ accumulate_tokens_str = get_valid_param_value(
49
+ params.get('accumulate_tokens', {}))
50
+ self.accumulate_tokens = accumulate_tokens_str.lower() in [
51
+ 'true', 'yes', '1', 't'
52
+ ]
53
+
54
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
55
+ model_config)
56
+
57
+ self.logger = pb_utils.Logger
58
+
59
+ default_tensorrt_llm_model_name = 'tensorrt_llm'
60
+ self.llm_model_name = get_valid_param_value(
61
+ params.get('tensorrt_llm_model_name', {}),
62
+ default_tensorrt_llm_model_name)
63
+
64
+ self.draft_llm_model_name = get_valid_param_value(
65
+ params.get('tensorrt_llm_draft_model_name', {}), None)
66
+
67
+ self.multimodal_encoders_name = get_valid_param_value(
68
+ params.get('multimodal_encoders_name', {}), None)
69
+
70
+ self.decoder = TritonDecoder(
71
+ streaming=self.decoupled,
72
+ accumulate=self.accumulate_tokens,
73
+ preproc_model_name="preprocessing",
74
+ postproc_model_name="postprocessing",
75
+ llm_model_name=self.llm_model_name,
76
+ draft_llm_model_name=self.draft_llm_model_name,
77
+ multimodal_encoders_name=self.multimodal_encoders_name)
78
+
79
+ def execute(self, requests):
80
+
81
+ responses = []
82
+
83
+ for request in requests:
84
+ if self.decoupled:
85
+ response_sender = request.get_response_sender()
86
+ try:
87
+
88
+ req = self.decoder.convert_triton_request(request)
89
+ req.validate()
90
+ speculative_decode = (req.num_draft_tokens is not None
91
+ and req.num_draft_tokens[0][0] > 0)
92
+ if speculative_decode and (self.draft_llm_model_name is None
93
+ or self.draft_llm_model_name == ""):
94
+ raise Exception(
95
+ "cannot perform speculative decoding without draft model"
96
+ )
97
+ is_multimodal = req.image_input is not None or req.image_bytes_input is not None or req.image_url_input is not None or req.video_bytes_input is not None
98
+
99
+ if speculative_decode and is_multimodal:
100
+ raise Exception(
101
+ "Multimodal and speculative decoding is not currently supported"
102
+ )
103
+ res_gen = self.decoder.decode(
104
+ req,
105
+ speculative_decoding=speculative_decode,
106
+ is_multimodal=is_multimodal)
107
+
108
+ for res in res_gen:
109
+ triton_response = self.decoder.create_triton_response(res)
110
+ if self.decoupled:
111
+ response_sender.send(triton_response)
112
+ else:
113
+ responses.append(triton_response)
114
+
115
+ if self.decoupled:
116
+ response_sender.send(
117
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
118
+
119
+ except Exception:
120
+ self.logger.log_error(traceback.format_exc())
121
+ # If encountering an error, send a response with the error message
122
+ error_response = pb_utils.InferenceResponse(
123
+ output_tensors=[],
124
+ error=pb_utils.TritonError(traceback.format_exc()))
125
+
126
+ if self.decoupled:
127
+ response_sender.send(error_response)
128
+ response_sender.send(
129
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
130
+ else:
131
+ responses.append(error_response)
132
+
133
+ self.decoder.reset_decoder()
134
+
135
+ if self.decoupled:
136
+ return None
137
+ else:
138
+ assert len(responses) == len(requests)
139
+ return responses
140
+
141
+ def finalize(self):
142
+ """`finalize` is called only once when the model is being unloaded.
143
+ Implementing `finalize` function is optional. This function allows
144
+ the model to perform any necessary clean ups before exit.
145
+ """
146
+ print('Cleaning up...')
tensorrt_llm_bls/config.pbtxt ADDED
@@ -0,0 +1,388 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm_bls"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+
31
+ model_transaction_policy {
32
+ decoupled: True
33
+ }
34
+
35
+ input [
36
+ {
37
+ name: "text_input"
38
+ data_type: TYPE_STRING
39
+ dims: [ 1 ]
40
+ },
41
+ {
42
+ name: "decoder_text_input"
43
+ data_type: TYPE_STRING
44
+ dims: [ 1 ]
45
+ optional: true
46
+ },
47
+ {
48
+ name: "image_input"
49
+ data_type: TYPE_FP16
50
+ dims: [ -1, 3, -1, -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "image_bytes_input"
55
+ data_type: TYPE_UINT8
56
+ dims: [ -1, -1, -1, -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "image_url_input"
61
+ data_type: TYPE_STRING
62
+ dims: [ 1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "video_bytes_input"
67
+ data_type: TYPE_UINT8
68
+ dims: [ -1, -1, -1, -1 ]
69
+ optional: true
70
+ },
71
+ {
72
+ name: "max_tokens"
73
+ data_type: TYPE_INT32
74
+ dims: [ 1 ]
75
+ },
76
+ {
77
+ name: "bad_words"
78
+ data_type: TYPE_STRING
79
+ dims: [ -1 ]
80
+ optional: true
81
+ },
82
+ {
83
+ name: "stop_words"
84
+ data_type: TYPE_STRING
85
+ dims: [ -1 ]
86
+ optional: true
87
+ },
88
+ {
89
+ name: "exclude_input_in_output"
90
+ data_type: TYPE_BOOL
91
+ dims: [ 1 ]
92
+ optional: true
93
+ },
94
+ {
95
+ name: "end_id"
96
+ data_type: TYPE_INT32
97
+ dims: [ 1 ]
98
+ optional: true
99
+ },
100
+ {
101
+ name: "pad_id"
102
+ data_type: TYPE_INT32
103
+ dims: [ 1 ]
104
+ optional: true
105
+ },
106
+ {
107
+ name: "top_k"
108
+ data_type: TYPE_INT32
109
+ dims: [ 1 ]
110
+ optional: true
111
+ },
112
+ {
113
+ name: "top_p"
114
+ data_type: TYPE_FP32
115
+ dims: [ 1 ]
116
+ optional: true
117
+ },
118
+ {
119
+ name: "temperature"
120
+ data_type: TYPE_FP32
121
+ dims: [ 1 ]
122
+ optional: true
123
+ },
124
+ {
125
+ name: "length_penalty"
126
+ data_type: TYPE_FP32
127
+ dims: [ 1 ]
128
+ optional: true
129
+ },
130
+ {
131
+ name: "repetition_penalty"
132
+ data_type: TYPE_FP32
133
+ dims: [ 1 ]
134
+ optional: true
135
+ },
136
+ {
137
+ name: "min_length"
138
+ data_type: TYPE_INT32
139
+ dims: [ 1 ]
140
+ optional: true
141
+ },
142
+ {
143
+ name: "presence_penalty"
144
+ data_type: TYPE_FP32
145
+ dims: [ 1 ]
146
+ optional: true
147
+ },
148
+ {
149
+ name: "frequency_penalty"
150
+ data_type: TYPE_FP32
151
+ dims: [ 1 ]
152
+ optional: true
153
+ },
154
+ {
155
+ name: "random_seed"
156
+ data_type: TYPE_UINT64
157
+ dims: [ 1 ]
158
+ optional: true
159
+ },
160
+ {
161
+ name: "return_log_probs"
162
+ data_type: TYPE_BOOL
163
+ dims: [ 1 ]
164
+ reshape: { shape: [ ] }
165
+ optional: true
166
+ },
167
+ {
168
+ name: "return_context_logits"
169
+ data_type: TYPE_BOOL
170
+ dims: [ 1 ]
171
+ reshape: { shape: [ ] }
172
+ optional: true
173
+ },
174
+ {
175
+ name: "return_generation_logits"
176
+ data_type: TYPE_BOOL
177
+ dims: [ 1 ]
178
+ reshape: { shape: [ ] }
179
+ optional: true
180
+ },
181
+ {
182
+ name: "num_return_sequences"
183
+ data_type: TYPE_INT32
184
+ dims: [ 1 ]
185
+ reshape: { shape: [ ] }
186
+ optional: true
187
+ },
188
+ {
189
+ name: "beam_width"
190
+ data_type: TYPE_INT32
191
+ dims: [ 1 ]
192
+ optional: true
193
+ },
194
+ {
195
+ name: "stream"
196
+ data_type: TYPE_BOOL
197
+ dims: [ 1 ]
198
+ optional: true
199
+ },
200
+ {
201
+ name: "prompt_embedding_table"
202
+ data_type: TYPE_FP16
203
+ dims: [ -1, -1 ]
204
+ optional: true
205
+ },
206
+ {
207
+ name: "prompt_vocab_size"
208
+ data_type: TYPE_INT32
209
+ dims: [ 1 ]
210
+ optional: true
211
+ },
212
+ {
213
+ name: "prompt_table_extra_id"
214
+ data_type: TYPE_UINT64
215
+ dims: [ 1 ]
216
+ optional: true
217
+ },
218
+ {
219
+ name: "embedding_bias_words"
220
+ data_type: TYPE_STRING
221
+ dims: [ -1 ]
222
+ optional: true
223
+ },
224
+ {
225
+ name: "embedding_bias_weights"
226
+ data_type: TYPE_FP32
227
+ dims: [ -1 ]
228
+ optional: true
229
+ },
230
+ {
231
+ name: "num_draft_tokens",
232
+ data_type: TYPE_INT32,
233
+ dims: [ 1 ]
234
+ optional: true
235
+ },
236
+ {
237
+ name: "use_draft_logits",
238
+ data_type: TYPE_BOOL,
239
+ dims: [ 1 ]
240
+ reshape: { shape: [ ] }
241
+ optional: true
242
+ },
243
+ # the unique task ID for the given LoRA.
244
+ # To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights` and `lora_config` must all be given.
245
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
246
+ # If the cache is full, the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
247
+ {
248
+ name: "lora_task_id"
249
+ data_type: TYPE_UINT64
250
+ dims: [ 1 ]
251
+ reshape: { shape: [ ] }
252
+ optional: true
253
+ },
254
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
255
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
256
+ # each of the in / out tensors is first flattened and then concatenated together in the format above.
257
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
258
+ {
259
+ name: "lora_weights"
260
+ data_type: TYPE_FP16
261
+ dims: [ -1, -1 ]
262
+ optional: true
263
+ allow_ragged_batch: true
264
+ },
265
+ # module identifier (same size as the first dimension of lora_weights)
266
+ # See LoraModule::ModuleType for the module id mapping
267
+ #
268
+ # "attn_qkv": 0 # combined qkv adapter
269
+ # "attn_q": 1 # q adapter
270
+ # "attn_k": 2 # k adapter
271
+ # "attn_v": 3 # v adapter
272
+ # "attn_dense": 4 # adapter for the dense layer in attention
273
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
274
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
275
+ # "mlp_gate": 7 # for llama2 adapter for gated mlp layer after attention / RMSNorm: gate
276
+ #
277
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
278
+ {
279
+ name: "lora_config"
280
+ data_type: TYPE_INT32
281
+ dims: [ -1, 3 ]
282
+ optional: true
283
+ allow_ragged_batch: true
284
+ },
285
+ {
286
+ name: "return_kv_cache_reuse_stats"
287
+ data_type: TYPE_BOOL
288
+ dims: [ 1 ]
289
+ reshape: { shape: [ ] }
290
+ optional: true
291
+ },
292
+ {
293
+ name: "guided_decoding_guide_type"
294
+ data_type: TYPE_STRING
295
+ dims: [ 1 ]
296
+ optional: true
297
+ },
298
+ {
299
+ name: "guided_decoding_guide"
300
+ data_type: TYPE_STRING
301
+ dims: [ 1 ]
302
+ optional: true
303
+ }
304
+ ]
305
+ output [
306
+ {
307
+ name: "text_output"
308
+ data_type: TYPE_STRING
309
+ dims: [ -1 ]
310
+ },
311
+ {
312
+ name: "cum_log_probs"
313
+ data_type: TYPE_FP32
314
+ dims: [ -1 ]
315
+ },
316
+ {
317
+ name: "output_log_probs"
318
+ data_type: TYPE_FP32
319
+ dims: [ -1, -1 ]
320
+ },
321
+ {
322
+ name: "context_logits"
323
+ data_type: TYPE_FP16
324
+ dims: [ -1, -1 ]
325
+ },
326
+ {
327
+ name: "generation_logits"
328
+ data_type: TYPE_FP16
329
+ dims: [ -1, -1, -1 ]
330
+ },
331
+ {
332
+ name: "batch_index"
333
+ data_type: TYPE_INT32
334
+ dims: [ 1 ]
335
+ },
336
+ {
337
+ name: "sequence_index"
338
+ data_type: TYPE_INT32
339
+ dims: [ 1 ]
340
+ },
341
+ {
342
+ name: "kv_cache_alloc_new_blocks"
343
+ data_type: TYPE_INT32
344
+ dims: [ 1 ]
345
+ },
346
+ {
347
+ name: "kv_cache_reused_blocks"
348
+ data_type: TYPE_INT32
349
+ dims: [ 1 ]
350
+ },
351
+ {
352
+ name: "kv_cache_alloc_total_blocks"
353
+ data_type: TYPE_INT32
354
+ dims: [ 1 ]
355
+ }
356
+ ]
357
+
358
+ parameters: {
359
+ key: "accumulate_tokens"
360
+ value: {
361
+ string_value: "${accumulate_tokens}"
362
+ }
363
+ }
364
+ parameters: {
365
+ key: "tensorrt_llm_model_name"
366
+ value: {
367
+ string_value: "tensorrt_llm"
368
+ }
369
+ }
370
+ parameters: {
371
+ key: "tensorrt_llm_draft_model_name"
372
+ value: {
373
+ string_value: "${tensorrt_llm_draft_model_name}"
374
+ }
375
+ }
376
+ parameters: {
377
+ key: "multimodal_encoders_name"
378
+ value: {
379
+ string_value: "${multimodal_encoders_name}"
380
+ }
381
+ }
382
+
383
+ instance_group [
384
+ {
385
+ count: 1
386
+ kind : KIND_CPU
387
+ }
388
+ ]
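To make the lora_weights / lora_config layout described in the comments of the config above concrete, here is a minimal numpy sketch (hypothetical; every size, the module id usage and the task id are made-up example values, not taken from this repository):

import numpy as np

adapter_size = 8          # D, the LoRA rank / R value (example value)
hidden_in = 4096          # Hi (example value)
hidden_out = 4096         # Ho (example value)
num_layers = 2            # number of (module, layer) rows (example value)
attn_qkv_module_id = 0    # per the module-id mapping comment above

weight_rows = []
config_rows = []
for layer_idx in range(num_layers):
    # in-weights [D, Hi] and out-weights [Ho, D], each flattened, then concatenated
    w_in = np.zeros((adapter_size, hidden_in), dtype=np.float16)
    w_out = np.zeros((hidden_out, adapter_size), dtype=np.float16)
    weight_rows.append(np.concatenate([w_in.reshape(-1), w_out.reshape(-1)]))
    config_rows.append([attn_qkv_module_id, layer_idx, adapter_size])

# lora_weights: [ num_lora_modules_layers, D x Hi + Ho x D ], matching TYPE_FP16
lora_weights = np.stack(weight_rows).astype(np.float16)
# lora_config: [ num_lora_modules_layers, 3 ] = [ module_id, layer_idx, adapter_size ]
lora_config = np.array(config_rows, dtype=np.int32)
# lora_task_id: a single uint64; later requests for the same task can send only this
lora_task_id = np.array([1], dtype=np.uint64)

print(lora_weights.shape, lora_config.shape, lora_task_id.shape)

A real client request would typically also include the batch dimension implied by max_batch_size and send these arrays under the input names defined in the config above.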