otherhalf-dev committed
Commit a2eca76 · verified · 1 Parent(s): 1f1bf26

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tensorrt_llm/1/rank0.engine filter=lfs diff=lfs merge=lfs -text
37
+ tensorrt_llm/1/rank1.engine filter=lfs diff=lfs merge=lfs -text
ensemble/1/.tmp ADDED
File without changes
ensemble/config.pbtxt ADDED
@@ -0,0 +1,606 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "ensemble"
28
+ platform: "ensemble"
29
+ max_batch_size: 32
30
+ input [
31
+ {
32
+ name: "text_input"
33
+ data_type: TYPE_STRING
34
+ dims: [ 1 ]
35
+ },
36
+ {
37
+ name: "decoder_text_input"
38
+ data_type: TYPE_STRING
39
+ dims: [ 1 ]
40
+ optional: true
41
+ },
42
+ {
43
+ name: "max_tokens"
44
+ data_type: TYPE_INT32
45
+ dims: [ 1 ]
46
+ },
47
+ {
48
+ name: "num_return_sequences"
49
+ data_type: TYPE_INT32
50
+ dims: [ 1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "bad_words"
55
+ data_type: TYPE_STRING
56
+ dims: [ -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "stop_words"
61
+ data_type: TYPE_STRING
62
+ dims: [ -1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "exclude_input_in_output"
67
+ data_type: TYPE_BOOL
68
+ dims: [ 1 ]
69
+ optional: true
70
+ },
71
+ {
72
+ name: "end_id"
73
+ data_type: TYPE_INT32
74
+ dims: [ 1 ]
75
+ optional: true
76
+ },
77
+ {
78
+ name: "pad_id"
79
+ data_type: TYPE_INT32
80
+ dims: [ 1 ]
81
+ optional: true
82
+ },
83
+ {
84
+ name: "top_k"
85
+ data_type: TYPE_INT32
86
+ dims: [ 1 ]
87
+ optional: true
88
+ },
89
+ {
90
+ name: "top_p"
91
+ data_type: TYPE_FP32
92
+ dims: [ 1 ]
93
+ optional: true
94
+ },
95
+ {
96
+ name: "temperature"
97
+ data_type: TYPE_FP32
98
+ dims: [ 1 ]
99
+ optional: true
100
+ },
101
+ {
102
+ name: "length_penalty"
103
+ data_type: TYPE_FP32
104
+ dims: [ 1 ]
105
+ optional: true
106
+ },
107
+ {
108
+ name: "repetition_penalty"
109
+ data_type: TYPE_FP32
110
+ dims: [ 1 ]
111
+ optional: true
112
+ },
113
+ {
114
+ name: "min_length"
115
+ data_type: TYPE_INT32
116
+ dims: [ 1 ]
117
+ optional: true
118
+ },
119
+ {
120
+ name: "presence_penalty"
121
+ data_type: TYPE_FP32
122
+ dims: [ 1 ]
123
+ optional: true
124
+ },
125
+ {
126
+ name: "frequency_penalty"
127
+ data_type: TYPE_FP32
128
+ dims: [ 1 ]
129
+ optional: true
130
+ },
131
+ {
132
+ name: "random_seed"
133
+ data_type: TYPE_UINT64
134
+ dims: [ 1 ]
135
+ optional: true
136
+ },
137
+ {
138
+ name: "return_log_probs"
139
+ data_type: TYPE_BOOL
140
+ dims: [ 1 ]
141
+ optional: true
142
+ },
143
+ {
144
+ name: "return_context_logits"
145
+ data_type: TYPE_BOOL
146
+ dims: [ 1 ]
147
+ optional: true
148
+ },
149
+ {
150
+ name: "return_generation_logits"
151
+ data_type: TYPE_BOOL
152
+ dims: [ 1 ]
153
+ optional: true
154
+ },
155
+ {
156
+ name: "return_kv_cache_reuse_stats"
157
+ data_type: TYPE_BOOL
158
+ dims: [ 1 ]
159
+ optional: true
160
+ },
161
+ {
162
+ name: "beam_width"
163
+ data_type: TYPE_INT32
164
+ dims: [ 1 ]
165
+ optional: true
166
+ },
167
+ {
168
+ name: "stream"
169
+ data_type: TYPE_BOOL
170
+ dims: [ 1 ]
171
+ optional: true
172
+ },
173
+ {
174
+ name: "prompt_embedding_table"
175
+ data_type: TYPE_FP16
176
+ dims: [ -1, -1 ]
177
+ optional: true
178
+ },
179
+ {
180
+ name: "prompt_table_extra_id"
181
+ data_type: TYPE_UINT64
182
+ dims: [ 1 ]
183
+ optional: true
184
+ },
185
+ {
186
+ name: "prompt_vocab_size"
187
+ data_type: TYPE_INT32
188
+ dims: [ 1 ]
189
+ optional: true
190
+ },
191
+ {
192
+ name: "embedding_bias_words"
193
+ data_type: TYPE_STRING
194
+ dims: [ -1 ]
195
+ optional: true
196
+ },
197
+ {
198
+ name: "embedding_bias_weights"
199
+ data_type: TYPE_FP32
200
+ dims: [ -1 ]
201
+ optional: true
202
+ },
203
+ # the unique task ID for the given LoRA.
204
+ # To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights` and `lora_config` must all be given.
205
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
206
+ # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
207
+ {
208
+ name: "lora_task_id"
209
+ data_type: TYPE_UINT64
210
+ dims: [ 1 ]
211
+ optional: true
212
+ },
213
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
214
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
215
+ # each of the in / out tensors are first flattened and then concatenated together in the format above.
216
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
217
+ {
218
+ name: "lora_weights"
219
+ data_type: TYPE_FP16
220
+ dims: [ -1, -1 ]
221
+ optional: true
222
+ allow_ragged_batch: true
223
+ },
224
+ # module identifier (same size as first dimension of lora_weights)
225
+ # See LoraModule::ModuleType for model id mapping
226
+ #
227
+ # "attn_qkv": 0 # compbined qkv adapter
228
+ # "attn_q": 1 # q adapter
229
+ # "attn_k": 2 # k adapter
230
+ # "attn_v": 3 # v adapter
231
+ # "attn_dense": 4 # adapter for the dense layer in attention
232
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
233
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
234
+ # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
235
+ #
236
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
237
+ {
238
+ name: "lora_config"
239
+ data_type: TYPE_INT32
240
+ dims: [ -1, 3 ]
241
+ optional: true
242
+ allow_ragged_batch: true
243
+ },
244
+ {
245
+ name: "guided_decoding_guide_type"
246
+ data_type: TYPE_STRING
247
+ dims: [ 1 ]
248
+ optional: true
249
+ allow_ragged_batch: true
250
+ },
251
+ {
252
+ name: "guided_decoding_guide"
253
+ data_type: TYPE_STRING
254
+ dims: [ 1 ]
255
+ optional: true
256
+ allow_ragged_batch: true
257
+ }
258
+ ]
259
+ output [
260
+ {
261
+ name: "text_output"
262
+ data_type: TYPE_STRING
263
+ dims: [ -1 ]
264
+ },
265
+ {
266
+ name: "cum_log_probs"
267
+ data_type: TYPE_FP32
268
+ dims: [ -1 ]
269
+ },
270
+ {
271
+ name: "output_log_probs"
272
+ data_type: TYPE_FP32
273
+ dims: [ -1, -1 ]
274
+ },
275
+ {
276
+ name: "context_logits"
277
+ data_type: TYPE_FP16
278
+ dims: [ -1, -1 ]
279
+ },
280
+ {
281
+ name: "generation_logits"
282
+ data_type: TYPE_FP16
283
+ dims: [ -1, -1, -1 ]
284
+ },
285
+ {
286
+ name: "batch_index"
287
+ data_type: TYPE_INT32
288
+ dims: [ 1 ]
289
+ },
290
+ {
291
+ name: "sequence_index"
292
+ data_type: TYPE_INT32
293
+ dims: [ 1 ]
294
+ },
295
+ {
296
+ name: "kv_cache_alloc_new_blocks"
297
+ data_type: TYPE_INT32
298
+ dims: [ 1 ]
299
+ },
300
+ {
301
+ name: "kv_cache_reused_blocks"
302
+ data_type: TYPE_INT32
303
+ dims: [ 1 ]
304
+ },
305
+ {
306
+ name: "kv_cache_alloc_total_blocks"
307
+ data_type: TYPE_INT32
308
+ dims: [ 1 ]
309
+ }
310
+ ]
311
+ ensemble_scheduling {
312
+ step [
313
+ {
314
+ model_name: "preprocessing"
315
+ model_version: -1
316
+ input_map {
317
+ key: "QUERY"
318
+ value: "text_input"
319
+ }
320
+ input_map {
321
+ key: "DECODER_QUERY"
322
+ value: "decoder_text_input"
323
+ }
324
+ input_map {
325
+ key: "REQUEST_OUTPUT_LEN"
326
+ value: "max_tokens"
327
+ }
328
+ input_map {
329
+ key: "BAD_WORDS_DICT"
330
+ value: "bad_words"
331
+ }
332
+ input_map {
333
+ key: "STOP_WORDS_DICT"
334
+ value: "stop_words"
335
+ }
336
+ input_map {
337
+ key: "EMBEDDING_BIAS_WORDS"
338
+ value: "embedding_bias_words"
339
+ }
340
+ input_map {
341
+ key: "EMBEDDING_BIAS_WEIGHTS"
342
+ value: "embedding_bias_weights"
343
+ }
344
+ input_map {
345
+ key: "END_ID"
346
+ value: "end_id"
347
+ }
348
+ input_map {
349
+ key: "PAD_ID"
350
+ value: "pad_id"
351
+ }
352
+ input_map {
353
+ key: "PROMPT_TABLE_EXTRA_ID"
354
+ value: "prompt_table_extra_id"
355
+ }
356
+ output_map {
357
+ key: "REQUEST_INPUT_LEN"
358
+ value: "_REQUEST_INPUT_LEN"
359
+ }
360
+ output_map {
361
+ key: "INPUT_ID"
362
+ value: "_INPUT_ID"
363
+ }
364
+ output_map {
365
+ key: "REQUEST_DECODER_INPUT_LEN"
366
+ value: "_REQUEST_DECODER_INPUT_LEN"
367
+ }
368
+ output_map {
369
+ key: "DECODER_INPUT_ID"
370
+ value: "_DECODER_INPUT_ID"
371
+ }
372
+ output_map {
373
+ key: "REQUEST_OUTPUT_LEN"
374
+ value: "_REQUEST_OUTPUT_LEN"
375
+ }
376
+ output_map {
377
+ key: "STOP_WORDS_IDS"
378
+ value: "_STOP_WORDS_IDS"
379
+ }
380
+ output_map {
381
+ key: "BAD_WORDS_IDS"
382
+ value: "_BAD_WORDS_IDS"
383
+ }
384
+ output_map {
385
+ key: "EMBEDDING_BIAS"
386
+ value: "_EMBEDDING_BIAS"
387
+ }
388
+ output_map {
389
+ key: "OUT_END_ID"
390
+ value: "_PREPROCESSOR_END_ID"
391
+ }
392
+ output_map {
393
+ key: "OUT_PAD_ID"
394
+ value: "_PREPROCESSOR_PAD_ID"
395
+ }
396
+ output_map {
397
+ key: "OUT_PROMPT_TABLE_EXTRA_IDS"
398
+ value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
399
+ }
400
+ },
401
+ {
402
+ model_name: "tensorrt_llm"
403
+ model_version: -1
404
+ input_map {
405
+ key: "input_ids"
406
+ value: "_INPUT_ID"
407
+ }
408
+ input_map {
409
+ key: "decoder_input_ids"
410
+ value: "_DECODER_INPUT_ID"
411
+ }
412
+ input_map {
413
+ key: "input_lengths"
414
+ value: "_REQUEST_INPUT_LEN"
415
+ }
416
+ input_map {
417
+ key: "decoder_input_lengths"
418
+ value: "_REQUEST_DECODER_INPUT_LEN"
419
+ }
420
+ input_map {
421
+ key: "exclude_input_in_output"
422
+ value: "exclude_input_in_output"
423
+ }
424
+ input_map {
425
+ key: "request_output_len"
426
+ value: "_REQUEST_OUTPUT_LEN"
427
+ }
428
+ input_map {
429
+ key: "end_id"
430
+ value: "_PREPROCESSOR_END_ID"
431
+ }
432
+ input_map {
433
+ key: "pad_id"
434
+ value: "_PREPROCESSOR_PAD_ID"
435
+ }
436
+ input_map {
437
+ key: "embedding_bias"
438
+ value: "_EMBEDDING_BIAS"
439
+ }
440
+ input_map {
441
+ key: "runtime_top_k"
442
+ value: "top_k"
443
+ }
444
+ input_map {
445
+ key: "runtime_top_p"
446
+ value: "top_p"
447
+ }
448
+ input_map {
449
+ key: "temperature"
450
+ value: "temperature"
451
+ }
452
+ input_map {
453
+ key: "len_penalty"
454
+ value: "length_penalty"
455
+ }
456
+ input_map {
457
+ key: "repetition_penalty"
458
+ value: "repetition_penalty"
459
+ }
460
+ input_map {
461
+ key: "min_length"
462
+ value: "min_length"
463
+ }
464
+ input_map {
465
+ key: "presence_penalty"
466
+ value: "presence_penalty"
467
+ }
468
+ input_map {
469
+ key: "frequency_penalty"
470
+ value: "frequency_penalty"
471
+ }
472
+ input_map {
473
+ key: "random_seed"
474
+ value: "random_seed"
475
+ }
476
+ input_map {
477
+ key: "return_log_probs"
478
+ value: "return_log_probs"
479
+ }
480
+ input_map {
481
+ key: "return_context_logits"
482
+ value: "return_context_logits"
483
+ }
484
+ input_map {
485
+ key: "return_generation_logits"
486
+ value: "return_generation_logits"
487
+ }
488
+ input_map {
489
+ key: "return_kv_cache_reuse_stats"
490
+ value: "return_kv_cache_reuse_stats"
491
+ }
492
+ input_map {
493
+ key: "num_return_sequences"
494
+ value: "num_return_sequences"
495
+ }
496
+ input_map {
497
+ key: "beam_width"
498
+ value: "beam_width"
499
+ }
500
+ input_map {
501
+ key: "streaming"
502
+ value: "stream"
503
+ }
504
+ input_map {
505
+ key: "prompt_embedding_table"
506
+ value: "prompt_embedding_table"
507
+ }
508
+ input_map {
509
+ key: "prompt_vocab_size"
510
+ value: "prompt_vocab_size"
511
+ }
512
+ input_map {
513
+ key: "stop_words_list"
514
+ value: "_STOP_WORDS_IDS"
515
+ }
516
+ input_map {
517
+ key: "bad_words_list"
518
+ value: "_BAD_WORDS_IDS"
519
+ }
520
+ input_map {
521
+ key: "prompt_table_extra_ids"
522
+ value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
523
+ },
524
+ input_map {
525
+ key: "lora_task_id",
526
+ value: "lora_task_id"
527
+ },
528
+ input_map {
529
+ key: "lora_weights",
530
+ value: "lora_weights"
531
+ },
532
+ input_map {
533
+ key: "lora_config",
534
+ value: "lora_config"
535
+ },
536
+ input_map {
537
+ key: "guided_decoding_guide_type",
538
+ value: "guided_decoding_guide_type"
539
+ },
540
+ input_map {
541
+ key: "guided_decoding_guide",
542
+ value: "guided_decoding_guide"
543
+ }
544
+ output_map {
545
+ key: "output_ids"
546
+ value: "_TOKENS_BATCH"
547
+ }
548
+ output_map {
549
+ key: "sequence_length"
550
+ value: "_SEQUENCE_LENGTH"
551
+ },
552
+ output_map {
553
+ key: "cum_log_probs"
554
+ value: "cum_log_probs"
555
+ }
556
+ output_map {
557
+ key: "output_log_probs"
558
+ value: "output_log_probs"
559
+ },
560
+ output_map {
561
+ key: "context_logits"
562
+ value: "context_logits"
563
+ },
564
+ output_map {
565
+ key: "generation_logits"
566
+ value: "generation_logits"
567
+ },
568
+ output_map {
569
+ key: "batch_index"
570
+ value: "batch_index"
571
+ },
572
+ output_map {
573
+ key: "sequence_index"
574
+ value: "sequence_index"
575
+ },
576
+ output_map {
577
+ key: "kv_cache_alloc_new_blocks"
578
+ value: "kv_cache_alloc_new_blocks"
579
+ },
580
+ output_map {
581
+ key: "kv_cache_reused_blocks"
582
+ value: "kv_cache_reused_blocks"
583
+ },
584
+ output_map {
585
+ key: "kv_cache_alloc_total_blocks"
586
+ value: "kv_cache_alloc_total_blocks"
587
+ }
588
+ },
589
+ {
590
+ model_name: "postprocessing"
591
+ model_version: -1
592
+ input_map {
593
+ key: "TOKENS_BATCH"
594
+ value: "_TOKENS_BATCH"
595
+ }
596
+ input_map {
597
+ key: "SEQUENCE_LENGTH"
598
+ value: "_SEQUENCE_LENGTH"
599
+ }
600
+ output_map {
601
+ key: "OUTPUT"
602
+ value: "text_output"
603
+ }
604
+ }
605
+ ]
606
+ }
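The ensemble config above is the public inference API of this repository (text_input and max_tokens in, text_output out). As a rough illustration only — the server URL, prompt, and token budget below are placeholders and the Triton HTTP client is an assumed choice, not part of this commit; only the model name "ensemble" and the tensor names come from the config — a request against it could look like:

# Hypothetical client sketch (not part of this commit).
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# max_batch_size is 32 and both inputs have dims [ 1 ], so a single request is shape [1, 1].
text = np.array([["Hello, world"]], dtype=object)
max_tokens = np.array([[64]], dtype=np.int32)

inputs = [
    httpclient.InferInput("text_input", list(text.shape), "BYTES"),
    httpclient.InferInput("max_tokens", list(max_tokens.shape), "INT32"),
]
inputs[0].set_data_from_numpy(text)
inputs[1].set_data_from_numpy(max_tokens)

result = client.infer(
    model_name="ensemble",
    inputs=inputs,
    outputs=[httpclient.InferRequestedOutput("text_output")],
)
print(result.as_numpy("text_output"))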
postprocessing/1/__pycache__/model.cpython-312.pyc ADDED
Binary file (6.52 kB)
 
postprocessing/1/model.py ADDED
@@ -0,0 +1,177 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+
29
+ import numpy as np
30
+ import triton_python_backend_utils as pb_utils
31
+ from transformers import AutoTokenizer
32
+
33
+
34
+ class TritonPythonModel:
35
+ """Your Python model must use the same class name. Every Python model
36
+ that is created must have "TritonPythonModel" as the class name.
37
+ """
38
+
39
+ def initialize(self, args):
40
+ """`initialize` is called only once when the model is being loaded.
41
+ Implementing `initialize` function is optional. This function allows
42
+ the model to initialize any state associated with this model.
43
+ Parameters
44
+ ----------
45
+ args : dict
46
+ Both keys and values are strings. The dictionary keys and values are:
47
+ * model_config: A JSON string containing the model configuration
48
+ * model_instance_kind: A string containing model instance kind
49
+ * model_instance_device_id: A string containing model instance device ID
50
+ * model_repository: Model repository path
51
+ * model_version: Model version
52
+ * model_name: Model name
53
+ """
54
+ # Parse model configs
55
+ model_config = json.loads(args['model_config'])
56
+ tokenizer_dir = model_config['parameters']['tokenizer_dir'][
57
+ 'string_value']
58
+
59
+ skip_special_tokens = model_config['parameters'].get(
60
+ 'skip_special_tokens')
61
+ if skip_special_tokens is not None:
62
+ skip_special_tokens_str = skip_special_tokens[
63
+ 'string_value'].lower()
64
+ if skip_special_tokens_str in [
65
+ 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
66
+ ]:
67
+ self.skip_special_tokens = skip_special_tokens_str in [
68
+ 'true', '1', 't', 'y', 'yes'
69
+ ]
70
+ else:
71
+ print(
72
+ f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default."
73
+ )
74
+ self.skip_special_tokens = True
75
+ else:
76
+ print(
77
+ f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
78
+ )
79
+ self.skip_special_tokens = True
80
+
81
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
82
+ legacy=False,
83
+ padding_side='left',
84
+ trust_remote_code=True)
85
+ if not self.tokenizer.pad_token:
86
+ self.tokenizer.pad_token = self.tokenizer.eos_token
87
+
88
+ # Parse model output configs
89
+ output_config = pb_utils.get_output_config_by_name(
90
+ model_config, "OUTPUT")
91
+
92
+ # Convert Triton types to numpy types
93
+ self.output_dtype = pb_utils.triton_string_to_numpy(
94
+ output_config['data_type'])
95
+
96
+ def execute(self, requests):
97
+ """`execute` must be implemented in every Python model. `execute`
98
+ function receives a list of pb_utils.InferenceRequest as the only
99
+ argument. This function is called when an inference is requested
100
+ for this model. Depending on the batching configuration (e.g. Dynamic
101
+ Batching) used, `requests` may contain multiple requests. Every
102
+ Python model must create one pb_utils.InferenceResponse for every
103
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
104
+ set the error argument when creating a pb_utils.InferenceResponse.
105
+ Parameters
106
+ ----------
107
+ requests : list
108
+ A list of pb_utils.InferenceRequest
109
+ Returns
110
+ -------
111
+ list
112
+ A list of pb_utils.InferenceResponse. The length of this list must
113
+ be the same as `requests`
114
+ """
115
+
116
+ tokens_batch = []
117
+ sequence_lengths = []
118
+ for idx, request in enumerate(requests):
119
+ for input_tensor in request.inputs():
120
+ if input_tensor.name() == "TOKENS_BATCH":
121
+ tokens_batch.append(input_tensor.as_numpy())
122
+ elif input_tensor.name() == "SEQUENCE_LENGTH":
123
+ sequence_lengths.append(input_tensor.as_numpy())
124
+ else:
125
+ raise ValueError(f"unknown input {input_tensor.name}")
126
+
127
+ # batch decode
128
+ list_of_tokens = []
129
+ req_idx_offset = 0
130
+ req_idx_offsets = [req_idx_offset]
131
+ for idx, token_batch in enumerate(tokens_batch):
132
+ for batch_idx, beam_tokens in enumerate(token_batch):
133
+ for beam_idx, tokens in enumerate(beam_tokens):
134
+ seq_len = sequence_lengths[idx][batch_idx][beam_idx]
135
+ list_of_tokens.append(tokens[:seq_len])
136
+ req_idx_offset += 1
137
+
138
+ req_idx_offsets.append(req_idx_offset)
139
+
140
+ all_outputs = self.tokenizer.batch_decode(
141
+ list_of_tokens, skip_special_tokens=self.skip_special_tokens)
142
+
143
+ # construct responses
144
+ responses = []
145
+ for idx, request in enumerate(requests):
146
+ req_outputs = [
147
+ x.encode('utf8')
148
+ for x in all_outputs[req_idx_offsets[idx]:req_idx_offsets[idx +
149
+ 1]]
150
+ ]
151
+
152
+ output_tensor = pb_utils.Tensor(
153
+ 'OUTPUT',
154
+ np.array(req_outputs).astype(self.output_dtype))
155
+
156
+ outputs = [output_tensor]
157
+
158
+ # Create InferenceResponse. You can set an error here in case
159
+ # there was a problem with handling this inference request.
160
+ # Below is an example of how you can set errors in inference
161
+ # response:
162
+ #
163
+ # pb_utils.InferenceResponse(
164
+ # output_tensors=..., TritonError("An error occurred"))
165
+ inference_response = pb_utils.InferenceResponse(
166
+ output_tensors=outputs)
167
+ responses.append(inference_response)
168
+ # You should return a list of pb_utils.InferenceResponse. Length
169
+ # of this list must match the length of `requests` list.
170
+ return responses
171
+
172
+ def finalize(self):
173
+ """`finalize` is called only once when the model is being unloaded.
174
+ Implementing `finalize` function is optional. This function allows
175
+ the model to perform any necessary clean ups before exit.
176
+ """
177
+ print('Cleaning up...')
postprocessing/config.pbtxt ADDED
@@ -0,0 +1,70 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "postprocessing"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+ dynamic_batching {}
31
+ input [
32
+ {
33
+ name: "TOKENS_BATCH"
34
+ data_type: TYPE_INT32
35
+ dims: [ -1, -1 ]
36
+ },
37
+ {
38
+ name: "SEQUENCE_LENGTH"
39
+ data_type: TYPE_INT32
40
+ dims: [ -1 ]
41
+ }
42
+ ]
43
+ output [
44
+ {
45
+ name: "OUTPUT"
46
+ data_type: TYPE_STRING
47
+ dims: [ -1 ]
48
+ }
49
+ ]
50
+
51
+ parameters {
52
+ key: "tokenizer_dir"
53
+ value: {
54
+ string_value: "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
55
+ }
56
+ }
57
+
58
+ parameters {
59
+ key: "skip_special_tokens"
60
+ value: {
61
+ string_value: "True"
62
+ }
63
+ }
64
+
65
+ instance_group [
66
+ {
67
+ count: 1
68
+ kind: KIND_CPU
69
+ }
70
+ ]
preprocessing/1/__pycache__/model.cpython-312.pyc ADDED
Binary file (39.3 kB)
 
preprocessing/1/model.py ADDED
@@ -0,0 +1,908 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import base64
28
+ import io
29
+ import json
30
+ import os
31
+ from typing import List
32
+
33
+ import numpy as np
34
+ import requests
35
+ import triton_python_backend_utils as pb_utils
36
+ from PIL import Image
37
+ from transformers import AutoProcessor, AutoTokenizer, T5Tokenizer
38
+
39
+
40
+ class TritonPythonModel:
41
+ """Your Python model must use the same class name. Every Python model
42
+ that is created must have "TritonPythonModel" as the class name.
43
+ """
44
+
45
+ def initialize(self, args):
46
+ """`initialize` is called only once when the model is being loaded.
47
+ Implementing `initialize` function is optional. This function allows
48
+ the model to initialize any state associated with this model.
49
+ Parameters
50
+ ----------
51
+ args : dict
52
+ Both keys and values are strings. The dictionary keys and values are:
53
+ * model_config: A JSON string containing the model configuration
54
+ * model_instance_kind: A string containing model instance kind
55
+ * model_instance_device_id: A string containing model instance device ID
56
+ * model_repository: Model repository path
57
+ * model_version: Model version
58
+ * model_name: Model name
59
+ """
60
+ # Parse model configs
61
+ model_config = json.loads(args['model_config'])
62
+ tokenizer_dir = model_config['parameters']['tokenizer_dir'][
63
+ 'string_value']
64
+
65
+ add_special_tokens = model_config['parameters'].get(
66
+ 'add_special_tokens')
67
+ visual_model_path = model_config['parameters']['visual_model_path'][
68
+ 'string_value']
69
+ max_num_images = model_config['parameters'].get('max_num_images')
70
+
71
+ if max_num_images is not None:
72
+ max_num_images_str = max_num_images['string_value']
73
+ if max_num_images_str.isdigit():
74
+ self.max_num_images = int(max_num_images_str)
75
+ else:
76
+ print(
77
+ f"[TensorRT-LLM][WARNING] 'max_num_images' parameter is not set correctly (value is {max_num_images_str}). Will be set to None"
78
+ )
79
+ self.max_num_images = None
80
+ else:
81
+ print(
82
+ f"[TensorRT-LLM][WARNING] Don't setup 'max_num_images'. Set it as None by default."
83
+ )
84
+ self.max_num_images = None
85
+ if visual_model_path == "${visual_model_path}" or visual_model_path == "":
86
+ visual_model_path = None
87
+
88
+ if add_special_tokens is not None:
89
+ add_special_tokens_str = add_special_tokens['string_value'].lower()
90
+ if add_special_tokens_str in [
91
+ 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
92
+ ]:
93
+ self.add_special_tokens = add_special_tokens_str in [
94
+ 'true', '1', 't', 'y', 'yes'
95
+ ]
96
+ else:
97
+ print(
98
+ f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default."
99
+ )
100
+ self.add_special_tokens = True
101
+ else:
102
+ print(
103
+ f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default."
104
+ )
105
+ self.add_special_tokens = True
106
+
107
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
108
+ legacy=False,
109
+ padding_side='left',
110
+ trust_remote_code=True)
111
+
112
+ if isinstance(self.tokenizer, T5Tokenizer):
113
+ self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
114
+
115
+ if not self.tokenizer.pad_token:
116
+ self.tokenizer.pad_token = self.tokenizer.eos_token
117
+
118
+ self.tokenizer_end_id = self.tokenizer.encode(
119
+ self.tokenizer.eos_token, add_special_tokens=False)[0]
120
+ self.tokenizer_pad_id = self.tokenizer.encode(
121
+ self.tokenizer.pad_token, add_special_tokens=False)[0]
122
+ self.vocab_size = self.tokenizer.vocab_size
123
+
124
+ self.is_multimodal = False
125
+ self.model_type = None
126
+ self.vision_preprocessor = None
127
+
128
+ if visual_model_path is not None:
129
+ self.is_multimodal = True
130
+ visual_model_path = os.path.join(visual_model_path, 'config.json')
131
+ with open(visual_model_path, 'r') as f:
132
+ visual_model_config = json.load(f)
133
+ self.model_type = visual_model_config['builder_config'][
134
+ 'model_type']
135
+
136
+ assert self.model_type in [
137
+ 'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision'
138
+ ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila, mllama and llava_onevision. Got {self.model_type}."
139
+
140
+ assert self.model_type != 'llava_onevision' or self.max_num_images is None or self.max_num_images <= 1, "LLaVA-OneVision does not currently support multi-image inference."
141
+
142
+ llm_model_path = model_config['parameters']['gpt_model_path'][
143
+ 'string_value']
144
+ llm_model_path = os.path.join(llm_model_path, 'config.json')
145
+ with open(llm_model_path, 'r') as f:
146
+ llm_model_config = json.load(f)
147
+ self.vocab_size = int(
148
+ llm_model_config["pretrained_config"]["vocab_size"])
149
+ self._setup_ptable_shape(llm_model_config)
150
+
151
+ if self.model_type == 'mllama' or self.model_type == 'llava_onevision':
152
+ self.vision_preprocessor = VisionPreProcessor(
153
+ self.model_type,
154
+ AutoProcessor.from_pretrained(tokenizer_dir), model_config)
155
+
156
+ # Parse model output configs and convert Triton types to numpy types
157
+ output_names = [
158
+ "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
159
+ "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
160
+ "OUT_END_ID", "OUT_PAD_ID", "OUT_PROMPT_TABLE_EXTRA_IDS",
161
+ "PIXEL_VALUES", "IMAGE_SIZES"
162
+ ]
163
+ input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
164
+ for input_name in input_names:
165
+ setattr(
166
+ self,
167
+ input_name.lower() + "_dtype",
168
+ pb_utils.triton_string_to_numpy(
169
+ pb_utils.get_input_config_by_name(
170
+ model_config, input_name)['data_type']))
171
+
172
+ for output_name in output_names:
173
+ setattr(
174
+ self,
175
+ output_name.lower() + "_dtype",
176
+ pb_utils.triton_string_to_numpy(
177
+ pb_utils.get_output_config_by_name(
178
+ model_config, output_name)['data_type']))
179
+
180
+ def _setup_ptable_shape(self, llm_model_config):
181
+ max_prompt_embedding_table_size = llm_model_config['build_config'][
182
+ 'max_prompt_embedding_table_size']
183
+ max_batch_size = llm_model_config['build_config']['max_batch_size']
184
+
185
+ num_visual_features = max_prompt_embedding_table_size // max_batch_size
186
+ hidden_size = llm_model_config['pretrained_config']['hidden_size']
187
+ if self.max_num_images is not None:
188
+ num_visual_features = num_visual_features // self.max_num_images
189
+
190
+ self.ptable_shape = (-1, num_visual_features, hidden_size)
191
+
192
+ def execute(self, requests):
193
+ """`execute` must be implemented in every Python model. `execute`
194
+ function receives a list of pb_utils.InferenceRequest as the only
195
+ argument. This function is called when an inference is requested
196
+ for this model. Depending on the batching configuration (e.g. Dynamic
197
+ Batching) used, `requests` may contain multiple requests. Every
198
+ Python model must create one pb_utils.InferenceResponse for every
199
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
200
+ set the error argument when creating a pb_utils.InferenceResponse.
201
+ Parameters
202
+ ----------
203
+ requests : list
204
+ A list of pb_utils.InferenceRequest
205
+ Returns
206
+ -------
207
+ list
208
+ A list of pb_utils.InferenceResponse. The length of this list must
209
+ be the same as `requests`
210
+ """
211
+
212
+ responses = []
213
+
214
+ # Every Python backend must iterate over every one of the requests
215
+ # and create a pb_utils.InferenceResponse for each of them.
216
+ for idx, request in enumerate(requests):
217
+ # Get input tensors
218
+ query = pb_utils.get_input_tensor_by_name(request,
219
+ 'QUERY').as_numpy()
220
+ batch_size = query.shape[0]
221
+
222
+ decoder_query = pb_utils.get_input_tensor_by_name(
223
+ request, 'DECODER_QUERY')
224
+ if decoder_query is not None:
225
+ decoder_query = decoder_query.as_numpy()
226
+
227
+ request_output_len = pb_utils.get_input_tensor_by_name(
228
+ request, 'REQUEST_OUTPUT_LEN').as_numpy()
229
+
230
+ bad_words_dict = pb_utils.get_input_tensor_by_name(
231
+ request, 'BAD_WORDS_DICT')
232
+ if bad_words_dict is not None:
233
+ bad_words_dict = bad_words_dict.as_numpy()
234
+
235
+ stop_words_dict = pb_utils.get_input_tensor_by_name(
236
+ request, 'STOP_WORDS_DICT')
237
+ if stop_words_dict is not None:
238
+ stop_words_dict = stop_words_dict.as_numpy()
239
+
240
+ embedding_bias_words = pb_utils.get_input_tensor_by_name(
241
+ request, 'EMBEDDING_BIAS_WORDS')
242
+ if embedding_bias_words is not None:
243
+ embedding_bias_words = embedding_bias_words.as_numpy()
244
+
245
+ embedding_bias_weights = pb_utils.get_input_tensor_by_name(
246
+ request, 'EMBEDDING_BIAS_WEIGHTS')
247
+ if embedding_bias_weights is not None:
248
+ embedding_bias_weights = embedding_bias_weights.as_numpy()
249
+
250
+ # Take the end_id from the input tensors
251
+ # If not specified, use tokenizer to get end_id
252
+ end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID')
253
+ if end_id is not None:
254
+ end_id = end_id.as_numpy()
255
+ else:
256
+ end_id = [[self.tokenizer_end_id]] * batch_size
257
+
258
+ # Take the pad_id from the input tensors
259
+ # If not specified, use tokenizer to get pad_id
260
+ pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID')
261
+ if pad_id is not None:
262
+ pad_id = pad_id.as_numpy()
263
+ else:
264
+ pad_id = [[self.tokenizer_pad_id]] * batch_size
265
+
266
+ # Take the extra_id from the input tensors
267
+ # Extra id is used in kv cache reuse for p-tuning
268
+ prompt_table_extra_id = pb_utils.get_input_tensor_by_name(
269
+ request, 'PROMPT_TABLE_EXTRA_ID')
270
+ if prompt_table_extra_id is not None:
271
+ prompt_table_extra_id = prompt_table_extra_id.as_numpy()
272
+ assert prompt_table_extra_id.shape[
273
+ 0] == batch_size, "Prompt table extra id must have the same batch size as Query"
274
+ assert prompt_table_extra_id.shape[
275
+ 1] == 1, "Multiple IDs cannot be provided for a single image"
276
+
277
+ # Preprocessing vision input passed as a url or bytes tensor
278
+ img_urls = pb_utils.get_input_tensor_by_name(request, 'IMAGE_URL')
279
+ image_bytes = pb_utils.get_input_tensor_by_name(
280
+ request, 'IMAGE_BYTES')
281
+ video_bytes = pb_utils.get_input_tensor_by_name(
282
+ request, 'VIDEO_BYTES')
283
+ vision_processed_tensors = []
284
+ visual_tokens = []
285
+ if self.is_multimodal and (img_urls or image_bytes or video_bytes):
286
+ assert self.vision_preprocessor != None, "Vision preprocessor for preparing images before encoding is None"
287
+ processed_tensors = {}
288
+ if self.model_type == 'mllama':
289
+ processed_tensors = self.vision_preprocessor.mllama_process(
290
+ queries=query.astype(str).tolist(),
291
+ img_urls=img_urls,
292
+ image_bytes=image_bytes,
293
+ )
294
+ elif self.model_type == 'llava_onevision':
295
+ if video_bytes is None:
296
+ processed_tensors, visual_tokens = self.vision_preprocessor.llava_onevision_process_image(
297
+ queries=query.astype(str).tolist(),
298
+ img_urls=img_urls,
299
+ image_bytes=image_bytes,
300
+ )
301
+ else:
302
+ processed_tensors, visual_tokens = self.vision_preprocessor.llava_onevision_process_video(
303
+ queries=query.astype(str).tolist(),
304
+ video_bytes=video_bytes,
305
+ )
306
+ else:
307
+ raise ValueError(
308
+ "Unsupported model type for IMAGE_BYTES or IMAGE_URL inputs"
309
+ )
310
+ vision_processed_tensors = [
311
+ pb_utils.Tensor.from_dlpack(k, v)
312
+ for k, v in processed_tensors.items()
313
+ ]
314
+ else:
315
+ assert self.model_type != "llava_onevision", "Image processing requires IMAGE_BYTES or IMAGE_URL to be provided"
316
+
317
+ # Preprocessing input data.
318
+ # For the LLaVA_OneVision model, num_visual_features is not a fixed value
319
+ input_id, request_input_len = self._create_request(
320
+ query, visual_tokens)
321
+ if decoder_query is not None:
322
+ decoder_input_id, request_decoder_input_len = self._create_request(
323
+ decoder_query)
324
+ else:
325
+ decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32)
326
+ request_decoder_input_len = 1 * np.ones(
327
+ (batch_size, 1), np.int32)
328
+
329
+ bad_words = self._to_word_list_format(bad_words_dict, batch_size)
330
+ stop_words = self._to_word_list_format(stop_words_dict, batch_size)
331
+
332
+ embedding_bias = self._get_embedding_bias(
333
+ embedding_bias_words, embedding_bias_weights,
334
+ self.embedding_bias_weights_dtype, batch_size)
335
+
336
+ if prompt_table_extra_id is not None:
337
+ prompt_table_extra_ids = np.zeros_like(input_id)
338
+ for i in range(batch_size):
339
+ prompt_table_extra_ids[i] = np.where(
340
+ input_id[i] >= self.vocab_size,
341
+ prompt_table_extra_id[i], 0)
342
+
343
+ # Create output tensors. You need pb_utils.Tensor
344
+ # objects to create pb_utils.InferenceResponse.
345
+ input_id_tensor = pb_utils.Tensor(
346
+ 'INPUT_ID', input_id.astype(self.input_id_dtype))
347
+ request_input_len_tensor = pb_utils.Tensor(
348
+ 'REQUEST_INPUT_LEN',
349
+ request_input_len.astype(self.request_input_len_dtype))
350
+ decoder_input_id_tensor = pb_utils.Tensor(
351
+ 'DECODER_INPUT_ID',
352
+ decoder_input_id.astype(self.decoder_input_id_dtype))
353
+ request_decoder_input_len_tensor = pb_utils.Tensor(
354
+ 'REQUEST_DECODER_INPUT_LEN',
355
+ request_decoder_input_len.astype(
356
+ self.request_decoder_input_len_dtype))
357
+ request_output_len_tensor = pb_utils.Tensor(
358
+ 'REQUEST_OUTPUT_LEN', request_output_len)
359
+ bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
360
+ stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
361
+ stop_words)
362
+ embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
363
+ embedding_bias)
364
+ end_id_tensor = pb_utils.Tensor('OUT_END_ID',
365
+ np.array(end_id, dtype=np.int32))
366
+ pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
367
+ np.array(pad_id, dtype=np.int32))
368
+
369
+ if prompt_table_extra_id is not None:
370
+ prompt_table_extra_ids_tensor = pb_utils.Tensor(
371
+ 'OUT_PROMPT_TABLE_EXTRA_IDS',
372
+ np.array(prompt_table_extra_ids,
373
+ dtype=self.out_prompt_table_extra_ids_dtype))
374
+ inference_response = pb_utils.InferenceResponse(output_tensors=[
375
+ input_id_tensor, decoder_input_id_tensor,
376
+ bad_words_ids_tensor, stop_words_ids_tensor,
377
+ request_input_len_tensor, request_decoder_input_len_tensor,
378
+ request_output_len_tensor, embedding_bias_tensor,
379
+ end_id_tensor, pad_id_tensor, prompt_table_extra_ids_tensor
380
+ ] + vision_processed_tensors)
381
+ else:
382
+ inference_response = pb_utils.InferenceResponse(
383
+ output_tensors=[
384
+ input_id_tensor, decoder_input_id_tensor,
385
+ bad_words_ids_tensor, stop_words_ids_tensor,
386
+ request_input_len_tensor,
387
+ request_decoder_input_len_tensor,
388
+ request_output_len_tensor, embedding_bias_tensor,
389
+ end_id_tensor, pad_id_tensor
390
+ ] + vision_processed_tensors)
391
+ responses.append(inference_response)
392
+
393
+ # You should return a list of pb_utils.InferenceResponse. Length
394
+ # of this list must match the length of `requests` list.
395
+ return responses
396
+
397
+ def finalize(self):
398
+ """`finalize` is called only once when the model is being unloaded.
399
+ Implementing `finalize` function is optional. This function allows
400
+ the model to perform any necessary clean ups before exit.
401
+ """
402
+ print('Cleaning up...')
403
+
404
+ def _split_prompt_by_images(self,
405
+ concatenated_ids,
406
+ image_token_index=-200):
407
+ """
408
+ Splits tokenized prompts by image placeholders for each sample in the batch.
409
+
410
+ Args:
411
+ concatenated_ids (np.ndarray): A batch of concatenated token IDs, where image placeholders are indicated by `image_token_index`.
412
+
413
+ Returns:
414
+ List[List[np.ndarray]]: A list containing lists of token ID arrays for each prompt segment, per batch sample.
415
+ """
416
+ batch_splits = []
417
+ for batch in concatenated_ids:
418
+ zero_indices = np.where(batch == image_token_index)[0]
419
+ start_idx = 0
420
+ splits = []
421
+ for idx in zero_indices:
422
+ if start_idx != idx:
423
+ splits.append(batch[start_idx:idx].reshape(1, -1))
424
+ start_idx = idx + 1
425
+ if start_idx < len(batch):
426
+ splits.append(batch[start_idx:].reshape(1, -1))
427
+
428
+ splits = [split for split in splits if split.size > 0]
429
+ batch_splits.append(splits)
430
+
431
+ return batch_splits
432
+
433
+ def _setup_fake_prompts(self, batch_size, batch_split_prompts):
434
+ """
435
+ Replaces image placeholders with unique fake prompt IDs for multi-image inputs.
436
+
437
+ Args:
438
+ batch_size (int): The number of samples in the batch.
439
+ batch_split_prompts (List[List[np.ndarray]]): Tokenized prompt segments for each batch sample.
440
+
441
+ Returns:
442
+ np.ndarray: An array of input IDs with image placeholders replaced by fake prompt IDs.
443
+ """
444
+
445
+ num_visual_features = self.ptable_shape[1]
446
+ input_ids_list = []
447
+
448
+ for batch_idx in range(batch_size):
449
+ splits = batch_split_prompts[batch_idx]
450
+ sample_input_ids = [splits[0]]
451
+ sample_fake_prompt_counter = self.vocab_size
452
+
453
+ for split_idx in range(len(splits) - 1):
454
+ fake_prompt_id = np.arange(
455
+ sample_fake_prompt_counter,
456
+ sample_fake_prompt_counter + num_visual_features)
457
+ sample_fake_prompt_counter += num_visual_features
458
+ fake_prompt_id = np.expand_dims(fake_prompt_id, axis=0)
459
+ sample_input_ids.append(fake_prompt_id)
460
+ sample_input_ids.append(splits[split_idx + 1])
461
+
462
+ sample_input_ids = np.concatenate(sample_input_ids, axis=1)
463
+ input_ids_list.append(sample_input_ids)
464
+
465
+ # Pad the input_ids to the same length for bs > 1
466
+ max_seq_len = max(
467
+ [sample_input_ids.shape[1] for sample_input_ids in input_ids_list])
468
+ input_ids_padded = []
469
+ for sample_input_ids in input_ids_list:
470
+ seq_len = sample_input_ids.shape[1]
471
+ pad_width = max_seq_len - seq_len
472
+ if pad_width > 0:
473
+ sample_input_ids_padded = np.pad(
474
+ sample_input_ids, ((0, 0), (0, pad_width)),
475
+ 'constant',
476
+ constant_values=self.tokenizer_pad_id)
477
+ else:
478
+ sample_input_ids_padded = sample_input_ids
479
+ input_ids_padded.append(sample_input_ids_padded)
480
+
481
+ input_ids = np.stack(input_ids_padded)
482
+ input_ids = input_ids.reshape(batch_size, -1).astype(np.int32)
483
+
484
+ return input_ids
485
+
486
+ def _process_multi_image_inputs(self, query, image_token_index=-200):
487
+ """
488
+ Processes input queries that contain multiple images by tokenizing the input strings and inserting image_token_index between the parts.
489
+
490
+ Args:
491
+ query (np.ndarray): Batch of input strings.
492
+
493
+ Returns:
494
+ List[np.ndarray]: List of tokenized input IDs for each sample.
495
+ """
496
+ start_ids = []
497
+ for s in query:
498
+ parts = s[0].decode().split('<image>')
499
+ num_images = len(parts) - 1
500
+ if num_images > self.max_num_images:
501
+ raise ValueError(
502
+ f"The number of images in the request ({num_images}) exceeds the maximum allowed ({self.max_num_images})."
503
+ )
504
+ tokenized_parts = [
505
+ self.tokenizer.encode(part, add_special_tokens=False)
506
+ for part in parts
507
+ ]
508
+
509
+ # Insert `image_token_index` between the parts to represent <image>
510
+ final_ids = []
511
+ for i, part in enumerate(tokenized_parts):
512
+ final_ids.extend(part)
513
+ if i < len(tokenized_parts) - 1:
514
+ final_ids.append(image_token_index)
515
+
516
+ start_ids.append(np.array(final_ids).astype(int))
517
+
518
+ return start_ids
519
+
520
+ def _create_request(self, query, visual_tokens=None):
521
+ """
522
+ query : batch string (2D numpy array)
523
+ """
524
+ if isinstance(self.tokenizer, T5Tokenizer):
525
+ start_ids = [
526
+ np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
527
+ s[0].decode(), add_special_tokens=self.add_special_tokens)
528
+ ).astype(int) for s in query
529
+ ]
530
+ else:
531
+ if self.is_multimodal and self.max_num_images and self.max_num_images > 1:
532
+ start_ids = self._process_multi_image_inputs(query)
533
+
534
+ else:
535
+ start_ids = [
536
+ np.array(
537
+ self.tokenizer.encode(s[0].decode(),
538
+ add_special_tokens=self.
539
+ add_special_tokens)).astype(int)
540
+ for s in query
541
+ ]
542
+
543
+ if self.is_multimodal:
544
+ if 'blip2' in self.model_type or 'mllama' == self.model_type:
545
+ pre_prompt = None
546
+ post_prompt = None
547
+ elif 'llava' == self.model_type:
548
+ pre_prompt = "USER:\n"
549
+ post_prompt = " ASSISTANT:"
550
+ elif 'vila' == self.model_type:
551
+ pre_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: "
552
+ post_prompt = " ASSISTANT:"
553
+ elif 'llava_onevision' == self.model_type:
554
+ pre_prompt = "<|im_start|>user "
555
+ post_prompt = "<|im_end|><|im_start|>assistant\n"
556
+
557
+ pre_prompt_id = np.array(
558
+ self.tokenizer.encode(
559
+ pre_prompt,
560
+ add_special_tokens=self.add_special_tokens,
561
+ padding=True)) if pre_prompt is not None else np.array(
562
+ [], dtype=int)
563
+
564
+ post_prompt_id = np.array(
565
+ self.tokenizer.encode(
566
+ post_prompt,
567
+ add_special_tokens=self.add_special_tokens,
568
+ padding=True)) if post_prompt is not None else np.array(
569
+ [], dtype=int)
570
+
571
+ if self.max_num_images and self.max_num_images > 1:
572
+ concatenated_ids = [
573
+ np.concatenate((pre_prompt_id, ids, post_prompt_id),
574
+ axis=0) for ids in start_ids
575
+ ]
576
+ batch_split_prompts = self._split_prompt_by_images(
577
+ concatenated_ids)
578
+ start_ids = self._setup_fake_prompts(query.shape[0],
579
+ batch_split_prompts)
580
+ elif self.model_type == 'llava_onevision':
581
+ fake_prompt_ids = []
582
+ extra_id = np.array(
583
+ self.tokenizer.encode(
584
+ '\n',
585
+ add_special_tokens=self.add_special_tokens,
586
+ padding=True))
587
+ for tokens in visual_tokens:
588
+ prompt_id = np.arange(self.vocab_size,
589
+ self.vocab_size + tokens)
590
+ fake_prompt_ids.append(prompt_id)
591
+ start_ids = [
592
+ np.concatenate((pre_prompt_id, prompt_id, extra_id, ids,
593
+ post_prompt_id),
594
+ axis=0)
595
+ for prompt_id, ids in zip(fake_prompt_ids, start_ids)
596
+ ]
597
+ else:
598
+ fake_prompt_id = np.arange(
599
+ self.vocab_size, self.vocab_size + self.ptable_shape[1])
600
+ start_ids = [
601
+ np.concatenate(
602
+ (pre_prompt_id, fake_prompt_id, ids, post_prompt_id),
603
+ axis=0) for ids in start_ids
604
+ ]
605
+
606
+ start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
607
+
608
+ max_len = 0
609
+ for seq in start_ids:
610
+ max_len = max(max_len, seq.shape[0])
611
+ start_ids = np.stack([
612
+ np.pad(seq, (0, max_len - seq.shape[0]),
613
+ 'constant',
614
+ constant_values=(0, self.tokenizer_pad_id))
615
+ for seq in start_ids
616
+ ])
617
+
618
+ return start_ids, start_lengths
619
+
620
+ def _to_word_list_format(self, word_lists: List[List[str | bytes]],
621
+ batch_size):
622
+ '''
623
+ word_lists format:
624
+ len(word_lists) == batch_size
625
+ word_lists[i] means the words associated with batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
626
+ '''
627
+ assert self.tokenizer != None, "need to set tokenizer"
628
+
629
+ if word_lists is None:
630
+ # Return an empty array of shape (1,2,0)
631
+ return np.empty([batch_size, 2, 0], dtype="int32")
632
+
633
+ flat_ids = []
634
+ offsets = []
635
+ for word_list in word_lists:
636
+ item_flat_ids = []
637
+ item_offsets = []
638
+
639
+ for word in word_list:
640
+ if isinstance(word, bytes):
641
+ word = word.decode()
642
+
643
+ ids = self.tokenizer.encode(word, add_special_tokens=False)
644
+ if len(ids) == 0:
645
+ continue
646
+
647
+ item_flat_ids += ids
648
+ item_offsets.append(len(ids))
649
+
650
+ flat_ids.append(np.array(item_flat_ids))
651
+ offsets.append(np.cumsum(np.array(item_offsets)))
652
+
653
+ pad_to = max(1, max(len(ids) for ids in flat_ids))
654
+
655
+ for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
656
+ flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
657
+ constant_values=0)
658
+ offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
659
+ constant_values=-1)
660
+
661
+ return np.array([flat_ids, offsets], dtype="int32").transpose(
662
+ (1, 0, 2))
663
+
664
+ def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
665
+ bias_dtype, batch_size):
666
+
667
+ assert self.tokenizer != None, "need to set tokenizer"
668
+
669
+ if embedding_bias_words is None or embedding_bias_weights is None:
670
+ return np.empty([batch_size, 0],
671
+ dtype=self.embedding_bias_weights_dtype)
672
+
673
+ batch_embedding_bias = []
674
+ for words, weights in zip(embedding_bias_words,
675
+ embedding_bias_weights):
676
+
677
+ vocab_size = len(self.tokenizer.vocab)
678
+ embedding_bias = [0.] * vocab_size
679
+
680
+ assert len(words) == len(
681
+ weights
682
+ ), "Embedding bias words must have same dimension as embedding bias weights"
683
+
684
+ for word, weight in zip(words, weights):
685
+ if isinstance(word, bytes):
686
+ word = word.decode()
687
+ ids = self.tokenizer.encode(word)
688
+
689
+ if len(ids) == 0:
690
+ continue
691
+
692
+ for id in ids:
693
+ embedding_bias[id] += weight
694
+
695
+ batch_embedding_bias.append(np.array(embedding_bias))
696
+
697
+ return np.array(batch_embedding_bias, dtype=bias_dtype)
698
+
699
+
700
+ class VisionPreProcessor:
701
+ """ A class that can load images from url requests, and process them via a vision model processor,
702
+ in preparation for the vision encoder.
703
+ """
704
+
705
+ def __init__(self,
706
+ vision_model_type,
707
+ vision_model_processor,
708
+ preprocessor_model_config={}):
709
+ # import libraries that are only relevant for multimodal models
710
+ import torch
711
+ from torch.utils.dlpack import from_dlpack
712
+
713
+ # NOTE: Due to the behavior of MPI initialization, it is recommended to avoid using import tensorrt_llm
714
+ # except for the specific modules tensorrt_llm and multimodal_encoders.
715
+ # As a result, the function str_dtype_to_torch has been copied directly from tensorrt_llm._utils.
716
+ _str_to_torch_dtype_dict = dict(
717
+ bfloat16=torch.bfloat16,
718
+ float16=torch.float16,
719
+ float32=torch.float32,
720
+ int64=torch.int64,
721
+ int32=torch.int32,
722
+ int8=torch.int8,
723
+ bool=torch.bool,
724
+ fp8=torch.float8_e4m3fn,
725
+ )
726
+
727
+ def str_dtype_to_torch(dtype):
728
+ ret = _str_to_torch_dtype_dict.get(dtype)
729
+ assert ret is not None, f'Unsupported dtype: {dtype}'
730
+ return ret
731
+
732
+ self.load_images_tensor = lambda tensor: tensor if not hasattr(
733
+ tensor, 'to_dlpack') else from_dlpack(tensor.to_dlpack())
734
+
735
+ # extract expected output tensor dtype
736
+ self.output_str_dtypes = {}
737
+ for properties in preprocessor_model_config.get('output', []):
738
+ dtype = properties['data_type']
739
+ self.output_str_dtypes[properties['name']] = np.dtype(
740
+ pb_utils.triton_string_to_numpy(dtype)).name
741
+
742
+ # create method for converting output tensors batch to the expected type
743
+ self.convert_tensor_list_to_tensor = lambda tensor_list: torch.concat(
744
+ [
745
+ torch.from_numpy(x) if isinstance(x, np.ndarray) else x
746
+ for x in tensor_list
747
+ ],
748
+ dim=0)
749
+ self.convert_tensor_to_str_dtype = lambda tensor, dtype: tensor.to(
750
+ str_dtype_to_torch(dtype))
751
+
752
+ # create model-specific processor
753
+ self.vision_model_processor = vision_model_processor
754
+ self.vision_model_type = vision_model_type
755
+
756
+ def load_images_from_urls(self, img_urls):
757
+ images = []
758
+ for img_url in img_urls:
759
+ img_url = img_url.decode()
760
+ if img_url.startswith("data:image/jpeg;base64,"):
761
+ image_base64 = img_url.split(",")[1]
762
+ # Decode the base64 string
763
+ image_data = base64.b64decode(image_base64)
764
+ # Create a BytesIO object from the decoded data
765
+ image_buffer = io.BytesIO(image_data)
766
+ images.append(Image.open(image_buffer))
767
+ else:
768
+ images.append(
769
+ Image.open(requests.get(img_url, stream=True).raw))
770
+ return images
771
+
772
+ def mllama_process(self, queries, img_urls=None, image_bytes=None):
773
+ vision_processed_tensors = {}
774
+ if img_urls is not None or image_bytes is not None:
775
+ if img_urls is not None:
776
+ # download and read images
777
+ images = [
778
+ self.load_images_from_urls(urls)
779
+ for urls in img_urls.as_numpy()
780
+ ]
781
+ else:
782
+ images = [
783
+ img for img_list in self.load_images_tensor(image_bytes)
784
+ for img in img_list
785
+ ]
786
+
787
+ batch_size = len(images)
788
+
789
+ preprocessor_outputs = {}
790
+ possible_output_names = [
791
+ 'PIXEL_VALUES', 'ASPECT_RATIO_IDS', 'ASPECT_RATIO_MASK',
792
+ 'CROSS_ATTENTION_MASK'
793
+ ]
794
+ for batch_id in range(batch_size):
795
+ # Preprocess images and query
796
+ processed_vision_data = self.vision_model_processor(
797
+ images=images[batch_id],
798
+ text=queries[batch_id],
799
+ return_tensors="pt")
800
+
801
+ # Reshape pixel_values to [num_images, *HWC/CHW]
802
+ val = processed_vision_data["pixel_values"]
803
+
804
+ val = val.reshape(1, -1, *(val.shape[-3:]))
805
+ processed_vision_data["pixel_values"] = val
806
+ # Create vision output tensors
807
+ for key in possible_output_names:
808
+ val = processed_vision_data.get(key.lower())
809
+ if val is not None:
810
+ if key not in preprocessor_outputs:
811
+ preprocessor_outputs[key] = []
812
+ preprocessor_outputs[key].append(val)
813
+
814
+ for key, tensor_list in preprocessor_outputs.items():
815
+ val = self.convert_tensor_list_to_tensor(tensor_list)
816
+ if key in self.output_str_dtypes:
817
+ val = self.convert_tensor_to_str_dtype(
818
+ val, self.output_str_dtypes[key])
819
+ vision_processed_tensors[key] = val
820
+ return vision_processed_tensors
821
+
822
+ def llava_onevision_process_image(self,
823
+ queries,
824
+ img_urls=None,
825
+ image_bytes=None):
826
+
827
+ import torch
828
+ vision_processed_tensors = {}
829
+ if img_urls is not None:
830
+ # download and read images
831
+ images = [
832
+ self.load_images_from_urls(urls)
833
+ for urls in img_urls.as_numpy()
834
+ ]
835
+ else:
836
+ images = [
837
+ img for img_list in self.load_images_tensor(image_bytes)
838
+ for img in img_list
839
+ ]
840
+
841
+ batch_size = len(images)
842
+ assert len(
843
+ queries
844
+ ) == batch_size, f"Image must have the same batch size as Query."
845
+ preprocessor_outputs = {}
846
+ possible_output_names = ['PIXEL_VALUES', 'IMAGE_SIZES']
847
+ visual_tokens = []
848
+ for batch_id in range(batch_size):
849
+ # Preprocess images and query
850
+ processed_vision_data = self.vision_model_processor(
851
+ images=images[batch_id], text='<image>', return_tensors="pt")
852
+ visual_tokens.append(processed_vision_data['input_ids'].shape[1])
853
+
854
+ # Create vision output tensors
855
+ for key in possible_output_names:
856
+ val = processed_vision_data.get(key.lower())
857
+ if val is not None:
858
+ if key not in preprocessor_outputs:
859
+ preprocessor_outputs[key] = []
860
+ preprocessor_outputs[key].append(val)
861
+
862
+ max_patch = max(x.shape[1]
863
+ for x in preprocessor_outputs['PIXEL_VALUES'])
864
+ preprocessor_outputs['PIXEL_VALUES'] = [
865
+ torch.nn.functional.pad(
866
+ image, (0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[1], 0, 0),
867
+ mode='constant')
868
+ for image in preprocessor_outputs['PIXEL_VALUES']
869
+ ]
870
+ for key, tensor_list in preprocessor_outputs.items():
871
+ val = self.convert_tensor_list_to_tensor(tensor_list)
872
+ if key in self.output_str_dtypes:
873
+ val = self.convert_tensor_to_str_dtype(
874
+ val, self.output_str_dtypes[key])
875
+ vision_processed_tensors[key] = val
876
+ return vision_processed_tensors, visual_tokens
877
+
878
+ def llava_onevision_process_video(self, queries, video_bytes=None):
879
+ import torch
880
+ vision_processed_tensors = {}
881
+ videos = [video for video in self.load_images_tensor(video_bytes)]
882
+
883
+ batch_size = len(videos)
884
+ assert len(
885
+ queries
886
+ ) == batch_size, f"Video must have the same batch size as Query."
887
+ preprocessor_outputs = {}
888
+ preprocessor_outputs['PIXEL_VALUES'] = []
889
+ preprocessor_outputs['IS_VIDEO_INPUT'] = []
890
+ visual_tokens = []
891
+ for batch_id in range(len(queries)):
892
+ processed_vision_data = self.vision_model_processor(
893
+ videos=list(videos[batch_id]),
894
+ text='<video>',
895
+ return_tensors="pt")
896
+ visual_tokens.append(processed_vision_data['input_ids'].shape[1])
897
+ preprocessor_outputs['PIXEL_VALUES'].append(
898
+ processed_vision_data['pixel_values_videos'])
899
+ preprocessor_outputs['IS_VIDEO_INPUT'].append(
900
+ torch.ones((1, 1), dtype=torch.bool))
901
+
902
+ for key, tensor_list in preprocessor_outputs.items():
903
+ val = self.convert_tensor_list_to_tensor(tensor_list)
904
+ if key in self.output_str_dtypes:
905
+ val = self.convert_tensor_to_str_dtype(
906
+ val, self.output_str_dtypes[key])
907
+ vision_processed_tensors[key] = val
908
+ return vision_processed_tensors, visual_tokens
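Note: _to_word_list_format above packs each batch item's words into the [batch_size, 2, max_len] layout used by the BAD_WORDS_IDS and STOP_WORDS_IDS tensors declared in the preprocessing config below: row 0 holds the flattened token ids, row 1 the cumulative end offsets (ids are padded with 0, offsets with -1). The sketch below is illustrative only and is not part of the uploaded files; convert_word_list in tensorrt_llm/1/model.py performs the equivalent unpacking on the server side.

    # Illustrative sketch: unpack one [2, max_len] word-list slice.
    import numpy as np

    def unpack_word_list(word_list_slice):
        ids, offsets = word_list_slice[0], word_list_slice[1]
        words, start = [], 0
        for end in offsets:
            if end < 0:  # -1 padding marks the end of the valid offsets
                break
            words.append(ids[start:end].tolist())
            start = int(end)
        return words

    # Two "words" of 2 and 1 token ids -> [[11, 12], [13]]
    print(unpack_word_list(np.array([[11, 12, 13], [2, 3, -1]], dtype=np.int32)))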
preprocessing/config.pbtxt ADDED
@@ -0,0 +1,240 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "preprocessing"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+
31
+
32
+ input [
33
+ {
34
+ name: "QUERY"
35
+ data_type: TYPE_STRING
36
+ dims: [ 1 ]
37
+ },
38
+ {
39
+ name: "DECODER_QUERY"
40
+ data_type: TYPE_STRING
41
+ dims: [ 1 ]
42
+ optional: true
43
+ },
44
+ {
45
+ name: "IMAGE_BYTES"
46
+ data_type: TYPE_UINT8
47
+ dims: [ -1, -1, -1, -1 ]
48
+ optional: true
49
+ },
50
+ {
51
+ name: "IMAGE_URL"
52
+ data_type: TYPE_STRING
53
+ dims: [ 1 ]
54
+ optional: true
55
+ },
56
+ {
57
+ name: "VIDEO_BYTES"
58
+ data_type: TYPE_UINT8
59
+ dims: [ -1, -1, -1, -1 ]
60
+ optional: true
61
+ },
62
+ {
63
+ name: "REQUEST_OUTPUT_LEN"
64
+ data_type: TYPE_INT32
65
+ dims: [ 1 ]
66
+ },
67
+ {
68
+ name: "BAD_WORDS_DICT"
69
+ data_type: TYPE_STRING
70
+ dims: [ -1 ]
71
+ optional: true
72
+ },
73
+ {
74
+ name: "STOP_WORDS_DICT"
75
+ data_type: TYPE_STRING
76
+ dims: [ -1 ]
77
+ optional: true
78
+ },
79
+ {
80
+ name: "EMBEDDING_BIAS_WORDS"
81
+ data_type: TYPE_STRING
82
+ dims: [ -1 ]
83
+ optional: true
84
+ },
85
+ {
86
+ name: "EMBEDDING_BIAS_WEIGHTS"
87
+ data_type: TYPE_FP32
88
+ dims: [ -1 ]
89
+ optional: true
90
+ },
91
+ {
92
+ name: "END_ID"
93
+ data_type: TYPE_INT32
94
+ dims: [ 1 ]
95
+ optional: true
96
+ },
97
+ {
98
+ name: "PAD_ID"
99
+ data_type: TYPE_INT32
100
+ dims: [ 1 ]
101
+ optional: true
102
+ },
103
+ {
104
+ name: "PROMPT_TABLE_EXTRA_ID"
105
+ data_type: TYPE_UINT64
106
+ dims: [ 1 ]
107
+ optional: true
108
+ }
109
+ ]
110
+ output [
111
+ {
112
+ name: "INPUT_ID"
113
+ data_type: TYPE_INT32
114
+ dims: [ -1 ]
115
+ },
116
+ {
117
+ name: "REQUEST_INPUT_LEN"
118
+ data_type: TYPE_INT32
119
+ dims: [ 1 ]
120
+ },
121
+ {
122
+ name: "DECODER_INPUT_ID"
123
+ data_type: TYPE_INT32
124
+ dims: [ -1 ]
125
+ },
126
+ {
127
+ name: "REQUEST_DECODER_INPUT_LEN"
128
+ data_type: TYPE_INT32
129
+ dims: [ 1 ]
130
+ },
131
+ {
132
+ name: "BAD_WORDS_IDS"
133
+ data_type: TYPE_INT32
134
+ dims: [ 2, -1 ]
135
+ },
136
+ {
137
+ name: "STOP_WORDS_IDS"
138
+ data_type: TYPE_INT32
139
+ dims: [ 2, -1 ]
140
+ },
141
+ {
142
+ name: "EMBEDDING_BIAS"
143
+ data_type: TYPE_FP32
144
+ dims: [ -1 ]
145
+ },
146
+ {
147
+ name: "REQUEST_OUTPUT_LEN"
148
+ data_type: TYPE_INT32
149
+ dims: [ -1 ]
150
+ },
151
+ {
152
+ name: "OUT_END_ID"
153
+ data_type: TYPE_INT32
154
+ dims: [ 1 ]
155
+ },
156
+ {
157
+ name: "OUT_PAD_ID"
158
+ data_type: TYPE_INT32
159
+ dims: [ 1 ]
160
+ },
161
+ {
162
+ name: "OUT_PROMPT_TABLE_EXTRA_IDS"
163
+ data_type: TYPE_UINT64
164
+ dims: [ -1 ]
165
+ },
166
+ {
167
+ name: "PIXEL_VALUES"
168
+ data_type: TYPE_FP16
169
+ dims: [ -1, -1, -1, -1 ]
170
+ },
171
+ {
172
+ name: "ASPECT_RATIO_IDS"
173
+ data_type: TYPE_INT64
174
+ dims: [ -1 ]
175
+ },
176
+ {
177
+ name: "ASPECT_RATIO_MASK"
178
+ data_type: TYPE_INT64
179
+ dims: [ -1, -1 ]
180
+ },
181
+ {
182
+ name: "CROSS_ATTENTION_MASK"
183
+ data_type: TYPE_INT64
184
+ dims: [ -1, -1, -1 ]
185
+ },
186
+ # Required for image postprocessing in the llava_onevision model
187
+ {
188
+ name: "IMAGE_SIZES"
189
+ data_type: TYPE_INT64
190
+ dims: [ 2 ]
191
+ },
192
+ # Indicates if the input is video in the llava_onevision model
193
+ {
194
+ name: "IS_VIDEO_INPUT"
195
+ data_type: TYPE_BOOL
196
+ dims: [ 1 ]
197
+ }
198
+ ]
199
+
200
+ parameters {
201
+ key: "tokenizer_dir"
202
+ value: {
203
+ string_value: "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
204
+ }
205
+ }
206
+
207
+ parameters {
208
+ key: "add_special_tokens"
209
+ value: {
210
+ string_value: "False"
211
+ }
212
+ }
213
+
214
+ parameters {
215
+ key: "visual_model_path"
216
+ value: {
217
+ string_value: "${visual_model_path}"
218
+ }
219
+ }
220
+
221
+ parameters: {
222
+ key: "gpt_model_path"
223
+ value: {
224
+ string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
225
+ }
226
+ }
227
+
228
+ parameters: {
229
+ key: "max_num_images"
230
+ value: {
231
+ string_value: "${max_num_images}"
232
+ }
233
+ }
234
+
235
+ instance_group [
236
+ {
237
+ count: 1
238
+ kind: KIND_CPU
239
+ }
240
+ ]
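For reference, a minimal client-side sketch for exercising the preprocessing model defined above. The server URL and the example prompt are assumptions (a Triton server hosting this repository on localhost:8000 with tritonclient installed); only the two required inputs, QUERY and REQUEST_OUTPUT_LEN, are set.

    # Minimal sketch of a request to the "preprocessing" model (assumed server at localhost:8000).
    import numpy as np
    import tritonclient.http as httpclient

    client = httpclient.InferenceServerClient(url="localhost:8000")

    query = np.array([["What is the capital of France?"]], dtype=object)  # QUERY, TYPE_STRING, dims [1]
    output_len = np.array([[64]], dtype=np.int32)                         # REQUEST_OUTPUT_LEN, dims [1]

    inputs = [
        httpclient.InferInput("QUERY", list(query.shape), "BYTES"),
        httpclient.InferInput("REQUEST_OUTPUT_LEN", list(output_len.shape), "INT32"),
    ]
    inputs[0].set_data_from_numpy(query)
    inputs[1].set_data_from_numpy(output_len)

    result = client.infer("preprocessing", inputs)
    print(result.as_numpy("INPUT_ID"))           # tokenized prompt ids
    print(result.as_numpy("REQUEST_INPUT_LEN"))  # number of prompt tokens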
tensorrt_llm/1/.gitkeep ADDED
File without changes
tensorrt_llm/1/config.json ADDED
@@ -0,0 +1,362 @@
1
+ {
2
+ "version": "0.18.0.dev2025020400",
3
+ "pretrained_config": {
4
+ "mlp_bias": false,
5
+ "attn_bias": false,
6
+ "rotary_base": 500000.0,
7
+ "rotary_scaling": {
8
+ "factor": 8.0,
9
+ "high_freq_factor": 4.0,
10
+ "low_freq_factor": 1.0,
11
+ "original_max_position_embeddings": 8192,
12
+ "rope_type": "llama3"
13
+ },
14
+ "residual_mlp": false,
15
+ "disable_weight_only_quant_plugin": false,
16
+ "moe": {
17
+ "num_experts": 0,
18
+ "shared_expert_intermediate_size": 0,
19
+ "top_k": 0,
20
+ "normalization_mode": null,
21
+ "sparse_mixer_epsilon": 0.01,
22
+ "tp_mode": 0,
23
+ "device_limited_n_group": 0,
24
+ "device_limited_topk_group": 0,
25
+ "device_limited_routed_scaling_factor": 1.0
26
+ },
27
+ "remove_duplicated_kv_heads": false,
28
+ "fc_after_embed": false,
29
+ "use_input_layernorm_in_first_layer": true,
30
+ "use_last_layernorm": true,
31
+ "layer_idx_offset": 0,
32
+ "embedding_multiplier": 1.0,
33
+ "attention_multiplier": 1.0,
34
+ "residual_multiplier": 1.0,
35
+ "output_multiplier_scale": 1.0,
36
+ "has_partial_lora_mask": false,
37
+ "architecture": "LlamaForCausalLM",
38
+ "dtype": "float16",
39
+ "vocab_size": 128256,
40
+ "hidden_size": 8192,
41
+ "num_hidden_layers": 80,
42
+ "num_attention_heads": 64,
43
+ "hidden_act": "silu",
44
+ "logits_dtype": "float16",
45
+ "norm_epsilon": 1e-05,
46
+ "runtime_defaults": null,
47
+ "position_embedding_type": "rope_gpt_neox",
48
+ "num_key_value_heads": 8,
49
+ "intermediate_size": 28672,
50
+ "max_position_embeddings": 131072,
51
+ "mapping": {
52
+ "world_size": 2,
53
+ "gpus_per_node": 8,
54
+ "cp_size": 1,
55
+ "tp_size": 2,
56
+ "pp_size": 1,
57
+ "moe_tp_size": 2,
58
+ "moe_ep_size": 1,
59
+ "auto_parallel": false
60
+ },
61
+ "quantization": {
62
+ "quant_algo": "FP8",
63
+ "kv_cache_quant_algo": "FP8",
64
+ "group_size": 128,
65
+ "smoothquant_val": 0.5,
66
+ "clamp_val": null,
67
+ "use_meta_recipe": false,
68
+ "has_zero_point": false,
69
+ "pre_quant_scale": false,
70
+ "exclude_modules": [
71
+ "transformer.layers.33.input_layernorm",
72
+ "transformer.layers.58.post_layernorm",
73
+ "transformer.layers.43.post_layernorm",
74
+ "transformer.layers.45.input_layernorm",
75
+ "transformer.layers.8.post_layernorm",
76
+ "transformer.layers.79.input_layernorm",
77
+ "transformer.layers.70.post_layernorm",
78
+ "transformer.layers.73.input_layernorm",
79
+ "transformer.layers.19.input_layernorm",
80
+ "transformer.layers.46.input_layernorm",
81
+ "transformer.layers.48.input_layernorm",
82
+ "transformer.layers.67.post_layernorm",
83
+ "transformer.layers.12.input_layernorm",
84
+ "transformer.layers.60.post_layernorm",
85
+ "transformer.layers.17.post_layernorm",
86
+ "transformer.layers.57.input_layernorm",
87
+ "transformer.layers.0.input_layernorm",
88
+ "transformer.layers.49.input_layernorm",
89
+ "transformer.layers.4.post_layernorm",
90
+ "transformer.layers.39.post_layernorm",
91
+ "transformer.layers.73.post_layernorm",
92
+ "transformer.layers.44.post_layernorm",
93
+ "transformer.layers.13.input_layernorm",
94
+ "transformer.layers.56.post_layernorm",
95
+ "transformer.layers.62.post_layernorm",
96
+ "transformer.layers.42.post_layernorm",
97
+ "transformer.layers.27.input_layernorm",
98
+ "transformer.layers.22.post_layernorm",
99
+ "transformer.layers.77.input_layernorm",
100
+ "transformer.layers.51.input_layernorm",
101
+ "transformer.layers.21.post_layernorm",
102
+ "transformer.layers.54.post_layernorm",
103
+ "transformer.layers.22.input_layernorm",
104
+ "transformer.layers.47.input_layernorm",
105
+ "transformer.layers.15.input_layernorm",
106
+ "transformer.layers.7.input_layernorm",
107
+ "transformer.layers.63.input_layernorm",
108
+ "transformer.layers.70.input_layernorm",
109
+ "transformer.layers.5.input_layernorm",
110
+ "transformer.layers.29.post_layernorm",
111
+ "transformer.vocab_embedding",
112
+ "transformer.layers.2.post_layernorm",
113
+ "transformer.layers.11.post_layernorm",
114
+ "transformer.layers.54.input_layernorm",
115
+ "transformer.layers.45.post_layernorm",
116
+ "transformer.layers.78.post_layernorm",
117
+ "transformer.layers.23.post_layernorm",
118
+ "transformer.layers.30.input_layernorm",
119
+ "transformer.layers.58.input_layernorm",
120
+ "transformer.layers.18.input_layernorm",
121
+ "transformer.layers.3.input_layernorm",
122
+ "transformer.layers.7.post_layernorm",
123
+ "transformer.layers.77.post_layernorm",
124
+ "transformer.layers.47.post_layernorm",
125
+ "transformer.layers.38.input_layernorm",
126
+ "transformer.layers.41.post_layernorm",
127
+ "transformer.layers.55.post_layernorm",
128
+ "transformer.layers.64.post_layernorm",
129
+ "transformer.layers.57.post_layernorm",
130
+ "transformer.layers.29.input_layernorm",
131
+ "transformer.layers.28.input_layernorm",
132
+ "transformer.layers.9.input_layernorm",
133
+ "transformer.layers.43.input_layernorm",
134
+ "transformer.layers.28.post_layernorm",
135
+ "transformer.layers.52.post_layernorm",
136
+ "transformer.layers.17.input_layernorm",
137
+ "transformer.layers.19.post_layernorm",
138
+ "transformer.layers.15.post_layernorm",
139
+ "transformer.layers.25.post_layernorm",
140
+ "transformer.layers.32.input_layernorm",
141
+ "transformer.layers.76.post_layernorm",
142
+ "transformer.layers.16.input_layernorm",
143
+ "transformer.layers.75.post_layernorm",
144
+ "transformer.layers.62.input_layernorm",
145
+ "transformer.layers.50.input_layernorm",
146
+ "transformer.layers.35.input_layernorm",
147
+ "transformer.layers.59.input_layernorm",
148
+ "transformer.layers.68.post_layernorm",
149
+ "transformer.layers.40.post_layernorm",
150
+ "transformer.layers.10.post_layernorm",
151
+ "transformer.layers.50.post_layernorm",
152
+ "transformer.layers.14.input_layernorm",
153
+ "transformer.layers.61.post_layernorm",
154
+ "transformer.layers.41.input_layernorm",
155
+ "transformer.layers.3.post_layernorm",
156
+ "transformer.layers.69.input_layernorm",
157
+ "transformer.layers.2.input_layernorm",
158
+ "transformer.layers.1.post_layernorm",
159
+ "transformer.layers.14.post_layernorm",
160
+ "transformer.layers.1.input_layernorm",
161
+ "transformer.layers.53.input_layernorm",
162
+ "transformer.layers.65.input_layernorm",
163
+ "lm_head",
164
+ "transformer.layers.32.post_layernorm",
165
+ "transformer.layers.11.input_layernorm",
166
+ "transformer.layers.59.post_layernorm",
167
+ "transformer.layers.37.input_layernorm",
168
+ "transformer.ln_f",
169
+ "transformer.layers.4.input_layernorm",
170
+ "transformer.layers.34.post_layernorm",
171
+ "transformer.layers.78.input_layernorm",
172
+ "transformer.layers.44.input_layernorm",
173
+ "transformer.layers.48.post_layernorm",
174
+ "transformer.layers.20.post_layernorm",
175
+ "transformer.layers.49.post_layernorm",
176
+ "transformer.layers.42.input_layernorm",
177
+ "transformer.layers.66.post_layernorm",
178
+ "transformer.layers.74.input_layernorm",
179
+ "transformer.layers.20.input_layernorm",
180
+ "transformer.layers.5.post_layernorm",
181
+ "transformer.layers.69.post_layernorm",
182
+ "transformer.layers.35.post_layernorm",
183
+ "transformer.layers.56.input_layernorm",
184
+ "transformer.layers.79.post_layernorm",
185
+ "transformer.layers.31.post_layernorm",
186
+ "transformer.layers.60.input_layernorm",
187
+ "transformer.layers.36.post_layernorm",
188
+ "transformer.layers.23.input_layernorm",
189
+ "transformer.layers.26.post_layernorm",
190
+ "transformer.layers.66.input_layernorm",
191
+ "transformer.layers.68.input_layernorm",
192
+ "transformer.layers.52.input_layernorm",
193
+ "transformer.layers.72.input_layernorm",
194
+ "transformer.layers.26.input_layernorm",
195
+ "transformer.layers.9.post_layernorm",
196
+ "transformer.layers.71.post_layernorm",
197
+ "transformer.layers.72.post_layernorm",
198
+ "transformer.layers.18.post_layernorm",
199
+ "transformer.layers.6.input_layernorm",
200
+ "transformer.layers.33.post_layernorm",
201
+ "transformer.layers.51.post_layernorm",
202
+ "transformer.layers.76.input_layernorm",
203
+ "transformer.layers.64.input_layernorm",
204
+ "transformer.layers.16.post_layernorm",
205
+ "transformer.layers.25.input_layernorm",
206
+ "transformer.layers.0.post_layernorm",
207
+ "transformer.layers.38.post_layernorm",
208
+ "transformer.layers.63.post_layernorm",
209
+ "transformer.layers.12.post_layernorm",
210
+ "transformer.layers.30.post_layernorm",
211
+ "transformer.layers.67.input_layernorm",
212
+ "transformer.layers.46.post_layernorm",
213
+ "transformer.layers.24.input_layernorm",
214
+ "transformer.layers.53.post_layernorm",
215
+ "transformer.layers.74.post_layernorm",
216
+ "transformer.layers.71.input_layernorm",
217
+ "transformer.layers.55.input_layernorm",
218
+ "transformer.layers.6.post_layernorm",
219
+ "transformer.layers.40.input_layernorm",
220
+ "transformer.layers.13.post_layernorm",
221
+ "transformer.layers.27.post_layernorm",
222
+ "transformer.layers.8.input_layernorm",
223
+ "transformer.layers.24.post_layernorm",
224
+ "transformer.layers.37.post_layernorm",
225
+ "transformer.layers.61.input_layernorm",
226
+ "transformer.layers.34.input_layernorm",
227
+ "transformer.layers.36.input_layernorm",
228
+ "transformer.layers.31.input_layernorm",
229
+ "transformer.layers.65.post_layernorm",
230
+ "transformer.layers.21.input_layernorm",
231
+ "transformer.layers.39.input_layernorm",
232
+ "transformer.layers.10.input_layernorm",
233
+ "transformer.layers.75.input_layernorm"
234
+ ]
235
+ },
236
+ "use_parallel_embedding": true,
237
+ "embedding_sharding_dim": 0,
238
+ "head_size": 128,
239
+ "qk_layernorm": false,
240
+ "rotary_embedding_dim": 128,
241
+ "producer": {
242
+ "name": "modelopt",
243
+ "version": "0.23.0"
244
+ },
245
+ "share_embedding_table": false,
246
+ "bias": false,
247
+ "rotary_pct": 1.0,
248
+ "rank": 1,
249
+ "decoder": "llama",
250
+ "rmsnorm": true,
251
+ "lm_head_bias": false,
252
+ "tie_word_embeddings": false,
253
+ "model_type": "llama"
254
+ },
255
+ "build_config": {
256
+ "max_input_len": 124000,
257
+ "max_seq_len": 131072,
258
+ "opt_batch_size": 8,
259
+ "max_batch_size": 32,
260
+ "max_beam_width": 1,
261
+ "max_num_tokens": 128000,
262
+ "opt_num_tokens": null,
263
+ "max_prompt_embedding_table_size": 0,
264
+ "kv_cache_type": "PAGED",
265
+ "gather_context_logits": false,
266
+ "gather_generation_logits": false,
267
+ "strongly_typed": true,
268
+ "force_num_profiles": null,
269
+ "profiling_verbosity": "layer_names_only",
270
+ "enable_debug_output": false,
271
+ "max_draft_len": 0,
272
+ "speculative_decoding_mode": 1,
273
+ "use_refit": false,
274
+ "input_timing_cache": null,
275
+ "output_timing_cache": "model.cache",
276
+ "lora_config": {
277
+ "lora_dir": [],
278
+ "lora_ckpt_source": "hf",
279
+ "max_lora_rank": 64,
280
+ "lora_target_modules": [],
281
+ "trtllm_modules_to_hf_modules": {}
282
+ },
283
+ "auto_parallel_config": {
284
+ "world_size": 1,
285
+ "gpus_per_node": 8,
286
+ "cluster_key": "H100-PCIe",
287
+ "cluster_info": null,
288
+ "sharding_cost_model": "alpha_beta",
289
+ "comm_cost_model": "alpha_beta",
290
+ "enable_pipeline_parallelism": false,
291
+ "enable_shard_unbalanced_shape": false,
292
+ "enable_shard_dynamic_shape": false,
293
+ "enable_reduce_scatter": true,
294
+ "builder_flags": null,
295
+ "debug_mode": false,
296
+ "infer_shape": true,
297
+ "validation_mode": false,
298
+ "same_buffer_io": {
299
+ "past_key_value_(\\d+)": "present_key_value_\\1"
300
+ },
301
+ "same_spec_io": {},
302
+ "sharded_io_allowlist": [
303
+ "past_key_value_\\d+",
304
+ "present_key_value_\\d*"
305
+ ],
306
+ "fill_weights": false,
307
+ "parallel_config_cache": null,
308
+ "profile_cache": null,
309
+ "dump_path": null,
310
+ "debug_outputs": []
311
+ },
312
+ "weight_sparsity": false,
313
+ "weight_streaming": false,
314
+ "plugin_config": {
315
+ "dtype": "float16",
316
+ "bert_attention_plugin": "auto",
317
+ "gpt_attention_plugin": "auto",
318
+ "gemm_plugin": "fp8",
319
+ "explicitly_disable_gemm_plugin": false,
320
+ "gemm_swiglu_plugin": null,
321
+ "fp8_rowwise_gemm_plugin": null,
322
+ "qserve_gemm_plugin": null,
323
+ "identity_plugin": null,
324
+ "nccl_plugin": "float16",
325
+ "lora_plugin": null,
326
+ "dora_plugin": false,
327
+ "weight_only_groupwise_quant_matmul_plugin": null,
328
+ "weight_only_quant_matmul_plugin": null,
329
+ "smooth_quant_plugins": true,
330
+ "smooth_quant_gemm_plugin": null,
331
+ "layernorm_quantization_plugin": null,
332
+ "rmsnorm_quantization_plugin": null,
333
+ "quantize_per_token_plugin": false,
334
+ "quantize_tensor_plugin": false,
335
+ "moe_plugin": "auto",
336
+ "mamba_conv1d_plugin": "auto",
337
+ "low_latency_gemm_plugin": null,
338
+ "low_latency_gemm_swiglu_plugin": null,
339
+ "gemm_allreduce_plugin": null,
340
+ "context_fmha": true,
341
+ "bert_context_fmha_fp32_acc": false,
342
+ "paged_kv_cache": true,
343
+ "remove_input_padding": true,
344
+ "reduce_fusion": false,
345
+ "user_buffer": false,
346
+ "tokens_per_block": 32,
347
+ "use_paged_context_fmha": true,
348
+ "use_fp8_context_fmha": true,
349
+ "fuse_fp4_quant": false,
350
+ "multiple_profiles": true,
351
+ "paged_state": false,
352
+ "streamingllm": false,
353
+ "manage_weights": false,
354
+ "use_fused_mlp": true,
355
+ "pp_reduce_scatter": false
356
+ },
357
+ "use_strip_plan": false,
358
+ "max_encoder_input_len": 1024,
359
+ "monitor_memory": false,
360
+ "use_mrope": false
361
+ }
362
+ }
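In short, config.json above records a LlamaForCausalLM engine built with TensorRT-LLM 0.18.0.dev2025020400 using 2-way tensor parallelism, FP8 weight and KV-cache quantization, a paged KV cache, and a 131072-token sequence limit. A small sketch for reading these fields back out (the path assumes this repository's layout):

    # Sketch: inspect the key build parameters recorded in config.json.
    import json

    with open("tensorrt_llm/1/config.json") as f:
        cfg = json.load(f)

    pretrained = cfg["pretrained_config"]
    build = cfg["build_config"]
    print("tp_size:", pretrained["mapping"]["tp_size"])                          # 2
    print("quant_algo:", pretrained["quantization"]["quant_algo"])               # FP8
    print("kv_cache_quant:", pretrained["quantization"]["kv_cache_quant_algo"])  # FP8
    print("max_seq_len:", build["max_seq_len"])                                  # 131072
    print("max_batch_size:", build["max_batch_size"])                            # 32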
tensorrt_llm/1/model.py ADDED
@@ -0,0 +1,1386 @@
1
+ import datetime
2
+ import json
3
+ import os
4
+ import sys
5
+ import time
6
+ from dataclasses import dataclass
7
+ from random import randint
8
+ from threading import Lock, Thread
9
+ from typing import Any, List
10
+
11
+ import numpy as np
12
+ import torch
13
+ import triton_python_backend_utils as pb_utils
14
+ from torch import from_numpy
15
+ from torch.utils.dlpack import from_dlpack
16
+
17
+ import tensorrt_llm.bindings.executor as trtllm
18
+ from tensorrt_llm.llmapi.tokenizer import _xgrammar_tokenizer_info
19
+
20
+ METRIC_TOTAL_OUTPUT_TOKENS = "total_output_tokens"
21
+ METRIC_TOTAL_INPUT_TOKENS = "total_input_tokens"
22
+ import tensorrt_llm.logger as logger
23
+
24
+ # From https://github.com/pytorch/pytorch/blob/39425feac799905402abe4d15667fa47c344f2d7/torch/testing/_internal/common_utils.py#L1761
25
+ # Dict of NumPy dtype -> torch dtype (when the correspondence exists)
26
+ numpy_to_torch_dtype_dict = {
27
+ np.bool_: torch.bool,
28
+ np.uint8: torch.uint8,
29
+ np.uint16: torch.uint16,
30
+ np.uint32: torch.uint32,
31
+ np.uint64: torch.uint64,
32
+ np.int8: torch.int8,
33
+ np.int16: torch.int16,
34
+ np.int32: torch.int32,
35
+ np.int64: torch.int64,
36
+ np.float16: torch.float16,
37
+ np.float32: torch.float32,
38
+ np.float64: torch.float64,
39
+ np.complex64: torch.complex64,
40
+ np.complex128: torch.complex128
41
+ }
42
+
43
+ # Dict of torch dtype -> NumPy dtype
44
+ torch_to_numpy_dtype_dict = {
45
+ value: key
46
+ for (key, value) in numpy_to_torch_dtype_dict.items()
47
+ }
48
+ torch_to_numpy_dtype_dict.update({
49
+ torch.bfloat16: np.float32,
50
+ torch.complex32: np.complex64
51
+ })
52
+
53
+
54
+ @dataclass
55
+ class RequestData:
56
+ triton_req_id: int
57
+ triton_user_id: str
58
+ batch_index: int
59
+ batch_size: int
60
+ num_return_sequences: int
61
+ num_input_tokens: int
62
+ num_output_tokens: int
63
+ response_sender: Any
64
+
65
+
66
+ def mpi_comm():
67
+ from mpi4py import MPI
68
+ return MPI.COMM_WORLD
69
+
70
+
71
+ def mpi_rank():
72
+ return mpi_comm().Get_rank()
73
+
74
+
75
+ def get_input_tensor_by_name(request,
76
+ name,
77
+ expected_batch_size=None,
78
+ batch_index=None,
79
+ force_on_torch=False):
80
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
81
+ if tensor is None:
82
+ return None
83
+
84
+ if tensor.is_cpu() and not force_on_torch:
85
+ tensor = tensor.as_numpy()
86
+ else:
87
+ tensor = from_dlpack(tensor.to_dlpack())
88
+
89
+ if expected_batch_size is not None and tensor.shape[
90
+ 0] != expected_batch_size:
91
+ raise pb_utils.TritonModelException(
92
+ f"Expected batch size doesn't match batch size for tensor {name}. Expected {expected_batch_size} got {tensor.shape[0]}"
93
+ )
94
+
95
+ if batch_index is not None and expected_batch_size is not None and batch_index >= expected_batch_size:
96
+ raise pb_utils.TritonModelException(
97
+ f"Invalid batch index in get_input_tensor_by_name for {name}")
98
+
99
+ if batch_index is not None:
100
+ # Add leading 1 batch dimension
101
+ if isinstance(tensor, np.ndarray):
102
+ return np.expand_dims(tensor[batch_index], axis=0)
103
+ elif isinstance(tensor, torch.Tensor):
104
+ return torch.unsqueeze(tensor[batch_index], dim=0)
105
+ else:
106
+ return tensor
107
+
108
+
109
+ def get_input_scalar_by_name(request,
110
+ name,
111
+ expected_batch_size=1,
112
+ batch_index=0):
113
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
114
+ if tensor is None:
115
+ return None
116
+ tensor = tensor.as_numpy()
117
+
118
+ if tensor.size != expected_batch_size:
119
+ raise pb_utils.TritonModelException(
120
+ f"Expected a scalar tensor for tensor {name}")
121
+
122
+ return tensor.item(batch_index)
123
+
124
+
125
+ def read_parameter_as_type(value, name, pytype=str):
126
+ if value == "":
127
+ return None
128
+ if value.startswith("${") and value.endswith("}"):
129
+ return None
130
+ if pytype is bool:
131
+ return value.lower() in ["1", "true"]
132
+ try:
133
+ result = pytype(value)
134
+ return result
135
+ except Exception:
136
+ pb_utils.Logger.log_warning(
137
+ f"Could not read parameter '{name}' with value '{value}', will use default."
138
+ )
139
+ return None
140
+
141
+
142
+ def get_parameter(model_config, name, pytype=str):
143
+ if name not in model_config['parameters']:
144
+ return None
145
+ return read_parameter_as_type(
146
+ model_config['parameters'][name]['string_value'], name, pytype)
147
+
148
+
149
+ def convert_word_list(word_list):
150
+ if word_list is None:
151
+ return None
152
+ word_list = word_list.tolist()
153
+ if len(word_list) == 0 or len(word_list[0]) != 2:
154
+ raise pb_utils.TritonModelException(f"Invalid format for word list.")
155
+ words, indices = word_list[0]
156
+ result = []
157
+ current_index = 0
158
+ for i in indices:
159
+ if i == -1:
160
+ continue
161
+ if i > len(words):
162
+ raise pb_utils.TritonModelException(
163
+ f"Invalid format for word list.")
164
+ current_word = []
165
+ while current_index < i:
166
+ current_word.append(words[current_index])
167
+ current_index += 1
168
+ result.append(current_word)
169
+ return result
170
+
171
+
172
+ def parse_medusa_choices(medusa_choices):
173
+ if medusa_choices is None:
174
+ return None
175
+ try:
176
+ result = json.loads(
177
+ "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
178
+ assert isinstance(result, list) and len(result) > 0
179
+ assert all([isinstance(x, list) for x in result])
180
+ assert all([isinstance(y, int) for x in result for y in x])
181
+ except Exception:
182
+ raise pb_utils.TritonModelException(
183
+ "Invalid format for medusa_choices")
184
+ return result
185
+
186
+
187
+ def parse_eagle_choices(eagle_choices):
188
+ return parse_medusa_choices(eagle_choices)
189
+
190
+
191
+ def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
192
+ kwargs = {}
193
+ kwargs['beam_width'] = get_input_scalar_by_name(
194
+ request, 'beam_width', batch_size, batch_index) or 1
195
+ kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k',
196
+ batch_size, batch_index)
197
+ kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p',
198
+ batch_size, batch_index)
199
+ kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[
200
+ 'top_p'] <= 0 else kwargs['top_p']
201
+ kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed',
202
+ batch_size, batch_index)
203
+ kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature',
204
+ batch_size, batch_index)
205
+ kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length',
206
+ batch_size, batch_index)
207
+ kwargs['repetition_penalty'] = get_input_scalar_by_name(
208
+ request, 'repetition_penalty', batch_size, batch_index)
209
+ kwargs['presence_penalty'] = get_input_scalar_by_name(
210
+ request, 'presence_penalty', batch_size, batch_index)
211
+ kwargs['frequency_penalty'] = get_input_scalar_by_name(
212
+ request, 'frequency_penalty', batch_size, batch_index)
213
+ kwargs['length_penalty'] = get_input_scalar_by_name(
214
+ request, 'len_penalty', batch_size, batch_index)
215
+ kwargs['top_p_min'] = get_input_scalar_by_name(request,
216
+ 'runtime_top_p_min',
217
+ batch_size, batch_index)
218
+ kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
219
+ request, 'runtime_top_p_reset_ids', batch_size, batch_index)
220
+ kwargs['top_p_decay'] = get_input_scalar_by_name(request,
221
+ 'runtime_top_p_decay',
222
+ batch_size, batch_index)
223
+ kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
224
+ request, 'beam_search_diversity_rate', batch_size, batch_index)
225
+ kwargs['early_stopping'] = get_input_scalar_by_name(
226
+ request, 'early_stopping', batch_size, batch_index)
227
+ kwargs['num_return_sequences'] = get_input_scalar_by_name(
228
+ request, 'num_return_sequences', batch_size, batch_index) or 1
229
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
230
+ return trtllm.SamplingConfig(**kwargs)
231
+
232
+
233
+ def get_output_config_from_request(request, batch_size=1, batch_index=0):
234
+ kwargs = {}
235
+ kwargs["return_log_probs"] = get_input_scalar_by_name(
236
+ request, 'return_log_probs', batch_size, batch_index)
237
+ kwargs["return_context_logits"] = get_input_scalar_by_name(
238
+ request, 'return_context_logits', batch_size, batch_index)
239
+ kwargs["return_generation_logits"] = get_input_scalar_by_name(
240
+ request, 'return_generation_logits', batch_size, batch_index)
241
+ kwargs["return_perf_metrics"] = get_input_scalar_by_name(
242
+ request, 'return_kv_cache_reuse_stats', batch_size, batch_index)
243
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
244
+ return trtllm.OutputConfig(**kwargs)
245
+
246
+
247
+ def get_external_draft_tokens_config_from_request(request,
248
+ batch_size=1,
249
+ batch_index=0):
250
+ kwargs = {}
251
+ draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids',
252
+ batch_size, batch_index)
253
+ if draft_input_ids is not None:
254
+ kwargs['tokens'] = draft_input_ids[0].tolist()
255
+ draft_logits = get_input_tensor_by_name(request, 'draft_logits',
256
+ batch_size, batch_index)
257
+ if draft_logits is not None:
258
+ kwargs['logits'] = from_numpy(draft_logits).squeeze(dim=0)
259
+ kwargs['acceptance_threshold'] = get_input_scalar_by_name(
260
+ request, 'draft_acceptance_threshold', batch_size, batch_index)
261
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
262
+ if len(kwargs) > 0:
263
+ return trtllm.ExternalDraftTokensConfig(**kwargs)
264
+ return None
265
+
266
+
267
+ def get_prompt_tuning_config_from_request(request,
268
+ batch_size=1,
269
+ batch_index=0,
270
+ input_length=0):
271
+ # prompt_vocab_size is unused by executor.
272
+ kwargs = {}
273
+ prompt_embedding_table = get_input_tensor_by_name(
274
+ request, 'prompt_embedding_table', batch_size, batch_index)
275
+ prompt_table_extra_ids = get_input_tensor_by_name(
276
+ request, 'prompt_table_extra_ids', batch_size, batch_index)
277
+ if prompt_embedding_table is not None:
278
+ if isinstance(prompt_embedding_table, np.ndarray):
279
+ kwargs["embedding_table"] = from_numpy(
280
+ prompt_embedding_table).squeeze(dim=0)
281
+ elif isinstance(prompt_embedding_table, torch.Tensor):
282
+ kwargs["embedding_table"] = prompt_embedding_table.squeeze(dim=0)
283
+
284
+ if prompt_table_extra_ids is not None:
285
+ prompt_table_extra_ids = prompt_table_extra_ids[0].tolist()
286
+ if len(prompt_table_extra_ids) != 0:
287
+ kwargs["input_token_extra_ids"] = prompt_table_extra_ids[
288
+ 0:input_length]
289
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
290
+ if len(kwargs) > 0:
291
+ return trtllm.PromptTuningConfig(**kwargs)
292
+ return None
293
+
294
+
295
+ def get_lora_config_from_request(request, batch_size=1, batch_index=0):
296
+ kwargs = {}
297
+ kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id',
298
+ batch_size, batch_index)
299
+ lora_weights = get_input_tensor_by_name(request, 'lora_weights',
300
+ batch_size, batch_index)
301
+ if lora_weights is not None:
302
+ kwargs["weights"] = from_numpy(lora_weights).squeeze(dim=0)
303
+ lora_config = get_input_tensor_by_name(request, 'lora_config', batch_size,
304
+ batch_index)
305
+ if lora_config is not None:
306
+ kwargs["config"] = from_numpy(lora_config).squeeze(dim=0)
307
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
308
+ if len(kwargs) > 0:
309
+ return trtllm.LoraConfig(**kwargs)
310
+ return None
311
+
312
+
313
+ def get_guided_decoding_params_from_request(request,
314
+ batch_size=1,
315
+ batch_index=0):
316
+ kwargs = {}
317
+ guided_decoding_guide_type = get_input_tensor_by_name(
318
+ request, 'guided_decoding_guide_type', batch_size, batch_index)
319
+ if guided_decoding_guide_type is not None:
320
+ guided_decoding_guide_type = guided_decoding_guide_type.squeeze(
321
+ axis=0)[0].decode()
322
+ guided_decoding_guide_type_mapping = {
323
+ "json": trtllm.GuidedDecodingParams.GuideType.JSON,
324
+ "json_schema": trtllm.GuidedDecodingParams.GuideType.JSON_SCHEMA,
325
+ "regex": trtllm.GuidedDecodingParams.GuideType.REGEX,
326
+ "ebnf_grammar": trtllm.GuidedDecodingParams.GuideType.EBNF_GRAMMAR
327
+ }
328
+ guided_decoding_guide_type = guided_decoding_guide_type_mapping.get(
329
+ guided_decoding_guide_type)
330
+ kwargs['guide_type'] = guided_decoding_guide_type
331
+
332
+ guided_decoding_guide = get_input_tensor_by_name(request,
333
+ 'guided_decoding_guide',
334
+ batch_size, batch_index)
335
+ if guided_decoding_guide is not None:
336
+ kwargs['guide'] = guided_decoding_guide.squeeze(axis=0)[0].decode()
337
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
338
+ if len(kwargs) > 0:
339
+ return trtllm.GuidedDecodingParams(**kwargs)
340
+ return None
341
+
342
+
343
+ def get_kv_cache_retention_config_from_request(request,
344
+ batch_size=1,
345
+ batch_index=0):
346
+
347
+ def get_tensor_and_check_length(name: str, expected_length: int):
348
+ tensor = get_input_tensor_by_name(request, name, batch_size,
349
+ batch_index)
350
+
351
+ if tensor is None:
352
+ raise RuntimeError(f"{name} must be provided.")
353
+
354
+ tensor = np.squeeze(tensor, axis=0)
355
+
356
+ if len(tensor) != expected_length:
357
+ raise RuntimeError(
358
+ f"Invalid {name} length. Expected length {expected_length}, got length {len(tensor)}"
359
+ )
360
+
361
+ return tensor
362
+
363
+ token_range_starts = get_input_tensor_by_name(
364
+ request, "retention_token_range_starts", batch_size, batch_index)
365
+
366
+ if token_range_starts is not None:
367
+ token_range_starts = np.squeeze(token_range_starts, axis=0)
368
+
369
+ token_range_ends = get_tensor_and_check_length(
370
+ "retention_token_range_ends", len(token_range_starts))
371
+ token_range_ends = [
372
+ None if end == -1 else end for end in token_range_ends
373
+ ]
374
+
375
+ token_range_priorities = get_tensor_and_check_length(
376
+ "retention_token_range_priorities", len(token_range_starts))
377
+
378
+ token_range_durations_ms = get_input_tensor_by_name(
379
+ request, "retention_token_range_durations_ms", batch_size,
380
+ batch_index)
381
+
382
+ if token_range_durations_ms is None:
383
+ token_range_durations_ms = [None] * len(token_range_starts)
384
+ else:
385
+ token_range_durations_ms = np.squeeze(token_range_durations_ms,
386
+ axis=0)
387
+ token_range_durations_ms = [
388
+ None if duration == -1 else duration
389
+ for duration in token_range_durations_ms
390
+ ]
391
+
392
+ if len(token_range_durations_ms) != len(token_range_starts):
393
+ raise RuntimeError(
394
+ f"Invalid retention_token_range_durations length. Expected length {len(token_range_starts)}, got length {len(token_range_durations_ms)}"
395
+ )
396
+
397
+ ranges = []
398
+
399
+ for start, end, priority, duration_ms in zip(token_range_starts,
400
+ token_range_ends,
401
+ token_range_priorities,
402
+ token_range_durations_ms):
403
+ ranges.append(
404
+ trtllm.KvCacheRetentionConfig.TokenRangeRetentionConfig(
405
+ token_start=start,
406
+ token_end=end,
407
+ priority=priority.item(),
408
+ duration_ms=None if duration_ms is None else
409
+ datetime.timedelta(milliseconds=duration_ms.item())))
410
+
411
+ decode_args = {}
412
+
413
+ decode_priority = get_input_scalar_by_name(
414
+ request, "retention_decode_priority", batch_size, batch_index)
415
+ if decode_priority is not None:
416
+ decode_args['decode_retention_priority'] = decode_priority
417
+
418
+ decode_duration_ms = get_input_scalar_by_name(
419
+ request, "retention_decode_duration_ms", batch_size, batch_index)
420
+ if decode_duration_ms is not None:
421
+ decode_args[
422
+ 'decode_duration_ms'] = decode_duration_ms if decode_duration_ms != -1 else None
423
+
424
+ return trtllm.KvCacheRetentionConfig(
425
+ token_range_retention_configs=ranges, **decode_args)
426
+
427
+ return None
428
+
429
+
430
+ def build_1_2_5_buckets(max_value: int) -> List[int]:
431
+ """
432
+ Builds a list of buckets with increasing powers of 10 multiplied by
433
+ mantissa values (1, 5), starting from 10 until the value exceeds
434
+ the specified maximum.
435
+
436
+ Example:
437
+ >>> build_1_2_5_buckets(1000)
438
+ [10, 50, 100, 500, 1000]
439
+ """
440
+ mantissa_lst = [1, 5]
441
+ exponent = 1 # Start from exponent 1 instead of 0
442
+ buckets: List[int] = []
443
+ while True:
444
+ for m in mantissa_lst:
445
+ value = m * 10**exponent
446
+ if value <= max_value:
447
+ buckets.append(value)
448
+ else:
449
+ return buckets
450
+ exponent += 1
451
+
452
+
453
+ def convert_request(request, exclude_input_from_output, decoupled):
454
+ inputs = {}
455
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids')
456
+ if input_token_ids is None:
457
+ raise pb_utils.TritonModelException(
458
+ "A value is required for input_ids")
459
+ if len(input_token_ids.shape) != 2:
460
+ raise pb_utils.TritonModelException(f"Invalid format for input_ids")
461
+ batch_size = input_token_ids.shape[0]
462
+ requests = []
463
+ for batch_index in range(0, batch_size):
464
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids',
465
+ batch_size, batch_index)[0]
466
+ if input_token_ids is None:
467
+ raise pb_utils.TritonModelException(
468
+ "A value is required for input_ids")
469
+ input_token_ids = input_token_ids.tolist()
470
+ if len(input_token_ids) == 0:
471
+ raise pb_utils.TritonModelException(
472
+ f"Invalid format for input_ids")
473
+
474
+ input_length = get_input_scalar_by_name(request, 'input_lengths',
475
+ batch_size, batch_index)
476
+ if input_length is None:
477
+ input_length = len(input_token_ids)
478
+ # Trim input token ids with input_lengths
479
+ inputs['input_token_ids'] = input_token_ids[0:input_length]
480
+ inputs['max_new_tokens'] = get_input_scalar_by_name(
481
+ request, 'request_output_len', batch_size, batch_index)
482
+ if inputs['max_new_tokens'] is None:
483
+ raise pb_utils.TritonModelException(
484
+ "A value is required for request_output_len")
485
+ inputs['streaming'] = get_input_scalar_by_name(request, 'streaming',
486
+ batch_size, batch_index)
487
+ if inputs['streaming'] and not decoupled:
488
+ raise pb_utils.TritonModelException(
489
+ "Streaming is only supported in decoupled mode.")
490
+
491
+ inputs['end_id'] = get_input_scalar_by_name(request, 'end_id',
492
+ batch_size, batch_index)
493
+ inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id',
494
+ batch_size, batch_index)
495
+ inputs['stop_words'] = convert_word_list(
496
+ get_input_tensor_by_name(request, 'stop_words_list', batch_size,
497
+ batch_index))
498
+ inputs['bad_words'] = convert_word_list(
499
+ get_input_tensor_by_name(request, 'bad_words_list', batch_size,
500
+ batch_index))
501
+ embedding_bias = get_input_tensor_by_name(request, 'embedding_bias',
502
+ batch_size, batch_index)
503
+ if embedding_bias is not None and embedding_bias.size != 0:
504
+ inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze(
505
+ dim=0)
506
+
507
+ sampling_config = get_sampling_config_from_request(
508
+ request, batch_size, batch_index)
509
+ output_config = get_output_config_from_request(request, batch_size,
510
+ batch_index)
511
+ req_exclude_input_from_output = get_input_scalar_by_name(
512
+ request, 'exclude_input_in_output', batch_size, batch_index)
513
+ if req_exclude_input_from_output is None:
514
+ # if request doesn't specify exclude_input_from_output, try to use the parameter
515
+ output_config.exclude_input_from_output = (
516
+ exclude_input_from_output
517
+ if exclude_input_from_output is not None else False)
518
+ else:
519
+ output_config.exclude_input_from_output = req_exclude_input_from_output
520
+
521
+ external_draft_tokens_config = get_external_draft_tokens_config_from_request(
522
+ request, batch_size, batch_index)
523
+ prompt_tuning_config = get_prompt_tuning_config_from_request(
524
+ request, batch_size, batch_index, input_length)
525
+ lora_config = get_lora_config_from_request(request, batch_size,
526
+ batch_index)
527
+ kv_cache_retention_config = get_kv_cache_retention_config_from_request(
528
+ request, batch_size, batch_index)
529
+
530
+ # Inputs for mllama support
531
+ encoder_input_features = get_input_tensor_by_name(
532
+ request, 'encoder_input_features', batch_size, batch_index)
533
+ if encoder_input_features is not None:
534
+ if isinstance(encoder_input_features, np.ndarray):
535
+ encoder_input_features = from_numpy(
536
+ encoder_input_features).squeeze(dim=0)
537
+ elif isinstance(encoder_input_features, torch.Tensor):
538
+ encoder_input_features = encoder_input_features.squeeze(dim=0)
539
+ inputs['encoder_input_features'] = encoder_input_features
540
+ logger.debug(
541
+ f"inputs to llm: encoder_input_features ({encoder_input_features.shape}"
542
+ )
543
+
544
+ encoder_output_length = get_input_tensor_by_name(
545
+ request, 'encoder_output_lengths', batch_size, batch_index)
546
+ if encoder_output_length is not None:
547
+ inputs['encoder_output_length'] = np.squeeze(
548
+ encoder_output_length, axis=0)
549
+
550
+ cross_attention_mask = get_input_tensor_by_name(
551
+ request, 'cross_attention_mask', batch_size, batch_index)
552
+ if cross_attention_mask is not None:
553
+ inputs['cross_attention_mask'] = cross_attention_mask[0]
554
+ logger.debug(
555
+ f"inputs to llm: cross_attention_mask ({ cross_attention_mask.shape})"
556
+ )
557
+
558
+ skip_cross_attn_blocks = get_input_tensor_by_name(
559
+ request,
560
+ 'skip_cross_attn_blocks',
561
+ batch_size,
562
+ batch_index,
563
+ force_on_torch=True)
564
+ if skip_cross_attn_blocks is not None:
565
+ inputs['skip_cross_attn_blocks'] = skip_cross_attn_blocks[0]
566
+ logger.debug(
567
+ f"inputs to llm: skip_cross_attn_blocks ({ skip_cross_attn_blocks.shape})"
568
+ )
569
+
570
+ guided_decoding_params = get_guided_decoding_params_from_request(
571
+ request, batch_size, batch_index)
572
+
573
+ requests.append(
574
+ trtllm.Request(
575
+ **inputs,
576
+ sampling_config=sampling_config,
577
+ output_config=output_config,
578
+ external_draft_tokens_config=external_draft_tokens_config,
579
+ prompt_tuning_config=prompt_tuning_config,
580
+ lora_config=lora_config,
581
+ guided_decoding_params=guided_decoding_params,
582
+ kv_cache_retention_config=kv_cache_retention_config))
583
+ return requests
584
+
585
+
586
+ def convert_response(response,
587
+ batch_index,
588
+ batch_size,
589
+ num_return_sequences,
590
+ expected_logits_dtype=torch.float32):
591
+
592
+ if response.has_error():
593
+ return pb_utils.InferenceResponse(output_tensors=[],
594
+ error=pb_utils.TritonError(
595
+ response.error_msg)), True, 0
596
+ result = response.result
597
+ beam_lengths = np.expand_dims(
598
+ np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
599
+ max_beam_length = max([len(beam) for beam in result.output_token_ids])
600
+ output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
601
+ -1, np.int32)
602
+ for idx, beam in enumerate(result.output_token_ids):
603
+ output_ids[0, idx, :len(beam)] = beam
604
+
605
+ output_lengths = output_ids.size
606
+ output_tensors = [
607
+ pb_utils.Tensor("output_ids", output_ids),
608
+ pb_utils.Tensor("sequence_length", beam_lengths),
609
+ ]
610
+
611
+ if result.cum_log_probs is not None:
612
+ output_tensors.append(
613
+ pb_utils.Tensor(
614
+ "cum_log_probs",
615
+ np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)))
616
+
617
+ if result.log_probs is not None:
618
+ output_tensors.append(
619
+ pb_utils.Tensor(
620
+ "output_log_probs",
621
+ np.expand_dims(np.array(result.log_probs, np.float32), 0)))
622
+
623
+ if result.context_logits is not None:
624
+ assert (result.context_logits.dtype is expected_logits_dtype)
625
+ output_tensors.append(
626
+ pb_utils.Tensor(
627
+ "context_logits",
628
+ np.expand_dims(
629
+ np.array(
630
+ result.context_logits, torch_to_numpy_dtype_dict[
631
+ result.context_logits.dtype]), 0)))
632
+
633
+ if result.generation_logits is not None:
634
+ assert (result.generation_logits.dtype is expected_logits_dtype)
635
+ output_tensors.append(
636
+ pb_utils.Tensor(
637
+ "generation_logits",
638
+ np.expand_dims(
639
+ np.array(
640
+ result.generation_logits, torch_to_numpy_dtype_dict[
641
+ result.generation_logits.dtype]), 0)))
642
+
643
+ if batch_size > 1:
644
+ output_tensors.append(
645
+ pb_utils.Tensor(
646
+ "batch_index",
647
+ np.expand_dims(np.array([batch_index], np.int32), 0)))
648
+
649
+ if num_return_sequences > 1:
650
+ output_tensors.append(
651
+ pb_utils.Tensor(
652
+ "sequence_index",
653
+ np.expand_dims(np.array([result.sequence_index], np.int32),
654
+ 0)))
655
+
656
+ if result.request_perf_metrics is not None:
657
+ kv_cache_metrics = result.request_perf_metrics.kv_cache_metrics
658
+ output_tensors.append(
659
+ pb_utils.Tensor(
660
+ "kv_cache_alloc_new_blocks",
661
+ np.expand_dims(
662
+ np.array([kv_cache_metrics.num_new_allocated_blocks],
663
+ np.int32), 0)))
664
+ output_tensors.append(
665
+ pb_utils.Tensor(
666
+ "kv_cache_reused_blocks",
667
+ np.expand_dims(
668
+ np.array([kv_cache_metrics.num_reused_blocks], np.int32),
669
+ 0)))
670
+ output_tensors.append(
671
+ pb_utils.Tensor(
672
+ "kv_cache_alloc_total_blocks",
673
+ np.expand_dims(
674
+ np.array([kv_cache_metrics.num_total_allocated_blocks],
675
+ np.int32), 0)))
676
+
677
+ return pb_utils.InferenceResponse(
678
+ output_tensors), result.is_final, output_lengths
679
+
680
+
681
+ def convert_scheduler_policy(batch_scheduler_policy: str):
682
+ if batch_scheduler_policy.lower() == "max_utilization":
683
+ return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
684
+ elif batch_scheduler_policy.lower() == "guaranteed_no_evict":
685
+ return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
686
+ raise pb_utils.TritonModelException(
687
+ f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
688
+ )
689
+
690
+
691
+ def convert_batching_type(gpt_model_type: str):
692
+ if gpt_model_type is None:
693
+ return None
694
+ if gpt_model_type.lower(
695
+ ) == "inflight_fused_batching" or gpt_model_type.lower(
696
+ ) == "inflight_batching":
697
+ return trtllm.BatchingType.INFLIGHT
698
+ elif gpt_model_type.lower() == "v1":
699
+ return trtllm.BatchingType.STATIC
700
+ raise pb_utils.TritonModelException(
701
+ f"gpt_model_type value of '{gpt_model_type}' is not supported.")
702
+
703
+
704
+ def convert_decoding_mode(decoding_mode: str):
705
+ if decoding_mode is None:
706
+ return None
707
+ elif decoding_mode == "auto":
708
+ return trtllm.DecodingMode.Auto()
709
+ elif decoding_mode == "top_k":
710
+ return trtllm.DecodingMode.TopK()
711
+ elif decoding_mode == "top_p":
712
+ return trtllm.DecodingMode.TopP()
713
+ elif decoding_mode == "top_k_top_p":
714
+ return trtllm.DecodingMode.TopKTopP()
715
+ elif decoding_mode == "beam_search":
716
+ return trtllm.DecodingMode.BeamSearch()
717
+ elif decoding_mode == "medusa":
718
+ return trtllm.DecodingMode.Medusa()
719
+ elif decoding_mode == "redrafter":
720
+ return trtllm.DecodingMode.ExplicitDraftTokens()
721
+ elif decoding_mode == "lookahead":
722
+ return trtllm.DecodingMode.Lookahead()
723
+ elif decoding_mode == "eagle":
724
+ return trtllm.DecodingMode.Eagle()
725
+ raise pb_utils.TritonModelException(
726
+ f"decoding_mode value of '{decoding_mode}' is not supported.")
727
+
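A quick illustration (a sketch, not part of the model file): the strings handled by the three converters above are the same parameter values that appear in the tensorrt_llm config.pbtxt later in this commit, and each maps onto a trtllm executor enum. Assuming the same `trtllm` bindings imported at the top of this file:

    # Illustrative only; the argument strings mirror the config.pbtxt parameters below.
    assert convert_scheduler_policy("guaranteed_no_evict") == trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
    assert convert_batching_type("inflight_fused_batching") == trtllm.BatchingType.INFLIGHT
    mode = convert_decoding_mode("top_k_top_p")  # -> trtllm.DecodingMode.TopKTopP()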
728
+
729
+ def convert_timestamp_to_seconds(timestamp: str):
730
+ return int(
731
+ datetime.datetime.strptime(timestamp,
732
+ "%m-%d-%Y %H:%M:%S.%f").timestamp())
733
+
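For reference, a minimal sketch of the timestamp layout this helper expects (hypothetical value; the result is epoch seconds in the server's local timezone):

    import datetime

    ts = "01-15-2025 12:30:45.123456"  # matches "%m-%d-%Y %H:%M:%S.%f"
    print(int(datetime.datetime.strptime(ts, "%m-%d-%Y %H:%M:%S.%f").timestamp()))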
734
+
735
+ def triton_string_to_torch(dtype):
736
+ type_map = {
737
+ "TYPE_BOOL": torch.bool,
738
+ "TYPE_UINT8": torch.uint8,
739
+ "TYPE_INT8": torch.int8,
740
+ "TYPE_INT16": torch.int16,
741
+ "TYPE_INT32": torch.int32,
742
+ "TYPE_INT64": torch.int64,
743
+ "TYPE_FP16": torch.float16,
744
+ "TYPE_FP32": torch.float32,
745
+ "TYPE_FP64": torch.float64,
746
+ "TYPE_BF16": torch.bfloat16
747
+ }
748
+ return type_map[dtype]
749
+
750
+
751
+ class TritonPythonModel:
752
+ """Your Python model must use the same class name. Every Python model
753
+ that is created must have "TritonPythonModel" as the class name.
754
+ """
755
+
756
+ def get_scheduler_config(self, model_config):
757
+ batch_scheduler_policy = get_parameter(model_config,
758
+ "batch_scheduler_policy")
759
+ if batch_scheduler_policy is None:
760
+ return trtllm.SchedulerConfig()
761
+ return trtllm.SchedulerConfig(
762
+ convert_scheduler_policy(batch_scheduler_policy))
763
+
764
+ def get_kv_cache_config(self, model_config):
765
+ kwargs = {
766
+ "enable_block_reuse":
767
+ get_parameter(model_config, "enable_kv_cache_reuse", bool),
768
+ "max_tokens":
769
+ get_parameter(model_config, "max_tokens_in_paged_kv_cache", int),
770
+ "sink_token_length":
771
+ get_parameter(model_config, "sink_token_length", int),
772
+ "free_gpu_memory_fraction":
773
+ get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
774
+ float),
775
+ "cross_kv_cache_fraction":
776
+ get_parameter(model_config, "cross_kv_cache_fraction", float),
777
+ "host_cache_size":
778
+ get_parameter(model_config, "kv_cache_host_memory_bytes", int),
779
+ "onboard_blocks":
780
+ get_parameter(model_config, "kv_cache_onboard_blocks", bool),
781
+ }
782
+ max_attention_window_size = get_parameter(model_config,
783
+ "max_attention_window_size")
784
+ if max_attention_window_size:
785
+ kwargs["max_attention_window"] = [
786
+ int(x) for x in max_attention_window_size.split(",")
787
+ ]
788
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
789
+ return trtllm.KvCacheConfig(**kwargs)
790
+
791
+ def get_parallel_config(self, model_config):
792
+ kwargs = {}
793
+ gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
794
+ if gpu_device_ids:
795
+ kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
796
+ self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
797
+ "0") == "1"
798
+ if self.use_orchestrator_mode:
799
+ kwargs[
800
+ "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
801
+ worker_path = get_parameter(model_config, "worker_path")
802
+ spawn_processes = os.environ.get(
803
+ "TRTLLM_ORCHESTRATOR_SPAWN_PROCESSES", "1") == "1"
804
+ if not spawn_processes:
805
+ raise pb_utils.TritonModelException(
806
+ "Orchestrator mode with --disable-spawn-processes is not supported in the Python backend."
807
+ )
808
+ is_orchestrator = (mpi_rank() == 0) if spawn_processes else True
809
+ if worker_path is not None:
810
+ raise pb_utils.TritonModelException(
811
+ "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
812
+ )
813
+ executor_worker_path = get_parameter(model_config,
814
+ "executor_worker_path")
815
+ kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
816
+ is_orchestrator, executor_worker_path)
817
+ if len(kwargs) > 0:
818
+ return trtllm.ParallelConfig(**kwargs)
819
+ return None
820
+
821
+ def get_peft_cache_config(self, model_config):
822
+ kwargs = {
823
+ "optimal_adapter_size":
824
+ get_parameter(model_config, "lora_cache_optimal_adapter_size",
825
+ int),
826
+ "max_adapter_size":
827
+ get_parameter(model_config, "lora_cache_max_adapter_size", int),
828
+ "device_cache_percent":
829
+ get_parameter(model_config, "lora_cache_gpu_memory_fraction",
830
+ float),
831
+ "host_cache_size":
832
+ get_parameter(model_config, "lora_cache_host_memory_bytes", int),
833
+ }
834
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
835
+ return trtllm.PeftCacheConfig(**kwargs)
836
+
837
+ def get_decoding_config(self, model_config):
838
+ eagle_choices = parse_eagle_choices(
839
+ get_parameter(model_config, "eagle_choices"))
840
+ kwargs = {
841
+ "medusa_choices":
842
+ parse_medusa_choices(get_parameter(model_config,
843
+ "medusa_choices")),
844
+ "eagle_config":
845
+ None
846
+ if eagle_choices is None else trtllm.EagleConfig(eagle_choices),
847
+ "decoding_mode":
848
+ convert_decoding_mode(get_parameter(model_config,
849
+ "decoding_mode")),
850
+ }
851
+         pb_utils.Logger.log_info(f"Decoding config: {kwargs}")
852
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
853
+ return trtllm.DecodingConfig(**kwargs)
854
+
855
+ def get_extended_runtime_perf_knob_config(self, model_config):
856
+ kwargs = {
857
+ "multi_block_mode":
858
+ get_parameter(model_config, "multi_block_mode", bool),
859
+ "enable_context_fmha_fp32_acc":
860
+ get_parameter(model_config, "enable_context_fmha_fp32_acc", bool),
861
+ "cuda_graph_mode":
862
+ get_parameter(model_config, "cuda_graph_mode", bool),
863
+ "cuda_graph_cache_size":
864
+ get_parameter(model_config, "cuda_graph_cache_size", int),
865
+ }
866
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
867
+ return trtllm.ExtendedRuntimePerfKnobConfig(**kwargs)
868
+
869
+ def get_guided_decoding_config(self, model_config):
870
+
871
+ guided_decoding_backend = get_parameter(model_config,
872
+ "guided_decoding_backend", str)
873
+
874
+ tokenizer_dir = get_parameter(model_config, "tokenizer_dir", str)
875
+ if guided_decoding_backend not in ['xgrammar']:
876
+ if tokenizer_dir:
877
+ pb_utils.Logger.log_warn(
878
+                 "Guided decoding backend has not been set, but tokenizer_dir is given; tokenizer_dir will be ignored."
879
+ )
880
+ return None
881
+
882
+ if guided_decoding_backend == 'xgrammar':
883
+ guided_decoding_backend = trtllm.GuidedDecodingConfig.GuidedDecodingBackend.XGRAMMAR
884
+
885
+ if not tokenizer_dir:
886
+ raise ValueError(
887
+ "Guided decoding requires tokenizer's information. Please provide 'tokenizer_dir'."
888
+ )
889
+ from transformers import AutoTokenizer
890
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
891
+ pb_utils.Logger.log_info(
892
+ f"Guided decoding has been set with {guided_decoding_backend} backend"
893
+ )
894
+ return trtllm.GuidedDecodingConfig(
895
+ backend=guided_decoding_backend,
896
+ **_xgrammar_tokenizer_info(tokenizer))
897
+
898
+ def get_executor_config(self, model_config):
899
+ kwargs = {
900
+ "max_beam_width":
901
+ get_parameter(model_config, "max_beam_width", int),
902
+ "scheduler_config":
903
+ self.get_scheduler_config(model_config),
904
+ "kv_cache_config":
905
+ self.get_kv_cache_config(model_config),
906
+ "enable_chunked_context":
907
+ get_parameter(model_config, "enable_chunked_context", bool),
908
+ "normalize_log_probs":
909
+ get_parameter(model_config, "normalize_log_probs", bool),
910
+ "batching_type":
911
+ convert_batching_type(get_parameter(model_config,
912
+ "gpt_model_type")),
913
+ "parallel_config":
914
+ self.get_parallel_config(model_config),
915
+ "peft_cache_config":
916
+ self.get_peft_cache_config(model_config),
917
+ "decoding_config":
918
+ self.get_decoding_config(model_config),
919
+ "max_queue_size":
920
+ model_config.get(
921
+ "dynamic_batching",
922
+ {},
923
+ ).get(
924
+ "default_queue_policy",
925
+ {},
926
+ ).get("max_queue_size"),
927
+ "extended_runtime_perf_knob_config":
928
+ self.get_extended_runtime_perf_knob_config(model_config),
929
+ "guided_decoding_config":
930
+ self.get_guided_decoding_config(model_config)
931
+ }
932
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
933
+ return trtllm.ExecutorConfig(**kwargs)
934
+
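Note that `max_queue_size` above is read directly from the Triton model config (the dynamic_batching section of config.pbtxt) rather than via `get_parameter`; a small sketch of that nested lookup with a hypothetical config dict:

    model_config = {
        "dynamic_batching": {"default_queue_policy": {"max_queue_size": 32}}
    }
    max_queue_size = model_config.get("dynamic_batching", {}).get(
        "default_queue_policy", {}).get("max_queue_size")
    print(max_queue_size)  # 32; None when no queue policy is configured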
935
+ def create_metrics(self, model: str, version: str, is_v1_model: bool):
936
+ self.request_metric_family = pb_utils.MetricFamily(
937
+ name="nv_trt_llm_request_metrics",
938
+ description="TRT LLM request metrics",
939
+ kind=pb_utils.MetricFamily.GAUGE,
940
+ )
941
+ self.runtime_memory_metric_family = pb_utils.MetricFamily(
942
+ name="nv_trt_llm_runtime_memory_metrics",
943
+ description="TRT LLM runtime memory metrics",
944
+ kind=pb_utils.MetricFamily.GAUGE,
945
+ )
946
+ self.kv_cache_metric_family = pb_utils.MetricFamily(
947
+ name="nv_trt_llm_kv_cache_block_metrics",
948
+ description="TRT LLM KV cache block metrics",
949
+ kind=pb_utils.MetricFamily.GAUGE,
950
+ )
951
+ model_type = "v1" if is_v1_model else "inflight_batcher"
952
+ self.model_type_metric_family = pb_utils.MetricFamily(
953
+ name=f"nv_trt_llm_{model_type}_metrics",
954
+ description=f"TRT LLM {model_type}-specific metrics",
955
+ kind=pb_utils.MetricFamily.GAUGE,
956
+ )
957
+ self.general_metric_family = pb_utils.MetricFamily(
958
+ name="nv_trt_llm_general_metrics",
959
+ description="General TRT LLM metrics",
960
+ kind=pb_utils.MetricFamily.GAUGE,
961
+ )
962
+         # The token-count histogram families below are observed per request in update_metrics_per_request().
963
+ self.request_tokens_metric_family = pb_utils.MetricFamily(
964
+ name="nv_llm_input_token_len",
965
+             description="TRT LLM request metrics",
966
+ kind=pb_utils.MetricFamily.HISTOGRAM,
967
+ )
968
+ self.response_tokens_metric_family = pb_utils.MetricFamily(
969
+ name="nv_llm_output_token_len",
970
+ description="TRT LLM response metrics",
971
+ kind=pb_utils.MetricFamily.HISTOGRAM,
972
+ )
973
+ common_labels = {"model": model, "version": version}
974
+ self.all_metrics = {
975
+ # Request metrics
976
+ "num_active_requests":
977
+ self.request_metric_family.Metric(labels={
978
+ "request_type": "active",
979
+ **common_labels
980
+ }),
981
+ "max_num_active_requests":
982
+ self.request_metric_family.Metric(labels={
983
+ "request_type": "max",
984
+ **common_labels
985
+ }),
986
+ "num_scheduled_requests":
987
+ self.request_metric_family.Metric(labels={
988
+ "request_type": "scheduled",
989
+ **common_labels
990
+ }),
991
+ "num_context_requests":
992
+ self.request_metric_family.Metric(labels={
993
+ "request_type": "context",
994
+ **common_labels
995
+ }),
996
+ # Runtime metrics
997
+ "cpu_mem_usage":
998
+ self.runtime_memory_metric_family.Metric(labels={
999
+ "memory_type": "cpu",
1000
+ **common_labels
1001
+ }),
1002
+ "gpu_mem_usage":
1003
+ self.runtime_memory_metric_family.Metric(labels={
1004
+ "memory_type": "gpu",
1005
+ **common_labels
1006
+ }),
1007
+ "pinned_mem_usage":
1008
+ self.runtime_memory_metric_family.Metric(labels={
1009
+ "memory_type": "pinned",
1010
+ **common_labels
1011
+ }),
1012
+ # KV cache metrics
1013
+ "max_num_blocks":
1014
+ self.kv_cache_metric_family.Metric(labels={
1015
+ "kv_cache_block_type": "max",
1016
+ **common_labels
1017
+ }),
1018
+ "free_num_blocks":
1019
+ self.kv_cache_metric_family.Metric(labels={
1020
+ "kv_cache_block_type": "free",
1021
+ **common_labels
1022
+ }),
1023
+ "used_num_blocks":
1024
+ self.kv_cache_metric_family.Metric(labels={
1025
+ "kv_cache_block_type": "used",
1026
+ **common_labels
1027
+ }),
1028
+ "tokens_per_block":
1029
+ self.kv_cache_metric_family.Metric(labels={
1030
+ "kv_cache_block_type": "tokens_per",
1031
+ **common_labels
1032
+ }),
1033
+ # General metrics
1034
+ "timestamp":
1035
+ self.general_metric_family.Metric(labels={
1036
+ "general_type": "timestamp",
1037
+ **common_labels
1038
+ }),
1039
+ "iter":
1040
+ self.general_metric_family.Metric(labels={
1041
+ "general_type": "iteration_counter",
1042
+ **common_labels
1043
+ }),
1044
+ METRIC_TOTAL_OUTPUT_TOKENS:
1045
+ self.response_tokens_metric_family.Metric(
1046
+ labels={
1047
+ "response_metric_type": METRIC_TOTAL_OUTPUT_TOKENS,
1048
+ **common_labels
1049
+ },
1050
+ buckets=build_1_2_5_buckets(1000)),
1051
+ METRIC_TOTAL_INPUT_TOKENS:
1052
+ self.request_tokens_metric_family.Metric(
1053
+ labels={
1054
+ "response_metric_type": METRIC_TOTAL_INPUT_TOKENS,
1055
+ **common_labels
1056
+ },
1057
+ buckets=build_1_2_5_buckets(1000)),
1058
+ }
1059
+ if is_v1_model:
1060
+ self.all_metrics.update({
1061
+ "num_ctx_tokens":
1062
+ self.model_type_metric_family.Metric(labels={
1063
+ "v1_specific_metric": "total_context_tokens",
1064
+ **common_labels
1065
+ }),
1066
+ "num_gen_tokens":
1067
+ self.model_type_metric_family.Metric(
1068
+ labels={
1069
+ "v1_specific_metric": "total_generation_tokens",
1070
+ **common_labels
1071
+ }),
1072
+ "empty_gen_slots":
1073
+ self.model_type_metric_family.Metric(
1074
+ labels={
1075
+ "v1_specific_metric": "empty_generation_slots",
1076
+ **common_labels
1077
+ }),
1078
+ })
1079
+ else:
1080
+ self.all_metrics.update({
1081
+ "num_ctx_tokens":
1082
+ self.model_type_metric_family.Metric(
1083
+ labels={
1084
+ "inflight_batcher_specific_metric":
1085
+ "total_context_tokens",
1086
+ **common_labels
1087
+ }),
1088
+ "num_gen_requests":
1089
+ self.model_type_metric_family.Metric(
1090
+ labels={
1091
+ "inflight_batcher_specific_metric":
1092
+ "generation_requests",
1093
+ **common_labels
1094
+ }),
1095
+ "micro_batch_id":
1096
+ self.model_type_metric_family.Metric(
1097
+ labels={
1098
+ "inflight_batcher_specific_metric": "micro_batch_id",
1099
+ **common_labels
1100
+ }),
1101
+ "num_paused_requests":
1102
+ self.model_type_metric_family.Metric(
1103
+ labels={
1104
+ "inflight_batcher_specific_metric": "paused_requests",
1105
+ **common_labels
1106
+ }),
1107
+ })
1108
+
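Once the server is running, these families are exposed on Triton's Prometheus endpoint; a rough scraping sketch (assumptions: the default metrics port 8002 and the `requests` package, neither of which is dictated by this model file):

    import requests

    text = requests.get("http://localhost:8002/metrics").text
    for line in text.splitlines():
        if line.startswith(("nv_trt_llm_", "nv_llm_")):
            print(line)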
1109
+ def initialize(self, args):
1110
+ """`initialize` is called only once when the model is being loaded.
1111
+ Implementing `initialize` function is optional. This function allows
1112
+ the model to initialize any state associated with this model.
1113
+
1114
+ Parameters
1115
+ ----------
1116
+ args : dict
1117
+ Both keys and values are strings. The dictionary keys and values are:
1118
+ * model_config: A JSON string containing the model configuration
1119
+ * model_instance_kind: A string containing model instance kind
1120
+ * model_instance_device_id: A string containing model instance device ID
1121
+ * model_repository: Model repository path
1122
+ * model_version: Model version
1123
+ * model_name: Model name
1124
+ """
1125
+ model_config = json.loads(args['model_config'])
1126
+ gpt_model_path = get_parameter(model_config, "gpt_model_path")
1127
+ if get_parameter(model_config, "enable_trt_overlap", bool):
1128
+ raise pb_utils.TritonModelException(
1129
+ f"enable_trt_overlap=true is not supported.")
1130
+ self.exclude_input_from_output = get_parameter(
1131
+ model_config, "exclude_input_in_output", bool)
1132
+ executor_config = self.get_executor_config(model_config)
1133
+ self.executor = trtllm.Executor(gpt_model_path,
1134
+ trtllm.ModelType.DECODER_ONLY,
1135
+ executor_config)
1136
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
1137
+ model_config)
1138
+ self.cancellation_check_period_ms = get_parameter(
1139
+ model_config, "cancellation_check_period_ms", int) or 100
1140
+ self.stats_check_period_ms = get_parameter(
1141
+ model_config, "stats_check_period_ms", int) or 100
1142
+
1143
+ self.logits_dtype = None
1144
+ for output in model_config['output']:
1145
+ if output['name'] == 'context_logits' or output[
1146
+ 'name'] == 'generation_logits':
1147
+ self.logits_dtype = triton_string_to_torch(output['data_type'])
1148
+
1149
+ self.create_metrics(args["model_name"],
1150
+ args["model_version"],
1151
+ is_v1_model=executor_config.batching_type ==
1152
+ trtllm.BatchingType.STATIC)
1153
+ self.triton_user_id_to_req_ids = {}
1154
+ self.triton_req_id_to_req_ids = {}
1155
+ self.req_id_to_request_data = {}
1156
+ self.lock = Lock()
1157
+ self.running = False
1158
+ self.awaiter_thread = Thread(target=self.awaiter_loop)
1159
+ self.cancellation_thread = Thread(target=self.cancellation_loop)
1160
+ self.metrics_thread = Thread(target=self.metrics_loop)
1161
+ if self.executor.can_enqueue_requests():
1162
+ self.running = True
1163
+ self.awaiter_thread.start()
1164
+ self.cancellation_thread.start()
1165
+ self.metrics_thread.start()
1166
+ else:
1167
+ # In leader mode, worker ranks will wait here until leader is done.
1168
+ self.executor.shutdown()
1169
+
1170
+ def handle_stop_request(self, triton_user_id, response_sender):
1171
+ if triton_user_id is None or triton_user_id == "":
1172
+ response_sender.send(
1173
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
1174
+ "A request id must be provided for request cancellation")),
1175
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
1176
+ return
1177
+
1178
+ with self.lock:
1179
+ if triton_user_id in self.triton_user_id_to_req_ids:
1180
+ req_ids = self.triton_user_id_to_req_ids[triton_user_id]
1181
+ for req_id in req_ids:
1182
+ self.executor.cancel_request(req_id)
1183
+
1184
+ response_sender.send(
1185
+ pb_utils.InferenceResponse(),
1186
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
1187
+
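Cancellation is triggered by sending a follow-up request that carries the same request id and `stop=True`; a rough client-side sketch (assumptions: the `tritonclient` package, a gRPC endpoint on localhost:8001, and "<original-request-id>" standing in for the id of the in-flight request):

    import numpy as np
    import tritonclient.grpc as grpcclient

    client = grpcclient.InferenceServerClient("localhost:8001")
    client.start_stream(callback=lambda result, error: None)
    stop = grpcclient.InferInput("stop", [1, 1], "BOOL")
    stop.set_data_from_numpy(np.array([[True]], dtype=bool))
    client.async_stream_infer("tensorrt_llm", [stop],
                              request_id="<original-request-id>")
    client.stop_stream()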
1188
+ def execute(self, requests):
1189
+ """`execute` must be implemented in every Python model. `execute`
1190
+ function receives a list of pb_utils.InferenceRequest as the only
1191
+ argument. This function is called when an inference is requested
1192
+ for this model.
1193
+
1194
+ Parameters
1195
+ ----------
1196
+ requests : list
1197
+ A list of pb_utils.InferenceRequest
1198
+
1199
+ Returns
1200
+ -------
1201
+ list
1202
+ A list of pb_utils.InferenceResponse. The length of this list must
1203
+ be the same as `requests`
1204
+ """
1205
+ if not self.executor.can_enqueue_requests():
1206
+ return
1207
+
1208
+ # Convert to executor requests.
1209
+
1210
+ triton_requests = []
1211
+ executor_requests = []
1212
+ batch_indices = []
1213
+ triton_user_ids = []
1214
+ triton_req_ids = []
1215
+
1216
+ for request in requests:
1217
+
1218
+ triton_user_id = request.request_id()
1219
+
1220
+ response_sender = request.get_response_sender()
1221
+ stop = get_input_scalar_by_name(request, 'stop')
1222
+
1223
+ if stop:
1224
+ self.handle_stop_request(triton_user_id, response_sender)
1225
+ else:
1226
+                 # Unique request id used to identify each Triton request
1227
+ triton_req_id = str(randint(0, sys.maxsize))
1228
+ self.triton_req_id_to_req_ids[triton_req_id] = set()
1229
+ if triton_user_id is not None and triton_user_id != "":
1230
+ self.triton_user_id_to_req_ids[triton_user_id] = set()
1231
+
1232
+ try:
1233
+ converted_reqs = convert_request(
1234
+ request, self.exclude_input_from_output,
1235
+ self.decoupled)
1236
+ except Exception as e:
1237
+ response_sender.send(
1238
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
1239
+ f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
1240
+ )),
1241
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
1242
+ else:
1243
+ for batch_index, converted_req in enumerate(
1244
+ converted_reqs):
1245
+ triton_requests.append(request)
1246
+ executor_requests.append(converted_req)
1247
+ triton_user_ids.append(triton_user_id)
1248
+ triton_req_ids.append(triton_req_id)
1249
+ batch_indices.append(batch_index)
1250
+
1251
+ with self.lock:
1252
+ request_ids = self.executor.enqueue_requests(executor_requests)
1253
+ for req_id, triton_req_id, triton_user_id, executor_request, triton_request, batch_index in zip(
1254
+ request_ids, triton_req_ids, triton_user_ids,
1255
+ executor_requests, triton_requests, batch_indices):
1256
+
1257
+ self.req_id_to_request_data[req_id] = RequestData(
1258
+ triton_req_id, triton_user_id, batch_index,
1259
+ len(batch_indices),
1260
+ executor_request.sampling_config.num_return_sequences, 0,
1261
+ 0, triton_request.get_response_sender())
1262
+ self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
1263
+ input_len = len(
1264
+ executor_request.input_token_ids
1265
+ ) if executor_request.input_token_ids is not None else 0
1266
+ self.req_id_to_request_data[
1267
+ req_id].num_input_tokens += input_len
1268
+ # This checks both request level and instance config level
1269
+                 if not executor_request.output_config.exclude_input_from_output and not executor_request.streaming:
1270
+ self.req_id_to_request_data[
1271
+ req_id].num_output_tokens -= self.req_id_to_request_data[
1272
+ req_id].num_input_tokens * executor_request.sampling_config.beam_width
1273
+ if triton_user_id is not None and triton_user_id != "":
1274
+ self.triton_user_id_to_req_ids[triton_user_id].add(req_id)
1275
+
1276
+ return None
1277
+
1278
+ def awaiter_loop(self):
1279
+ """Gets responses from executor and returns the results."""
1280
+ while self.running:
1281
+ for response in self.executor.await_responses(
1282
+ timeout=datetime.timedelta(milliseconds=1)):
1283
+ req_id = response.request_id
1284
+ request_data = None
1285
+ with self.lock:
1286
+ if req_id not in self.req_id_to_request_data:
1287
+ continue
1288
+ request_data = self.req_id_to_request_data[req_id]
1289
+
1290
+ triton_response, is_final, output_length = convert_response(
1291
+ response, request_data.batch_index,
1292
+ request_data.batch_size, request_data.num_return_sequences,
1293
+ self.logits_dtype)
1294
+ with self.lock:
1295
+ self.req_id_to_request_data[
1296
+ req_id].num_output_tokens += output_length
1297
+ triton_request_final = False
1298
+ if is_final:
1299
+ with self.lock:
1300
+ # Check if all executor requests part of that triton request are finished
1301
+ self.triton_req_id_to_req_ids[
1302
+ request_data.triton_req_id].remove(req_id)
1303
+ if len(self.triton_req_id_to_req_ids[
1304
+ request_data.triton_req_id]) == 0:
1305
+ pb_utils.Logger.log_info(
1306
+ f"DELETING Req id {req_id}, triton_req_id {request_data.triton_req_id} "
1307
+ )
1308
+ triton_request_final = True
1309
+ del self.triton_req_id_to_req_ids[
1310
+ request_data.triton_req_id]
1311
+ if request_data.triton_user_id is not None and request_data.triton_user_id != "":
1312
+ del self.triton_user_id_to_req_ids[
1313
+ request_data.triton_user_id]
1314
+ self.update_metrics_per_request(req_id)
1315
+ del self.req_id_to_request_data[req_id]
1316
+
1317
+ request_data.response_sender.send(
1318
+ triton_response,
1319
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
1320
+ if triton_request_final else 0)
1321
+
1322
+ def cancellation_loop(self):
1323
+ """Checks if any pending requests have been cancelled."""
1324
+ while self.running:
1325
+ time.sleep(self.cancellation_check_period_ms / 1000.0)
1326
+ with self.lock:
1327
+ for req_id, request_data in self.req_id_to_request_data.items(
1328
+ ):
1329
+ if request_data.response_sender.is_cancelled():
1330
+ self.executor.cancel_request(req_id)
1331
+
1332
+ def update_metrics_per_request(self, req_id):
1333
+ """Updates triton metrics after completing one request"""
1334
+ output_tokens = self.req_id_to_request_data[req_id].num_output_tokens
1335
+ input_tokens = self.req_id_to_request_data[req_id].num_input_tokens
1336
+
1337
+ self.all_metrics[METRIC_TOTAL_OUTPUT_TOKENS].observe(output_tokens)
1338
+ self.all_metrics[METRIC_TOTAL_INPUT_TOKENS].observe(input_tokens)
1339
+
1340
+ def metrics_loop(self):
1341
+ """Updates triton metrics using stats from the executor."""
1342
+ while self.running:
1343
+ time.sleep(self.stats_check_period_ms / 1000.0)
1344
+ for stat in self.executor.get_latest_iteration_stats():
1345
+ try:
1346
+ for key, metric in self.all_metrics.items():
1347
+ # Skip processing for both histogram metrics
1348
+ if isinstance(key, str) and key in [
1349
+ METRIC_TOTAL_OUTPUT_TOKENS,
1350
+ METRIC_TOTAL_INPUT_TOKENS
1351
+ ]:
1352
+ continue
1353
+ value = None
1354
+ if hasattr(stat, key):
1355
+ value = getattr(stat, key)
1356
+ elif stat.kv_cache_stats is not None and hasattr(
1357
+ stat.kv_cache_stats, key):
1358
+ value = getattr(stat.kv_cache_stats, key)
1359
+ elif stat.static_batching_stats is not None and hasattr(
1360
+ stat.static_batching_stats, key):
1361
+ value = getattr(stat.static_batching_stats, key)
1362
+ elif stat.inflight_batching_stats is not None and hasattr(
1363
+ stat.inflight_batching_stats, key):
1364
+ value = getattr(stat.inflight_batching_stats, key)
1365
+ if value is not None:
1366
+ if key == "timestamp":
1367
+ value = convert_timestamp_to_seconds(value)
1368
+ metric.set(value)
1369
+ else:
1370
+ pb_utils.Logger.log_warn(
1371
+ f"Metric \"{key}\" not found.")
1372
+ except Exception as e:
1373
+ pb_utils.Logger.log_warn(
1374
+ f"Error while processing metrics: {e}")
1375
+
1376
+ def finalize(self):
1377
+ """`finalize` is called only once when the model is being unloaded.
1378
+ Implementing `finalize` function is optional. This function allows
1379
+ the model to perform any necessary clean ups before exit.
1380
+ """
1381
+ if self.executor.can_enqueue_requests():
1382
+ self.running = False
1383
+ self.awaiter_thread.join()
1384
+ self.cancellation_thread.join()
1385
+ self.metrics_thread.join()
1386
+ self.executor.shutdown()
tensorrt_llm/1/rank0.engine ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7595e62baa9d736243148716820f6258fbc253d709a52778771f7593dfde37a6
3
+ size 36509691604
tensorrt_llm/1/rank1.engine ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce45fa6b73f60436052b12754ccf229b02c319b94fecafe52d513b05900cf244
3
+ size 36509692228
tensorrt_llm/config.pbtxt ADDED
@@ -0,0 +1,757 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm"
28
+ backend: "tensorrtllm"
29
+ max_batch_size: 32
30
+
31
+ model_transaction_policy {
32
+ decoupled: True
33
+ }
34
+
35
+ dynamic_batching {
36
+ preferred_batch_size: [ 32 ]
37
+ max_queue_delay_microseconds: 0
38
+ default_queue_policy: { max_queue_size: 32 }
39
+ }
40
+ input [
41
+ {
42
+ name: "input_ids"
43
+ data_type: TYPE_INT32
44
+ dims: [ -1 ]
45
+ allow_ragged_batch: true
46
+ optional: true
47
+ },
48
+ {
49
+ name: "encoder_input_features"
50
+ data_type: TYPE_FP16
51
+ dims: [ -1, -1 ]
52
+ allow_ragged_batch: true
53
+ optional: true
54
+ },
55
+ {
56
+ name: "encoder_output_lengths"
57
+ data_type: TYPE_INT32
58
+ dims: [ 1 ]
59
+ reshape: { shape: [ ] }
60
+ optional: true
61
+ },
62
+ {
63
+ name: "input_lengths"
64
+ data_type: TYPE_INT32
65
+ dims: [ 1 ]
66
+ reshape: { shape: [ ] }
67
+ },
68
+ {
69
+ name: "request_output_len"
70
+ data_type: TYPE_INT32
71
+ dims: [ 1 ]
72
+ reshape: { shape: [ ] }
73
+ },
74
+ {
75
+ name: "num_return_sequences"
76
+ data_type: TYPE_INT32
77
+ dims: [ 1 ]
78
+ reshape: { shape: [ ] }
79
+ optional: true
80
+ },
81
+ {
82
+ name: "draft_input_ids"
83
+ data_type: TYPE_INT32
84
+ dims: [ -1 ]
85
+ optional: true
86
+ allow_ragged_batch: true
87
+ },
88
+ {
89
+ name: "decoder_input_ids"
90
+ data_type: TYPE_INT32
91
+ dims: [ -1 ]
92
+ optional: true
93
+ allow_ragged_batch: true
94
+ },
95
+ {
96
+ name: "decoder_input_lengths"
97
+ data_type: TYPE_INT32
98
+ dims: [ 1 ]
99
+ optional: true
100
+ reshape: { shape: [ ] }
101
+ },
102
+ {
103
+ name: "draft_logits"
104
+ data_type: TYPE_FP16
105
+ dims: [ -1, -1 ]
106
+ optional: true
107
+ allow_ragged_batch: true
108
+ },
109
+ {
110
+ name: "draft_acceptance_threshold"
111
+ data_type: TYPE_FP32
112
+ dims: [ 1 ]
113
+ reshape: { shape: [ ] }
114
+ optional: true
115
+ },
116
+ {
117
+ name: "end_id"
118
+ data_type: TYPE_INT32
119
+ dims: [ 1 ]
120
+ reshape: { shape: [ ] }
121
+ optional: true
122
+ },
123
+ {
124
+ name: "pad_id"
125
+ data_type: TYPE_INT32
126
+ dims: [ 1 ]
127
+ reshape: { shape: [ ] }
128
+ optional: true
129
+ },
130
+ {
131
+ name: "stop_words_list"
132
+ data_type: TYPE_INT32
133
+ dims: [ 2, -1 ]
134
+ optional: true
135
+ allow_ragged_batch: true
136
+ },
137
+ {
138
+ name: "bad_words_list"
139
+ data_type: TYPE_INT32
140
+ dims: [ 2, -1 ]
141
+ optional: true
142
+ allow_ragged_batch: true
143
+ },
144
+ {
145
+ name: "embedding_bias"
146
+ data_type: TYPE_FP32
147
+ dims: [ -1 ]
148
+ optional: true
149
+ allow_ragged_batch: true
150
+ },
151
+ {
152
+ name: "beam_width"
153
+ data_type: TYPE_INT32
154
+ dims: [ 1 ]
155
+ reshape: { shape: [ ] }
156
+ optional: true
157
+ },
158
+ {
159
+ name: "temperature"
160
+ data_type: TYPE_FP32
161
+ dims: [ 1 ]
162
+ reshape: { shape: [ ] }
163
+ optional: true
164
+ },
165
+ {
166
+ name: "runtime_top_k"
167
+ data_type: TYPE_INT32
168
+ dims: [ 1 ]
169
+ reshape: { shape: [ ] }
170
+ optional: true
171
+ },
172
+ {
173
+ name: "runtime_top_p"
174
+ data_type: TYPE_FP32
175
+ dims: [ 1 ]
176
+ reshape: { shape: [ ] }
177
+ optional: true
178
+ },
179
+ {
180
+ name: "runtime_top_p_min"
181
+ data_type: TYPE_FP32
182
+ dims: [ 1 ]
183
+ reshape: { shape: [ ] }
184
+ optional: true
185
+ },
186
+ {
187
+ name: "runtime_top_p_decay"
188
+ data_type: TYPE_FP32
189
+ dims: [ 1 ]
190
+ reshape: { shape: [ ] }
191
+ optional: true
192
+ },
193
+ {
194
+ name: "runtime_top_p_reset_ids"
195
+ data_type: TYPE_INT32
196
+ dims: [ 1 ]
197
+ reshape: { shape: [ ] }
198
+ optional: true
199
+ },
200
+ {
201
+ name: "len_penalty"
202
+ data_type: TYPE_FP32
203
+ dims: [ 1 ]
204
+ reshape: { shape: [ ] }
205
+ optional: true
206
+ },
207
+ {
208
+ name: "early_stopping"
209
+ data_type: TYPE_BOOL
210
+ dims: [ 1 ]
211
+ reshape: { shape: [ ] }
212
+ optional: true
213
+ },
214
+ {
215
+ name: "repetition_penalty"
216
+ data_type: TYPE_FP32
217
+ dims: [ 1 ]
218
+ reshape: { shape: [ ] }
219
+ optional: true
220
+ },
221
+ {
222
+ name: "min_length"
223
+ data_type: TYPE_INT32
224
+ dims: [ 1 ]
225
+ reshape: { shape: [ ] }
226
+ optional: true
227
+ },
228
+ {
229
+ name: "beam_search_diversity_rate"
230
+ data_type: TYPE_FP32
231
+ dims: [ 1 ]
232
+ reshape: { shape: [ ] }
233
+ optional: true
234
+ },
235
+ {
236
+ name: "presence_penalty"
237
+ data_type: TYPE_FP32
238
+ dims: [ 1 ]
239
+ reshape: { shape: [ ] }
240
+ optional: true
241
+ },
242
+ {
243
+ name: "frequency_penalty"
244
+ data_type: TYPE_FP32
245
+ dims: [ 1 ]
246
+ reshape: { shape: [ ] }
247
+ optional: true
248
+ },
249
+ {
250
+ name: "random_seed"
251
+ data_type: TYPE_UINT64
252
+ dims: [ 1 ]
253
+ reshape: { shape: [ ] }
254
+ optional: true
255
+ },
256
+ {
257
+ name: "return_log_probs"
258
+ data_type: TYPE_BOOL
259
+ dims: [ 1 ]
260
+ reshape: { shape: [ ] }
261
+ optional: true
262
+ },
263
+ {
264
+ name: "return_context_logits"
265
+ data_type: TYPE_BOOL
266
+ dims: [ 1 ]
267
+ reshape: { shape: [ ] }
268
+ optional: true
269
+ },
270
+ {
271
+ name: "return_generation_logits"
272
+ data_type: TYPE_BOOL
273
+ dims: [ 1 ]
274
+ reshape: { shape: [ ] }
275
+ optional: true
276
+ },
277
+ {
278
+ name: "return_kv_cache_reuse_stats"
279
+ data_type: TYPE_BOOL
280
+ dims: [ 1 ]
281
+ reshape: { shape: [ ] }
282
+ optional: true
283
+ },
284
+ {
285
+ name: "exclude_input_in_output"
286
+ data_type: TYPE_BOOL
287
+ dims: [ 1 ]
288
+ reshape: { shape: [ ] }
289
+ optional: true
290
+ },
291
+ {
292
+ name: "stop"
293
+ data_type: TYPE_BOOL
294
+ dims: [ 1 ]
295
+ reshape: { shape: [ ] }
296
+ optional: true
297
+ },
298
+ {
299
+ name: "streaming"
300
+ data_type: TYPE_BOOL
301
+ dims: [ 1 ]
302
+ reshape: { shape: [ ] }
303
+ optional: true
304
+ },
305
+ {
306
+ name: "prompt_embedding_table"
307
+ data_type: TYPE_FP16
308
+ dims: [ -1, -1 ]
309
+ optional: true
310
+ allow_ragged_batch: true
311
+ },
312
+ {
313
+ name: "prompt_table_extra_ids"
314
+ data_type: TYPE_UINT64
315
+ dims: [ -1 ]
316
+ optional: true
317
+ allow_ragged_batch: true
318
+ },
319
+ {
320
+ name: "prompt_vocab_size"
321
+ data_type: TYPE_INT32
322
+ dims: [ 1 ]
323
+ reshape: { shape: [ ] }
324
+ optional: true
325
+ },
326
+ # cross_attention_mask shape `[bs, seq_len, num_images*num_tiles]`
327
+ {
328
+ name: "cross_attention_mask"
329
+ data_type: TYPE_BOOL
330
+ dims: [ -1, -1 ]
331
+ optional: true
332
+ allow_ragged_batch: true
333
+ },
334
+ # the unique task ID for the given LoRA.
335
+ # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
336
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
337
+ # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
338
+ {
339
+ name: "lora_task_id"
340
+ data_type: TYPE_UINT64
341
+ dims: [ 1 ]
342
+ reshape: { shape: [ ] }
343
+ optional: true
344
+ },
345
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
346
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
347
+ # each of the in / out tensors are first flattened and then concatenated together in the format above.
348
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
349
+ {
350
+ name: "lora_weights"
351
+ data_type: TYPE_FP16
352
+ dims: [ -1, -1 ]
353
+ optional: true
354
+ allow_ragged_batch: true
355
+ },
356
+ # module identifier (same size as the first dimension of lora_weights)
357
+ # See LoraModule::ModuleType for model id mapping
358
+ #
359
+ #   "attn_qkv": 0     # combined qkv adapter
360
+ # "attn_q": 1 # q adapter
361
+ # "attn_k": 2 # k adapter
362
+ # "attn_v": 3 # v adapter
363
+ # "attn_dense": 4 # adapter for the dense layer in attention
364
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
365
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
366
+ #   "mlp_gate": 7     # for llama2 adapter for gated mlp layer after attention / RMSNorm: gate
367
+ #
368
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]; a small illustrative example follows the input list below
369
+ {
370
+ name: "lora_config"
371
+ data_type: TYPE_INT32
372
+ dims: [ -1, 3 ]
373
+ optional: true
374
+ allow_ragged_batch: true
375
+ },
376
+ {
377
+ name: "context_phase_params"
378
+ data_type: TYPE_UINT8
379
+ dims: [ -1 ]
380
+ optional: true
381
+ allow_ragged_batch: true
382
+ },
383
+ # skip_cross_attn_blocks shape `[bs, 1]`, only used in mllama
384
+ {
385
+ name: "skip_cross_attn_blocks"
386
+ data_type: TYPE_BOOL
387
+ dims: [ 1 ]
388
+ optional: true
389
+ allow_ragged_batch: true
390
+ },
391
+ {
392
+ name: "retention_token_range_starts"
393
+ data_type: TYPE_INT32
394
+ dims: [ -1 ]
395
+ optional: true
396
+ allow_ragged_batch: true
397
+ },
398
+ {
399
+ name: "retention_token_range_ends"
400
+ data_type: TYPE_INT32
401
+ dims: [ -1 ]
402
+ optional: true
403
+ allow_ragged_batch: true
404
+ },
405
+ {
406
+ name: "retention_token_range_priorities"
407
+ data_type: TYPE_INT32
408
+ dims: [ -1 ]
409
+ optional: true
410
+ allow_ragged_batch: true
411
+ },
412
+ {
413
+ name: "retention_token_range_durations_ms"
414
+ data_type: TYPE_INT32
415
+ dims: [ -1 ]
416
+ optional: true
417
+ allow_ragged_batch: true
418
+ },
419
+ {
420
+ name: "retention_decode_priority"
421
+ data_type: TYPE_INT32
422
+ dims: [ 1 ]
423
+ optional: true
424
+ allow_ragged_batch: true
425
+ },
426
+ {
427
+ name: "retention_decode_duration_ms"
428
+ data_type: TYPE_INT32
429
+ dims: [ 1 ]
430
+ optional: true
431
+ allow_ragged_batch: true
432
+ },
433
+ {
434
+ name: "guided_decoding_guide_type"
435
+ data_type: TYPE_STRING
436
+ dims: [ 1 ]
437
+ optional: true
438
+ allow_ragged_batch: true
439
+ },
440
+ {
441
+ name: "guided_decoding_guide"
442
+ data_type: TYPE_STRING
443
+ dims: [ 1 ]
444
+ optional: true
445
+ allow_ragged_batch: true
446
+ }
447
+ ]
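To make the lora_config layout concrete, a small hedged example (hypothetical module ids, layers and adapter size; the module-id mapping is the one listed in the comment above lora_config):

    import numpy as np

    # Each row is [module_id, layer_idx, adapter_size]; 0 = attn_qkv, 4 = attn_dense.
    lora_config = np.array(
        [[0, 0, 8],   # attn_qkv adapter on layer 0, R = 8
         [4, 0, 8]],  # attn_dense adapter on layer 0, R = 8
        dtype=np.int32)
    print(lora_config.shape)  # (2, 3), matching dims [ -1, 3 ] above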
448
+ output [
449
+ {
450
+ name: "output_ids"
451
+ data_type: TYPE_INT32
452
+ dims: [ -1, -1 ]
453
+ },
454
+ {
455
+ name: "sequence_length"
456
+ data_type: TYPE_INT32
457
+ dims: [ -1 ]
458
+ },
459
+ {
460
+ name: "cum_log_probs"
461
+ data_type: TYPE_FP32
462
+ dims: [ -1 ]
463
+ },
464
+ {
465
+ name: "output_log_probs"
466
+ data_type: TYPE_FP32
467
+ dims: [ -1, -1 ]
468
+ },
469
+ {
470
+ name: "context_logits"
471
+ data_type: TYPE_FP16
472
+ dims: [ -1, -1 ]
473
+ },
474
+ {
475
+ name: "generation_logits"
476
+ data_type: TYPE_FP16
477
+ dims: [ -1, -1, -1 ]
478
+ },
479
+ {
480
+ name: "batch_index"
481
+ data_type: TYPE_INT32
482
+ dims: [ 1 ]
483
+ },
484
+ {
485
+ name: "sequence_index"
486
+ data_type: TYPE_INT32
487
+ dims: [ 1 ]
488
+ },
489
+ {
490
+ name: "context_phase_params"
491
+ data_type: TYPE_UINT8
492
+ dims: [ -1 ]
493
+ },
494
+ {
495
+ name: "kv_cache_alloc_new_blocks"
496
+ data_type: TYPE_INT32
497
+ dims: [ 1 ]
498
+ },
499
+ {
500
+ name: "kv_cache_reused_blocks"
501
+ data_type: TYPE_INT32
502
+ dims: [ 1 ]
503
+ },
504
+ {
505
+ name: "kv_cache_alloc_total_blocks"
506
+ data_type: TYPE_INT32
507
+ dims: [ 1 ]
508
+ }
509
+ ]
510
+ instance_group [
511
+ {
512
+ count: 1
513
+ kind : KIND_CPU
514
+ }
515
+ ]
516
+ parameters: {
517
+ key: "max_beam_width"
518
+ value: {
519
+ string_value: "1"
520
+ }
521
+ }
522
+ parameters: {
523
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
524
+ value: {
525
+ string_value: "no"
526
+ }
527
+ }
528
+ parameters: {
529
+ key: "gpt_model_type"
530
+ value: {
531
+ string_value: "inflight_fused_batching"
532
+ }
533
+ }
534
+ parameters: {
535
+ key: "gpt_model_path"
536
+ value: {
537
+ string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
538
+ }
539
+ }
540
+ parameters: {
541
+ key: "encoder_model_path"
542
+ value: {
543
+ string_value: "${encoder_engine_dir}"
544
+ }
545
+ }
546
+ parameters: {
547
+ key: "max_tokens_in_paged_kv_cache"
548
+ value: {
549
+ string_value: "${max_tokens_in_paged_kv_cache}"
550
+ }
551
+ }
552
+ parameters: {
553
+ key: "max_attention_window_size"
554
+ value: {
555
+ string_value: "${max_attention_window_size}"
556
+ }
557
+ }
558
+ parameters: {
559
+ key: "sink_token_length"
560
+ value: {
561
+ string_value: "${sink_token_length}"
562
+ }
563
+ }
564
+ parameters: {
565
+ key: "batch_scheduler_policy"
566
+ value: {
567
+ string_value: "guaranteed_no_evict"
568
+ }
569
+ }
570
+ parameters: {
571
+ key: "kv_cache_free_gpu_mem_fraction"
572
+ value: {
573
+ string_value: "${kv_cache_free_gpu_mem_fraction}"
574
+ }
575
+ }
576
+ parameters: {
577
+ key: "cross_kv_cache_fraction"
578
+ value: {
579
+ string_value: "${cross_kv_cache_fraction}"
580
+ }
581
+ }
582
+ parameters: {
583
+ key: "kv_cache_host_memory_bytes"
584
+ value: {
585
+ string_value: "${kv_cache_host_memory_bytes}"
586
+ }
587
+ }
588
+ # kv_cache_onboard_blocks is for internal implementation.
589
+ parameters: {
590
+ key: "kv_cache_onboard_blocks"
591
+ value: {
592
+ string_value: "${kv_cache_onboard_blocks}"
593
+ }
594
+ }
595
+ # enable_trt_overlap is deprecated and doesn't have any effect on the runtime
596
+ # parameters: {
597
+ # key: "enable_trt_overlap"
598
+ # value: {
599
+ # string_value: "${enable_trt_overlap}"
600
+ # }
601
+ # }
602
+ parameters: {
603
+ key: "exclude_input_in_output"
604
+ value: {
605
+ string_value: "True"
606
+ }
607
+ }
608
+ parameters: {
609
+ key: "cancellation_check_period_ms"
610
+ value: {
611
+ string_value: "${cancellation_check_period_ms}"
612
+ }
613
+ }
614
+ parameters: {
615
+ key: "stats_check_period_ms"
616
+ value: {
617
+ string_value: "${stats_check_period_ms}"
618
+ }
619
+ }
620
+ parameters: {
621
+ key: "iter_stats_max_iterations"
622
+ value: {
623
+ string_value: "${iter_stats_max_iterations}"
624
+ }
625
+ }
626
+ parameters: {
627
+ key: "request_stats_max_iterations"
628
+ value: {
629
+ string_value: "${request_stats_max_iterations}"
630
+ }
631
+ }
632
+ parameters: {
633
+ key: "enable_kv_cache_reuse"
634
+ value: {
635
+ string_value: "True"
636
+ }
637
+ }
638
+ parameters: {
639
+ key: "normalize_log_probs"
640
+ value: {
641
+ string_value: "${normalize_log_probs}"
642
+ }
643
+ }
644
+ parameters: {
645
+ key: "enable_chunked_context"
646
+ value: {
647
+ string_value: "${enable_chunked_context}"
648
+ }
649
+ }
650
+ parameters: {
651
+ key: "gpu_device_ids"
652
+ value: {
653
+ string_value: "${gpu_device_ids}"
654
+ }
655
+ }
656
+ parameters: {
657
+ key: "participant_ids"
658
+ value: {
659
+ string_value: "${participant_ids}"
660
+ }
661
+ }
662
+ parameters: {
663
+ key: "lora_cache_optimal_adapter_size"
664
+ value: {
665
+ string_value: "${lora_cache_optimal_adapter_size}"
666
+ }
667
+ }
668
+ parameters: {
669
+ key: "lora_cache_max_adapter_size"
670
+ value: {
671
+ string_value: "${lora_cache_max_adapter_size}"
672
+ }
673
+ }
674
+ parameters: {
675
+ key: "lora_cache_gpu_memory_fraction"
676
+ value: {
677
+ string_value: "${lora_cache_gpu_memory_fraction}"
678
+ }
679
+ }
680
+ parameters: {
681
+ key: "lora_cache_host_memory_bytes"
682
+ value: {
683
+ string_value: "${lora_cache_host_memory_bytes}"
684
+ }
685
+ }
686
+ parameters: {
687
+ key: "decoding_mode"
688
+ value: {
689
+ string_value: "${decoding_mode}"
690
+ }
691
+ }
692
+ parameters: {
693
+ key: "executor_worker_path"
694
+ value: {
695
+ string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
696
+ }
697
+ }
698
+ parameters: {
699
+ key: "medusa_choices"
700
+ value: {
701
+ string_value: "${medusa_choices}"
702
+ }
703
+ }
704
+ parameters: {
705
+ key: "eagle_choices"
706
+ value: {
707
+ string_value: "${eagle_choices}"
708
+ }
709
+ }
710
+ parameters: {
711
+ key: "gpu_weights_percent"
712
+ value: {
713
+ string_value: "${gpu_weights_percent}"
714
+ }
715
+ }
716
+ parameters: {
717
+ key: "enable_context_fmha_fp32_acc"
718
+ value: {
719
+ string_value: "${enable_context_fmha_fp32_acc}"
720
+ }
721
+ }
722
+ parameters: {
723
+ key: "multi_block_mode"
724
+ value: {
725
+ string_value: "${multi_block_mode}"
726
+ }
727
+ }
728
+ parameters: {
729
+ key: "cuda_graph_mode"
730
+ value: {
731
+ string_value: "${cuda_graph_mode}"
732
+ }
733
+ }
734
+ parameters: {
735
+ key: "cuda_graph_cache_size"
736
+ value: {
737
+ string_value: "${cuda_graph_cache_size}"
738
+ }
739
+ }
740
+ parameters: {
741
+ key: "speculative_decoding_fast_logits"
742
+ value: {
743
+ string_value: "${speculative_decoding_fast_logits}"
744
+ }
745
+ }
746
+ parameters: {
747
+ key: "tokenizer_dir"
748
+ value: {
749
+ string_value: "${tokenizer_dir}"
750
+ }
751
+ }
752
+ parameters: {
753
+ key: "guided_decoding_backend"
754
+ value: {
755
+ string_value: "${guided_decoding_backend}"
756
+ }
757
+ }
tensorrt_llm_bls/1/__pycache__/model.cpython-312.pyc ADDED
Binary file (5.53 kB).
tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-312.pyc ADDED
Binary file (21.7 kB).
tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-312.pyc ADDED
Binary file (19.4 kB).
tensorrt_llm_bls/1/lib/decode.py ADDED
@@ -0,0 +1,428 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ from collections.abc import Generator
28
+ from dataclasses import dataclass, field
29
+ from typing import Optional
30
+
31
+ import numpy as np
32
+ import torch
33
+
34
+
35
+ class RequestValidationError(Exception):
36
+ pass
37
+
38
+
39
+ def _validate_that(condition: bool, msg: str):
40
+ if not condition:
41
+ raise RequestValidationError(msg)
42
+
43
+
44
+ def _validate_non_empty(data, msg: str):
45
+ if isinstance(data, torch.Tensor):
46
+ _validate_that(data is not None and data.numel() > 0, msg)
47
+ else:
48
+ _validate_that(data is not None and data.size > 0, msg)
49
+
50
+
51
+ def _validate_single_gt_0(data, msg: str):
52
+ _validate_non_empty(data, msg)
53
+ _validate_that(data.flatten()[0] > 0, msg)
54
+
55
+
56
+ def _single_value(data: Optional[np.ndarray]):
57
+ if data is None:
58
+ return None
59
+ return data.flatten()[0]
60
+
61
+
62
+ @dataclass
63
+ class Request:
64
+ text_input: np.ndarray = field(default_factory=lambda: np.array([]))
65
+ decoder_text_input: np.ndarray = None
66
+ image_input: Optional[np.ndarray] = None
67
+ image_bytes_input: Optional[np.ndarray] = None
68
+ image_url_input: Optional[np.ndarray] = None
69
+ video_bytes_input: Optional[np.ndarray] = None
70
+ max_tokens: Optional[np.ndarray] = None
71
+ bad_words: Optional[np.ndarray] = None
72
+ stop_words: Optional[np.ndarray] = None
73
+ end_id: Optional[np.ndarray] = None
74
+ pad_id: Optional[np.ndarray] = None
75
+ top_k: Optional[np.ndarray] = None
76
+ top_p: Optional[np.ndarray] = None
77
+ temperature: Optional[np.ndarray] = None
78
+ length_penalty: Optional[np.ndarray] = None
79
+ repetition_penalty: Optional[np.ndarray] = None
80
+ min_length: Optional[np.ndarray] = None
81
+ return_log_probs: Optional[np.ndarray] = None
82
+ prompt_embedding_table: Optional[np.ndarray] = None
83
+ prompt_vocab_size: Optional[np.ndarray] = None
84
+ prompt_table_extra_id: Optional[np.ndarray] = None
85
+ embedding_bias_words: Optional[np.ndarray] = None
86
+ embedding_bias_weights: Optional[np.ndarray] = None
87
+ num_draft_tokens: Optional[np.ndarray] = None
88
+ use_draft_logits: Optional[np.ndarray] = None
89
+ stream: Optional[np.ndarray] = None
90
+ beam_width: Optional[np.ndarray] = None
91
+ return_context_logits: Optional[np.ndarray] = None
92
+ return_generation_logits: Optional[np.ndarray] = None
93
+ random_seed: Optional[np.ndarray] = None
94
+ presence_penalty: Optional[np.ndarray] = None
95
+ frequency_penalty: Optional[np.ndarray] = None
96
+ lora_task_id: Optional[np.ndarray] = None
97
+ lora_weights: Optional[np.ndarray] = None
98
+ lora_config: Optional[np.ndarray] = None
99
+ exclude_input_in_output: Optional[np.ndarray] = None
100
+ return_kv_cache_reuse_stats: Optional[np.ndarray] = None
101
+ guided_decoding_guide_type: Optional[np.ndarray] = None
102
+ guided_decoding_guide: Optional[np.ndarray] = None
103
+
104
+ def validate(self):
105
+ _validate_non_empty(self.text_input, "text_input is required")
106
+ _validate_single_gt_0(self.max_tokens,
107
+ "max_tokens must be a single value > 0")
108
+
109
+ num_draft_tokens = _single_value(self.num_draft_tokens)
110
+ _single_value(self.return_generation_logits)
111
+ context_logits = _single_value(self.return_context_logits)
112
+
113
+ if num_draft_tokens:
114
+ _validate_that(
115
+ not self.stream.any(),
116
+ "streaming is not supported with speculative decoding")
117
+ _validate_that(
118
+ not context_logits,
119
+ "context logits are not supported with speculative decoding")
120
+
121
+
122
+ @dataclass
123
+ class DraftRequest:
124
+ draft_input_ids: Optional[np.ndarray] = None
125
+ draft_logits: Optional[np.ndarray] = None
126
+
127
+
128
+ @dataclass
129
+ class PreprocResponse:
130
+ input_ids: np.ndarray = field(default_factory=lambda: np.array([]))
131
+ decoder_input_ids: np.ndarray = None
132
+ input_lengths: np.ndarray = field(default_factory=lambda: np.array([]))
133
+ decoder_input_lengths: np.ndarray = None
134
+ bad_words_list: Optional[np.ndarray] = None
135
+ stop_words_list: Optional[np.ndarray] = None
136
+ embedding_bias: Optional[np.ndarray] = None
137
+ end_id: Optional[np.ndarray] = None
138
+ pad_id: Optional[np.ndarray] = None
139
+ prompt_table_extra_ids: Optional[np.ndarray] = None
140
+ pixel_values: Optional[np.ndarray] = None
141
+ image_sizes: Optional[np.ndarray] = None
142
+ is_video_input: Optional[np.ndarray] = None
143
+
144
+ @classmethod
145
+ def with_new_inputs(cls,
146
+ other,
147
+ input_ids: Optional[np.ndarray] = None,
148
+ input_lengths: Optional[np.ndarray] = None):
149
+ return cls(input_ids=(input_ids
150
+ if input_ids is not None else other.input_ids),
151
+ input_lengths=(input_lengths if input_lengths is not None
152
+ else other.input_lengths),
153
+ decoder_input_ids=other.decoder_input_ids,
154
+ decoder_input_lengths=other.decoder_input_lengths,
155
+ bad_words_list=other.bad_words_list,
156
+ stop_words_list=other.stop_words_list,
157
+ end_id=other.end_id,
158
+ pad_id=other.pad_id,
159
+ prompt_table_extra_ids=other.prompt_table_extra_ids)
160
+
161
+
162
+ @dataclass
163
+ class MultimodalEncResponse:
164
+ prompt_embedding_table: Optional[torch.Tensor] = None
165
+ prompt_vocab_size: Optional[np.ndarray] = None
166
+
167
+
168
+ @dataclass
169
+ class GenerationResponse:
170
+ output_ids: np.ndarray = field(default_factory=lambda: np.array([]))
171
+ sequence_length: np.ndarray = field(default_factory=lambda: np.array([]))
172
+ cum_log_probs: Optional[np.ndarray] = None
173
+ output_log_probs: Optional[np.ndarray] = None
174
+ context_logits: Optional[np.ndarray] = None
175
+ generation_logits: Optional[np.ndarray] = None
176
+ batch_index: Optional[np.ndarray] = None
177
+ sequence_index: Optional[np.ndarray] = None
178
+ kv_cache_alloc_new_blocks: Optional[np.ndarray] = None
179
+ kv_cache_reused_blocks: Optional[np.ndarray] = None
180
+ kv_cache_alloc_total_blocks: Optional[np.ndarray] = None
181
+
182
+
183
+ @dataclass
184
+ class Response:
185
+ text_output: np.ndarray = field(default_factory=lambda: np.array([]))
186
+ cum_log_probs: Optional[np.ndarray] = None
187
+ output_log_probs: Optional[np.ndarray] = None
188
+ context_logits: Optional[np.ndarray] = None
189
+ generation_logits: Optional[np.ndarray] = None
190
+ batch_index: Optional[np.ndarray] = None
191
+ sequence_index: Optional[np.ndarray] = None
192
+ kv_cache_alloc_new_blocks: Optional[np.ndarray] = None
193
+ kv_cache_reused_blocks: Optional[np.ndarray] = None
194
+ kv_cache_alloc_total_blocks: Optional[np.ndarray] = None
195
+
196
+ def __eq__(self, o) -> bool:
197
+ """Just for testing"""
198
+ if not isinstance(o, Response):
199
+ return False
200
+ return (np.array_equal(self.text_output, o.text_output)
201
+ and np.array_equal(self.cum_log_probs, o.cum_log_probs)
202
+ and np.array_equal(self.output_log_probs, o.output_log_probs)
203
+ and np.array_equal(self.context_logits, o.context_logits)
204
+ and np.array_equal(self.generation_logits, o.generation_logits)
205
+ and np.array_equal(self.batch_index, o.batch_index)
206
+ and np.array_equal(self.sequence_index, o.sequence_index)
208
+ and np.array_equal(self.kv_cache_alloc_new_blocks,
209
+ o.kv_cache_alloc_new_blocks)
210
+ and np.array_equal(self.kv_cache_reused_blocks,
211
+ o.kv_cache_reused_blocks)
212
+ and np.array_equal(self.kv_cache_alloc_total_blocks,
213
+ o.kv_cache_alloc_total_blocks))
214
+
215
+
216
+ class Decoder:
217
+
218
+ def __init__(self, streaming=False, accumulate=False):
219
+ self._streaming = streaming
220
+ self._accumulate = accumulate
221
+
222
+ self._accumulated_tokens = []
223
+
224
+ def decode(self,
225
+ request: Request,
226
+ speculative_decoding=False,
227
+ is_multimodal=False) -> Generator[Response, None, None]:
228
+
229
+ batch_size = request.text_input.shape[0]
230
+ self._accumulated_tokens = [None] * batch_size
231
+ preproc_response = self.preprocess(request)
232
+
233
+ multimodal_enc_response = None
234
+ if is_multimodal:
235
+ multimodal_enc_response = self._multimodal_enc_generate(
236
+ request, preproc_response)
237
+
238
+ if speculative_decoding:
239
+ if batch_size > 1:
240
+ raise Exception(
241
+ "speculative decoding is not supported with batch size > 1"
242
+ )
243
+ for gen_response in self._spec_generate(preproc_response, request):
244
+ yield self.postprocess(gen_response, batch_size)
245
+ else:
246
+ if not self._streaming and batch_size == 1:
247
+ gen_response = self._generate_non_streaming(
248
+ preproc_response,
249
+ request,
250
+ multimodal_enc_response=multimodal_enc_response)
251
+ yield self.postprocess(gen_response, batch_size)
252
+ else:
253
+ for gen_response in self._generate(
254
+ preproc_response,
255
+ request,
256
+ multimodal_enc_response=multimodal_enc_response):
257
+ yield self.postprocess(gen_response, batch_size)
258
+
259
+ def encountered_stop_words(self, input_ids, stop_words_ids):
260
+ for stop_word_ids in stop_words_ids:
261
+ if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids):
262
+ return True
263
+ return False
264
+
265
+ def _spec_generate(
266
+ self, preproc: PreprocResponse,
267
+ request: Request) -> Generator[GenerationResponse, None, None]:
268
+
269
+ if preproc.input_ids.shape[0] > 1:
270
+ raise Exception(
271
+ "Speculative decoding does not support batch size > 1.")
272
+
273
+ prompt_input_ids: np.ndarray = preproc.input_ids[0]
274
+ input_ids: np.ndarray = prompt_input_ids
275
+ output_len: int = request.max_tokens[0][0]
276
+ last_input_ids: np.ndarray = None
277
+ draft_output_ids: np.ndarray = None
278
+ draft_logits: np.ndarray = None
279
+
280
+ target_response: GenerationResponse = None
281
+
282
+ cur_preproc = preproc
283
+
284
+ counter = 0
285
+ while True:
286
+ counter += 1
287
+ num_draft_tokens = min(
288
+ request.num_draft_tokens[0][0],
289
+ len(prompt_input_ids) + output_len - len(input_ids) - 1)
290
+
291
+ draft_request = None
292
+ if num_draft_tokens > 0:
293
+ request.min_length = np.array([num_draft_tokens],
294
+ dtype=np.int32)
295
+ draft_response: GenerationResponse = self._draft_generate_non_streaming(
296
+ cur_preproc, request, num_draft_tokens)
297
+ seq_len: int = draft_response.sequence_length[0][0]
298
+ # [1, beamWidth, outputLength] -> [outputLen]
299
+ draft_output_ids = draft_response.output_ids[0][0]
300
+ # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded]
301
+ if request.use_draft_logits is not None and request.use_draft_logits[
302
+ 0]:
303
+ if draft_response.generation_logits is not None:
304
+ draft_logits = draft_response.generation_logits[0][0]
305
+
306
+ input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
307
+ if len(input_draft_tokens) > 0:
308
+ draft_request = DraftRequest(
309
+ draft_input_ids=np.expand_dims(input_draft_tokens, 0))
310
+ if request.use_draft_logits is not None and request.use_draft_logits[
311
+ 0]:
312
+ draft_request.draft_logits = np.expand_dims(
313
+ draft_logits[-len(input_draft_tokens):], 0)
314
+ else:
315
+ draft_request = DraftRequest()
316
+ request.min_length = None
317
+ else:
318
+ draft_request = DraftRequest()
319
+ target_response = self._generate_non_streaming(
320
+ cur_preproc, request, draft_request)
321
+ last_input_ids = input_ids
322
+ input_ids = target_response.output_ids[0][0]
323
+ cur_preproc = PreprocResponse.with_new_inputs(
324
+ cur_preproc, np.expand_dims(input_ids, 0),
325
+ np.array([[len(input_ids)]], dtype=np.int32))
326
+
327
+ # Evaluate criteria to stop generation loop.
328
+ # If we've hit or exceeded the max output length, should stop
329
+ length_stop = (len(input_ids)
330
+ >= len(prompt_input_ids) + output_len)
331
+ if length_stop:
332
+ break
333
+ # If draft and target have same outputs, should stop. Normally target should return 1 more token.
334
+ # If they are the same length, they should differ at the last token
335
+ target_draft_equal = draft_output_ids is not None and np.array_equal(
336
+ draft_output_ids, input_ids)
337
+ if target_draft_equal:
338
+ break
339
+ # If tokens no longer change, we should stop; this means early stopping was hit
340
+ last_current_equal = np.array_equal(last_input_ids, input_ids)
341
+ if last_current_equal:
342
+ break
343
+ # Need to check if stop words were encountered
344
+ hit_stop_words = self.encountered_stop_words(
345
+ input_ids, preproc.stop_words_list[0])
346
+ if hit_stop_words:
347
+ break
348
+
349
+ yield target_response
350
+
351
+ def _draft_generate_non_streaming(
352
+ self, preproc: PreprocResponse, request: Request,
353
+ num_draft_tokens: int) -> GenerationResponse:
354
+ raise NotImplementedError()
355
+
356
+ def _multimodal_enc_generate(
357
+ self,
358
+ request: Request,
359
+ ) -> MultimodalEncResponse:
360
+ raise NotImplementedError()
361
+
362
+ def _generate(
363
+ self,
364
+ preproc: PreprocResponse,
365
+ request: Request,
366
+ draft_request: Optional[DraftRequest] = None,
367
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None,
368
+ ) -> Generator[GenerationResponse, None, None]:
369
+ raise NotImplementedError()
370
+
371
+ def _generate_non_streaming(
372
+ self,
373
+ preproc: PreprocResponse,
374
+ request: Request,
375
+ draft_request: Optional[DraftRequest] = None,
376
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None,
377
+ ) -> GenerationResponse:
378
+ raise NotImplementedError()
379
+
380
+ def postprocess(self, gen_response: GenerationResponse,
381
+ batch_size) -> Response:
382
+ if self._accumulate and self._streaming:
383
+ new_tokens: np.ndarray = gen_response.output_ids
384
+ if new_tokens.ndim != 3:
385
+ raise Exception("Expected output_ids tensor to have 3 dims.")
386
+ if new_tokens.shape[0] != 1:
387
+ raise Exception("Expected batch size of 1")
388
+ if new_tokens.shape[1] != 1:
389
+ raise Exception(
390
+ "Accumulation of tokens is only implemented for beam width = 1"
391
+ )
392
+
393
+ batch_index = gen_response.batch_index
394
+ if batch_index is not None:
395
+ if batch_index.ndim != 2:
396
+ raise Exception(
397
+ "Expected batch_index tensor to have 2 dims.")
398
+ if batch_index.shape[0] != 1:
399
+ raise Exception("Expected batch size of 1")
400
+ if batch_index.shape[1] != 1:
401
+ raise Exception("Expected only one batch_index")
402
+
403
+ batch_index = batch_index[0][0] if batch_index is not None else 0
404
+
405
+ self._accumulated_tokens[batch_index] = new_tokens if (
406
+ self._accumulated_tokens[batch_index]
407
+ is None) else np.concatenate(
408
+ (self._accumulated_tokens[batch_index], new_tokens),
409
+ axis=2)
410
+ sequence_lengths = np.array(
411
+ [[self._accumulated_tokens[batch_index].shape[2]]],
412
+ dtype=np.int32)
413
+ return self._postprocess(self._accumulated_tokens[batch_index],
414
+ sequence_lengths, gen_response)
415
+ else:
416
+ return self._postprocess(gen_response.output_ids, None,
417
+ gen_response)
418
+
419
+ def _postprocess(self, tokens: np.ndarray,
420
+ sequence_lengths: Optional[np.ndarray],
421
+ gen_response: GenerationResponse) -> Response:
422
+ raise NotImplementedError()
423
+
424
+ def preprocess(self, request: Request) -> PreprocResponse:
425
+ raise NotImplementedError()
426
+
427
+ def reset_decoder(self):
428
+ self._accumulated_tokens = []
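As a reference for the stop conditions evaluated at the end of the generation loop in Decoder._spec_generate above, the following is a minimal, self-contained sketch (toy values only, not part of the uploaded files; every name and number is illustrative) of the same numpy comparisons:

import numpy as np

prompt_input_ids = np.array([1, 2, 3])
output_len = 4                                   # requested max_tokens (example value)
last_input_ids = np.array([1, 2, 3, 7, 8])       # tokens after the previous round
input_ids = np.array([1, 2, 3, 7, 8, 9])         # target output after this round
draft_output_ids = np.array([1, 2, 3, 7, 8, 9])  # draft agreed with the target exactly
stop_words_ids = [np.array([42])]                # one stop-word token sequence

# These four checks mirror the break conditions in _spec_generate.
length_stop = len(input_ids) >= len(prompt_input_ids) + output_len
target_draft_equal = np.array_equal(draft_output_ids, input_ids)
last_current_equal = np.array_equal(last_input_ids, input_ids)
hit_stop_words = any(
    np.array_equal(input_ids[-len(sw):], sw) for sw in stop_words_ids)

print(length_stop, target_draft_equal, last_current_equal, hit_stop_words)
# False True False False -> the loop would break because draft and target agree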
tensorrt_llm_bls/1/lib/triton_decoder.py ADDED
@@ -0,0 +1,542 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ from collections.abc import Callable
28
+ from typing import Dict, Optional
29
+
30
+ import numpy as np
31
+ import triton_python_backend_utils as pb_utils
32
+ from lib.decode import *
33
+ from torch.utils.dlpack import from_dlpack, to_dlpack
34
+ from typing_extensions import override
35
+
36
+
37
+ class TritonDecoder(Decoder):
38
+
39
+ def __init__(self,
40
+ streaming=False,
41
+ accumulate=False,
42
+ preproc_model_name="preprocessing",
43
+ postproc_model_name="postprocessing",
44
+ llm_model_name="tensorrt_llm",
45
+ draft_llm_model_name: Optional[str] = None,
46
+ multimodal_encoders_name: Optional[str] = None):
47
+ super().__init__(streaming=streaming, accumulate=accumulate)
48
+ self.preproc_model_name = preproc_model_name
49
+ self.postproc_model_name = postproc_model_name
50
+ self.llm_model_name = llm_model_name
51
+ self.draft_llm_model_name = draft_llm_model_name
52
+ self.multimodal_encoders_name = multimodal_encoders_name
53
+
54
+ self._preproc_outputs = [
55
+ "INPUT_ID",
56
+ "DECODER_INPUT_ID",
57
+ "REQUEST_INPUT_LEN",
58
+ "REQUEST_DECODER_INPUT_LEN",
59
+ "BAD_WORDS_IDS",
60
+ "STOP_WORDS_IDS",
61
+ "EMBEDDING_BIAS",
62
+ "OUT_PAD_ID",
63
+ "OUT_END_ID",
64
+ "OUT_PROMPT_TABLE_EXTRA_IDS",
65
+ "PIXEL_VALUES",
66
+ "IMAGE_SIZES",
67
+ "IS_VIDEO_INPUT",
68
+ ]
69
+
70
+ self._multimodal_enc_outputs = [
71
+ "OUT_PROMPT_EMBEDDING_TABLE", "OUT_PROMPT_VOCAB_SIZE"
72
+ ]
73
+
74
+ self._llm_outputs = [
75
+ "output_ids", "sequence_length", "cum_log_probs",
76
+ "output_log_probs", "context_logits", "generation_logits",
77
+ "batch_index", "sequence_index", "kv_cache_alloc_new_blocks",
78
+ "kv_cache_reused_blocks", "kv_cache_alloc_total_blocks"
79
+ ]
80
+
81
+ self._postproc_outputs = [
82
+ "OUTPUT",
83
+ ]
84
+
85
+ self.input_names = [
86
+ "text_input", "decoder_text_input", "image_input",
87
+ "image_bytes_input", "image_url_input", "video_bytes_input",
88
+ "max_tokens", "bad_words", "stop_words", "end_id", "pad_id",
89
+ "top_k", "top_p", "temperature", "length_penalty",
90
+ "repetition_penalty", "min_length", "presence_penalty",
91
+ "frequency_penalty", "random_seed", "return_log_probs",
92
+ "return_context_logits", "return_generation_logits", "beam_width",
93
+ "stream", "prompt_embedding_table", "prompt_vocab_size",
94
+ "prompt_table_extra_id", "embedding_bias_words",
95
+ "embedding_bias_weights", "num_draft_tokens", "use_draft_logits",
96
+ "lora_task_id", "lora_weights", "lora_config",
97
+ "exclude_input_in_output", "return_kv_cache_reuse_stats",
98
+ "guided_decoding_guide_type", "guided_decoding_guide"
99
+ ]
100
+
101
+ self.__undo_reshape_whitelist = {
102
+ "max_tokens", "end_id", "pad_id", "top_k", "top_p", "temperature",
103
+ "length_penalty", "repetition_penalty", "min_length",
104
+ "presence_penalty", "frequency_penalty", "random_seed",
105
+ "return_log_probs", "return_context_logits",
106
+ "return_generation_logits", "beam_width", "stream",
107
+ "prompt_vocab_size", "num_draft_tokens", "use_draft_logits",
108
+ "exclude_input_in_output", "return_kv_cache_reuse_stats",
109
+ "lora_weights", "lora_config", "lora_task_id"
110
+ }
111
+
112
+ def _exec_triton_request(self, request):
113
+ responses = request.exec(decoupled=True)
114
+ for r in responses:
115
+ if r.has_error():
116
+ raise pb_utils.TritonModelException(r.error().message())
117
+ yield r
118
+
119
+ def _exec_triton_request_single(self, request):
120
+ responses = request.exec(decoupled=False)
121
+ if responses.has_error():
122
+ raise pb_utils.TritonModelException(responses.error().message())
123
+ return responses
124
+
125
+ def create_triton_response(self, response: Response):
126
+ name_map = {
127
+ "text_output": "text_output",
128
+ "cum_log_probs": "cum_log_probs",
129
+ "output_log_probs": "output_log_probs",
130
+ "context_logits": "context_logits",
131
+ "generation_logits": "generation_logits",
132
+ "batch_index": "batch_index",
133
+ "sequence_index": "sequence_index",
134
+ "kv_cache_alloc_new_blocks": "kv_cache_alloc_new_blocks",
135
+ "kv_cache_reused_blocks": "kv_cache_reused_blocks",
136
+ "kv_cache_alloc_total_blocks": "kv_cache_alloc_total_blocks"
137
+ }
138
+ tensors = self.create_triton_tensors(response, name_map)
139
+ return pb_utils.InferenceResponse(output_tensors=tensors)
140
+
141
+ def convert_triton_request(self, triton_request) -> Request:
142
+ request = Request()
143
+ for triton_name in self.input_names:
144
+ tensor = pb_utils.get_input_tensor_by_name(triton_request,
145
+ triton_name)
146
+ target_name = triton_name
147
+ if tensor is None:
148
+ continue
149
+ if not hasattr(request, target_name):
150
+ raise AttributeError(
151
+ f"Request has no attribute '{target_name}'")
152
+ setattr(request, target_name, tensor.as_numpy())
153
+ return request
154
+
155
+ def convert_triton_response(self,
156
+ triton_response,
157
+ response_factory: Callable,
158
+ name_map=None):
159
+ response = response_factory()
160
+ for tensor in triton_response.output_tensors():
161
+ if tensor is None:
162
+ continue
163
+ triton_name = tensor.name()
164
+ if tensor.is_cpu():
165
+ value = tensor.as_numpy()
166
+ else:
167
+ # If the tensor is in GPU memory make it torch.Tensor type
168
+ value = from_dlpack(tensor.to_dlpack())
169
+ target_name = triton_name
170
+ if name_map and triton_name in name_map:
171
+ target_name = name_map[triton_name]
172
+ if name_map and triton_name not in name_map:
173
+ continue
174
+ if target_name is None:
175
+ # explicitly ignore this triton input
176
+ continue
177
+ if not hasattr(response, target_name):
178
+ raise AttributeError(
179
+ f"response object has no attribute '{target_name}'")
180
+ setattr(response, target_name, value)
181
+ return response
182
+
183
+ def __undo_reshape(self, x, name):
184
+ if name in self.__undo_reshape_whitelist and len(x.shape) == 1:
185
+ # handle reshapes
186
+ return np.expand_dims(x, 0)
187
+ else:
188
+ return x
189
+
190
+ def create_triton_tensors(self, obj, name_map: dict):
191
+ tensors = []
192
+ for name, triton_name in name_map.items():
193
+ if triton_name is None:
194
+ continue
195
+ value = getattr(obj, name)
196
+ if value is None:
197
+ continue
198
+ if isinstance(value, np.ndarray):
199
+ t = pb_utils.Tensor(triton_name,
200
+ self.__undo_reshape(value, name))
201
+ elif isinstance(value, torch.Tensor):
202
+ t = pb_utils.Tensor.from_dlpack(
203
+ triton_name, to_dlpack(self.__undo_reshape(value, name)))
204
+ tensors.append(t)
205
+ return tensors
206
+
207
+ @override
208
+ def preprocess(self, request: Request) -> PreprocResponse:
209
+ input_tensors = self._get_preproc_tensors(request)
210
+ triton_req = pb_utils.InferenceRequest(
211
+ model_name=self.preproc_model_name,
212
+ inputs=input_tensors,
213
+ requested_output_names=self._preproc_outputs)
214
+ triton_output = self._exec_triton_request_single(triton_req)
215
+ return self._get_preproc_response(triton_output)
216
+
217
+ def _get_preproc_tensors(self, request: Request):
218
+ name_map = {
219
+ "text_input": "QUERY",
220
+ "image_bytes_input": "IMAGE_BYTES",
221
+ "image_url_input": "IMAGE_URL",
222
+ "video_bytes_input": "VIDEO_BYTES",
223
+ "decoder_text_input": "DECODER_QUERY",
224
+ "max_tokens": "REQUEST_OUTPUT_LEN",
225
+ "bad_words": "BAD_WORDS_DICT",
226
+ "stop_words": "STOP_WORDS_DICT",
227
+ "embedding_bias_words": "EMBEDDING_BIAS_WORDS",
228
+ "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS",
229
+ "pad_id": "PAD_ID",
230
+ "end_id": "END_ID",
231
+ "prompt_table_extra_id": "PROMPT_TABLE_EXTRA_ID",
232
+ }
233
+ return self.create_triton_tensors(request, name_map)
234
+
235
+ def _get_preproc_response(self, triton_output):
236
+ name_map = {
237
+ "INPUT_ID": "input_ids",
238
+ "DECODER_INPUT_ID": "decoder_input_ids",
239
+ "REQUEST_INPUT_LEN": "input_lengths",
240
+ "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
241
+ "BAD_WORDS_IDS": "bad_words_list",
242
+ "STOP_WORDS_IDS": "stop_words_list",
243
+ "EMBEDDING_BIAS": "embedding_bias",
244
+ "OUT_PAD_ID": "pad_id",
245
+ "OUT_END_ID": "end_id",
246
+ "OUT_PROMPT_TABLE_EXTRA_IDS": "prompt_table_extra_ids",
247
+ "PIXEL_VALUES": "pixel_values",
248
+ "IMAGE_SIZES": "image_sizes",
249
+ "IS_VIDEO_INPUT": "is_video_input",
250
+ }
251
+ return self.convert_triton_response(triton_output, PreprocResponse,
252
+ name_map)
253
+
254
+ @override
255
+ def _multimodal_enc_generate(
256
+ self,
257
+ request: Request,
258
+ preproc: PreprocResponse,
259
+ ) -> MultimodalEncResponse:
260
+ input_tensors = self._get_multimodal_enc_tensors(request, preproc)
261
+ triton_req = pb_utils.InferenceRequest(
262
+ model_name=self.multimodal_encoders_name,
263
+ inputs=input_tensors,
264
+ requested_output_names=self._multimodal_enc_outputs)
265
+ triton_output = self._exec_triton_request_single(triton_req)
266
+ return self._get_multimodal_enc_response(triton_output)
267
+
268
+ def _get_multimodal_enc_tensors(self, request: Request,
269
+ preproc: PreprocResponse):
270
+ name_map_request = {
271
+ "image_input": "IMAGE",
272
+ }
273
+ name_map_preproc = {
274
+ "pixel_values": "pixel_values",
275
+ "image_sizes": "image_sizes",
276
+ "is_video_input": "is_video_input"
277
+ }
278
+ tensors = []
279
+ tensors.extend(self.create_triton_tensors(request, name_map_request))
280
+ tensors.extend(self.create_triton_tensors(preproc, name_map_preproc))
281
+ return tensors
282
+
283
+ def _get_multimodal_enc_response(self, triton_output):
284
+ name_map = {
285
+ "OUT_PROMPT_EMBEDDING_TABLE": "prompt_embedding_table",
286
+ "OUT_PROMPT_VOCAB_SIZE": "prompt_vocab_size",
287
+ }
288
+ return self.convert_triton_response(triton_output,
289
+ MultimodalEncResponse, name_map)
290
+
291
+ @override
292
+ def _draft_generate_non_streaming(
293
+ self, preproc: PreprocResponse, request: Request,
294
+ num_draft_tokens: int) -> GenerationResponse:
295
+ input_tensors = self._get_llm_tensors(preproc, request,
296
+ num_draft_tokens, None, True)
297
+ triton_req = pb_utils.InferenceRequest(
298
+ model_name=self.draft_llm_model_name,
299
+ inputs=input_tensors,
300
+ requested_output_names=self._llm_outputs)
301
+ triton_response = self._exec_triton_request_single(triton_req)
302
+ llm_response = self._get_llm_response(triton_response)
303
+ return llm_response
304
+
305
+ @override
306
+ def _generate(
307
+ self,
308
+ preproc: PreprocResponse,
309
+ request: Request,
310
+ draft_request: Optional[DraftRequest] = None,
311
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None
312
+ ) -> Generator[GenerationResponse, None, None]:
313
+ input_tensors = self._get_llm_tensors(
314
+ preproc,
315
+ request,
316
+ None,
317
+ draft_request,
318
+ multimodal_enc_response=multimodal_enc_response)
319
+ triton_req = pb_utils.InferenceRequest(
320
+ model_name=self.llm_model_name,
321
+ inputs=input_tensors,
322
+ requested_output_names=self._llm_outputs)
323
+ for r in self._exec_triton_request(triton_req):
324
+ yield self._get_llm_response(r)
325
+
326
+ @override
327
+ def _generate_non_streaming(
328
+ self,
329
+ preproc: PreprocResponse,
330
+ request: Request,
331
+ draft_request: Optional[DraftRequest] = None,
332
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None
333
+ ) -> GenerationResponse:
334
+ input_tensors = self._get_llm_tensors(
335
+ preproc,
336
+ request,
337
+ None,
338
+ draft_request,
339
+ multimodal_enc_response=multimodal_enc_response)
340
+ triton_req = pb_utils.InferenceRequest(
341
+ model_name=self.llm_model_name,
342
+ inputs=input_tensors,
343
+ requested_output_names=self._llm_outputs)
344
+ r = self._exec_triton_request_single(triton_req)
345
+ return self._get_llm_response(r)
346
+
347
+ def _get_llm_tensors(
348
+ self,
349
+ preproc: PreprocResponse,
350
+ request: Request,
351
+ num_output_tokens: Optional[int] = None,
352
+ draft_request: Optional[DraftRequest] = None,
353
+ is_draft_model_request: bool = False,
354
+ multimodal_enc_response: MultimodalEncResponse = None):
355
+ tensors = []
356
+ tensors.extend(self._get_tensors_from_preproc(preproc))
357
+ if multimodal_enc_response is not None:
358
+ tensors.extend(
359
+ self._get_tensors_from_multimodal_enc(multimodal_enc_response))
360
+ tensors.extend(
361
+ self._get_llm_tensors_from_request(request, num_output_tokens,
362
+ draft_request,
363
+ is_draft_model_request))
364
+ return tensors
365
+
366
+ def _get_tensors_from_preproc(self, preproc: PreprocResponse):
367
+ name_map = {
368
+ "input_ids": "input_ids",
369
+ "decoder_input_ids": "decoder_input_ids",
370
+ "input_lengths": "input_lengths",
371
+ "bad_words_list": "bad_words_list",
372
+ "stop_words_list": "stop_words_list",
373
+ "embedding_bias": "embedding_bias",
374
+ "pad_id": "pad_id",
375
+ "end_id": "end_id",
376
+ "prompt_table_extra_ids": "prompt_table_extra_ids",
377
+ }
378
+ return self.create_triton_tensors(preproc, name_map)
379
+
380
+ def _get_tensors_from_multimodal_enc(
381
+ self, multimodal_enc_response: MultimodalEncResponse):
382
+ name_map = {
383
+ "prompt_embedding_table": "prompt_embedding_table",
384
+ "prompt_vocab_size": "prompt_vocab_size",
385
+ }
386
+ return self.create_triton_tensors(multimodal_enc_response, name_map)
387
+
388
+ def _get_llm_tensors_from_request(
389
+ self,
390
+ request: Request,
391
+ num_output_tokens: Optional[int] = None,
392
+ draft_request: Optional[DraftRequest] = None,
393
+ is_draft_model_request: bool = False):
394
+ name_map: Dict[str, Optional[str]] = {
395
+ "beam_width": "beam_width",
396
+ "top_k": "runtime_top_k",
397
+ "top_p": "runtime_top_p",
398
+ "temperature": "temperature",
399
+ "length_penalty": "len_penalty",
400
+ "repetition_penalty": "repetition_penalty",
401
+ "min_length": "min_length",
402
+ "presence_penalty": "presence_penalty",
403
+ "frequency_penalty": "frequency_penalty",
404
+ "random_seed": "random_seed",
405
+ "return_log_probs": "return_log_probs",
406
+ "stream": "streaming",
407
+ "prompt_embedding_table": "prompt_embedding_table",
408
+ "prompt_vocab_size": "prompt_vocab_size",
409
+ "lora_task_id": "lora_task_id",
410
+ "lora_weights": "lora_weights",
411
+ "lora_config": "lora_config",
412
+ "exclude_input_in_output": "exclude_input_in_output",
413
+ "return_kv_cache_reuse_stats": "return_kv_cache_reuse_stats",
414
+ "guided_decoding_guide_type": "guided_decoding_guide_type",
415
+ "guided_decoding_guide": "guided_decoding_guide"
416
+ }
417
+ batch_size = request.text_input.shape[0]
418
+ tensors = self.create_triton_tensors(request, name_map)
419
+ out_len_tensor = None
420
+ if request.max_tokens is not None:
421
+ out_len_tensor = request.max_tokens
422
+
423
+ out_len = None
424
+ if num_output_tokens is not None:
425
+ out_len = num_output_tokens
426
+ elif draft_request:
427
+ out_len = len(
428
+ draft_request.draft_input_ids[0]
429
+ ) + 1 if draft_request.draft_input_ids is not None else 1
430
+
431
+ if out_len is not None:
432
+ out_len_tensor = [[out_len]] * batch_size
433
+
434
+ if out_len_tensor is None:
435
+ raise Exception("Could not determine request_output_len")
436
+ else:
437
+ tensors.append(
438
+ pb_utils.Tensor("request_output_len",
439
+ np.array(out_len_tensor, dtype=np.int32)))
440
+
441
+ if draft_request:
442
+ if draft_request.draft_input_ids is not None:
443
+ tensors.append(
444
+ pb_utils.Tensor("draft_input_ids",
445
+ draft_request.draft_input_ids))
446
+ if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[
447
+ 0]:
448
+ tensors.append(
449
+ pb_utils.Tensor("draft_logits",
450
+ draft_request.draft_logits))
451
+
452
+ return_context_logits_data = [False]
453
+ return_generation_logits_data = [False]
454
+ if draft_request is None:
455
+ if is_draft_model_request:
456
+ return_generation_logits_data = request.use_draft_logits if request.use_draft_logits is not None else [
457
+ False
458
+ ]
459
+ else:
460
+ return_context_logits_data = request.return_context_logits if request.return_context_logits is not None else [
461
+ False
462
+ ]
463
+ return_generation_logits_data = request.return_generation_logits if request.return_generation_logits is not None else [
464
+ False
465
+ ]
466
+ return_context_logits = np.array([return_context_logits_data] *
467
+ batch_size,
468
+ dtype=bool)
469
+ return_generation_logits = np.array([return_generation_logits_data] *
470
+ batch_size,
471
+ dtype=bool)
472
+
473
+ assert len(return_context_logits.shape) == 2
474
+ assert len(return_generation_logits.shape) == 2
475
+
476
+ tensors.append(
477
+ pb_utils.Tensor("return_context_logits", return_context_logits))
478
+ tensors.append(
479
+ pb_utils.Tensor("return_generation_logits",
480
+ return_generation_logits))
481
+ return tensors
482
+
483
+ def _get_llm_response(self, triton_output):
484
+ name_map = {
485
+ "output_ids": "output_ids",
486
+ "sequence_length": "sequence_length",
487
+ "cum_log_probs": "cum_log_probs",
488
+ "output_log_probs": "output_log_probs",
489
+ "context_logits": "context_logits",
490
+ "generation_logits": "generation_logits",
491
+ "batch_index": "batch_index",
492
+ "sequence_index": "sequence_index",
493
+ "kv_cache_alloc_new_blocks": "kv_cache_alloc_new_blocks",
494
+ "kv_cache_reused_blocks": "kv_cache_reused_blocks",
495
+ "kv_cache_alloc_total_blocks": "kv_cache_alloc_total_blocks"
496
+ }
497
+ return self.convert_triton_response(triton_output, GenerationResponse,
498
+ name_map)
499
+
500
+ def _postprocess(self, tokens: np.ndarray,
501
+ sequence_lengths: Optional[np.ndarray],
502
+ gen_response: GenerationResponse) -> Response:
503
+ input_tensors = self._get_postproc_tensors(tokens, sequence_lengths,
504
+ gen_response)
505
+ triton_req = pb_utils.InferenceRequest(
506
+ model_name=self.postproc_model_name,
507
+ inputs=input_tensors,
508
+ requested_output_names=self._postproc_outputs)
509
+ r = self._exec_triton_request_single(triton_req)
510
+ response = self._get_response(r, gen_response)
511
+ return response
512
+
513
+ def _get_postproc_tensors(self, tokens: np.ndarray,
514
+ sequence_lengths: Optional[np.ndarray],
515
+ gen_response: GenerationResponse):
516
+ tensors = [
517
+ pb_utils.Tensor("TOKENS_BATCH", tokens),
518
+ pb_utils.Tensor(
519
+ "SEQUENCE_LENGTH", sequence_lengths
520
+ if sequence_lengths else gen_response.sequence_length)
521
+ ]
522
+ return tensors
523
+
524
+ def _get_response(self, triton_output, gen_res: GenerationResponse):
525
+ tensors = triton_output.output_tensors()
526
+ t_map = {}
527
+ for named_t in tensors:
528
+ name = named_t.name()
529
+ t = named_t.as_numpy()
530
+ t_map[name] = t
531
+ response = Response(
532
+ text_output=t_map["OUTPUT"],
533
+ cum_log_probs=gen_res.cum_log_probs,
534
+ output_log_probs=gen_res.output_log_probs,
535
+ context_logits=gen_res.context_logits,
536
+ generation_logits=gen_res.generation_logits,
537
+ batch_index=gen_res.batch_index,
538
+ sequence_index=gen_res.sequence_index,
539
+ kv_cache_alloc_new_blocks=gen_res.kv_cache_alloc_new_blocks,
540
+ kv_cache_reused_blocks=gen_res.kv_cache_reused_blocks,
541
+ kv_cache_alloc_total_blocks=gen_res.kv_cache_alloc_total_blocks)
542
+ return response
tensorrt_llm_bls/1/model.py ADDED
@@ -0,0 +1,146 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+ import traceback
29
+
30
+ import triton_python_backend_utils as pb_utils
31
+ from lib.triton_decoder import TritonDecoder
32
+
33
+
34
+ def get_valid_param_value(param, default_value=''):
35
+ value = param.get('string_value', '')
36
+ return default_value if value.startswith('${') or value == '' else value
37
+
38
+
39
+ class TritonPythonModel:
40
+
41
+ def initialize(self, args):
42
+
43
+ # Parse model configs
44
+ model_config = json.loads(args['model_config'])
45
+
46
+ params = model_config['parameters']
47
+
48
+ accumulate_tokens_str = get_valid_param_value(
49
+ params.get('accumulate_tokens', {}))
50
+ self.accumulate_tokens = accumulate_tokens_str.lower() in [
51
+ 'true', 'yes', '1', 't'
52
+ ]
53
+
54
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
55
+ model_config)
56
+
57
+ self.logger = pb_utils.Logger
58
+
59
+ default_tensorrt_llm_model_name = 'tensorrt_llm'
60
+ self.llm_model_name = get_valid_param_value(
61
+ params.get('tensorrt_llm_model_name', {}),
62
+ default_tensorrt_llm_model_name)
63
+
64
+ self.draft_llm_model_name = get_valid_param_value(
65
+ params.get('tensorrt_llm_draft_model_name', {}), None)
66
+
67
+ self.multimodal_encoders_name = get_valid_param_value(
68
+ params.get('multimodal_encoders_name', {}), None)
69
+
70
+ self.decoder = TritonDecoder(
71
+ streaming=self.decoupled,
72
+ accumulate=self.accumulate_tokens,
73
+ preproc_model_name="preprocessing",
74
+ postproc_model_name="postprocessing",
75
+ llm_model_name=self.llm_model_name,
76
+ draft_llm_model_name=self.draft_llm_model_name,
77
+ multimodal_encoders_name=self.multimodal_encoders_name)
78
+
79
+ def execute(self, requests):
80
+
81
+ responses = []
82
+
83
+ for request in requests:
84
+ if self.decoupled:
85
+ response_sender = request.get_response_sender()
86
+ try:
87
+
88
+ req = self.decoder.convert_triton_request(request)
89
+ req.validate()
90
+ speculative_decode = (req.num_draft_tokens is not None
91
+ and req.num_draft_tokens[0][0] > 0)
92
+ if speculative_decode and (self.draft_llm_model_name is None
93
+ or self.draft_llm_model_name == ""):
94
+ raise Exception(
95
+ "cannot perform speculative decoding without draft model"
96
+ )
97
+ is_multimodal = req.image_input is not None or req.image_bytes_input is not None or req.image_url_input is not None or req.video_bytes_input is not None
98
+
99
+ if speculative_decode and is_multimodal:
100
+ raise Exception(
101
+ "Multimodal and speculative decoding is not currently supported"
102
+ )
103
+ res_gen = self.decoder.decode(
104
+ req,
105
+ speculative_decoding=speculative_decode,
106
+ is_multimodal=is_multimodal)
107
+
108
+ for res in res_gen:
109
+ triton_response = self.decoder.create_triton_response(res)
110
+ if self.decoupled:
111
+ response_sender.send(triton_response)
112
+ else:
113
+ responses.append(triton_response)
114
+
115
+ if self.decoupled:
116
+ response_sender.send(
117
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
118
+
119
+ except Exception:
120
+ self.logger.log_error(traceback.format_exc())
121
+ # If encountering an error, send a response with the error message
122
+ error_response = pb_utils.InferenceResponse(
123
+ output_tensors=[],
124
+ error=pb_utils.TritonError(traceback.format_exc()))
125
+
126
+ if self.decoupled:
127
+ response_sender.send(error_response)
128
+ response_sender.send(
129
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
130
+ else:
131
+ responses.append(error_response)
132
+
133
+ self.decoder.reset_decoder()
134
+
135
+ if self.decoupled:
136
+ return None
137
+ else:
138
+ assert len(responses) == len(requests)
139
+ return responses
140
+
141
+ def finalize(self):
142
+ """`finalize` is called only once when the model is being unloaded.
143
+ Implementing `finalize` function is optional. This function allows
144
+ the model to perform any necessary clean ups before exit.
145
+ """
146
+ print('Cleaning up...')
tensorrt_llm_bls/config.pbtxt ADDED
@@ -0,0 +1,388 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm_bls"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+
31
+ model_transaction_policy {
32
+ decoupled: True
33
+ }
34
+
35
+ input [
36
+ {
37
+ name: "text_input"
38
+ data_type: TYPE_STRING
39
+ dims: [ 1 ]
40
+ },
41
+ {
42
+ name: "decoder_text_input"
43
+ data_type: TYPE_STRING
44
+ dims: [ 1 ]
45
+ optional: true
46
+ },
47
+ {
48
+ name: "image_input"
49
+ data_type: TYPE_FP16
50
+ dims: [ -1, 3, -1, -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "image_bytes_input"
55
+ data_type: TYPE_UINT8
56
+ dims: [ -1, -1, -1, -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "image_url_input"
61
+ data_type: TYPE_STRING
62
+ dims: [ 1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "video_bytes_input"
67
+ data_type: TYPE_UINT8
68
+ dims: [ -1, -1, -1, -1 ]
69
+ optional: true
70
+ },
71
+ {
72
+ name: "max_tokens"
73
+ data_type: TYPE_INT32
74
+ dims: [ 1 ]
75
+ },
76
+ {
77
+ name: "bad_words"
78
+ data_type: TYPE_STRING
79
+ dims: [ -1 ]
80
+ optional: true
81
+ },
82
+ {
83
+ name: "stop_words"
84
+ data_type: TYPE_STRING
85
+ dims: [ -1 ]
86
+ optional: true
87
+ },
88
+ {
89
+ name: "exclude_input_in_output"
90
+ data_type: TYPE_BOOL
91
+ dims: [ 1 ]
92
+ optional: true
93
+ },
94
+ {
95
+ name: "end_id"
96
+ data_type: TYPE_INT32
97
+ dims: [ 1 ]
98
+ optional: true
99
+ },
100
+ {
101
+ name: "pad_id"
102
+ data_type: TYPE_INT32
103
+ dims: [ 1 ]
104
+ optional: true
105
+ },
106
+ {
107
+ name: "top_k"
108
+ data_type: TYPE_INT32
109
+ dims: [ 1 ]
110
+ optional: true
111
+ },
112
+ {
113
+ name: "top_p"
114
+ data_type: TYPE_FP32
115
+ dims: [ 1 ]
116
+ optional: true
117
+ },
118
+ {
119
+ name: "temperature"
120
+ data_type: TYPE_FP32
121
+ dims: [ 1 ]
122
+ optional: true
123
+ },
124
+ {
125
+ name: "length_penalty"
126
+ data_type: TYPE_FP32
127
+ dims: [ 1 ]
128
+ optional: true
129
+ },
130
+ {
131
+ name: "repetition_penalty"
132
+ data_type: TYPE_FP32
133
+ dims: [ 1 ]
134
+ optional: true
135
+ },
136
+ {
137
+ name: "min_length"
138
+ data_type: TYPE_INT32
139
+ dims: [ 1 ]
140
+ optional: true
141
+ },
142
+ {
143
+ name: "presence_penalty"
144
+ data_type: TYPE_FP32
145
+ dims: [ 1 ]
146
+ optional: true
147
+ },
148
+ {
149
+ name: "frequency_penalty"
150
+ data_type: TYPE_FP32
151
+ dims: [ 1 ]
152
+ optional: true
153
+ },
154
+ {
155
+ name: "random_seed"
156
+ data_type: TYPE_UINT64
157
+ dims: [ 1 ]
158
+ optional: true
159
+ },
160
+ {
161
+ name: "return_log_probs"
162
+ data_type: TYPE_BOOL
163
+ dims: [ 1 ]
164
+ reshape: { shape: [ ] }
165
+ optional: true
166
+ },
167
+ {
168
+ name: "return_context_logits"
169
+ data_type: TYPE_BOOL
170
+ dims: [ 1 ]
171
+ reshape: { shape: [ ] }
172
+ optional: true
173
+ },
174
+ {
175
+ name: "return_generation_logits"
176
+ data_type: TYPE_BOOL
177
+ dims: [ 1 ]
178
+ reshape: { shape: [ ] }
179
+ optional: true
180
+ },
181
+ {
182
+ name: "num_return_sequences"
183
+ data_type: TYPE_INT32
184
+ dims: [ 1 ]
185
+ reshape: { shape: [ ] }
186
+ optional: true
187
+ },
188
+ {
189
+ name: "beam_width"
190
+ data_type: TYPE_INT32
191
+ dims: [ 1 ]
192
+ optional: true
193
+ },
194
+ {
195
+ name: "stream"
196
+ data_type: TYPE_BOOL
197
+ dims: [ 1 ]
198
+ optional: true
199
+ },
200
+ {
201
+ name: "prompt_embedding_table"
202
+ data_type: TYPE_FP16
203
+ dims: [ -1, -1 ]
204
+ optional: true
205
+ },
206
+ {
207
+ name: "prompt_vocab_size"
208
+ data_type: TYPE_INT32
209
+ dims: [ 1 ]
210
+ optional: true
211
+ },
212
+ {
213
+ name: "prompt_table_extra_id"
214
+ data_type: TYPE_UINT64
215
+ dims: [ 1 ]
216
+ optional: true
217
+ },
218
+ {
219
+ name: "embedding_bias_words"
220
+ data_type: TYPE_STRING
221
+ dims: [ -1 ]
222
+ optional: true
223
+ },
224
+ {
225
+ name: "embedding_bias_weights"
226
+ data_type: TYPE_FP32
227
+ dims: [ -1 ]
228
+ optional: true
229
+ },
230
+ {
231
+ name: "num_draft_tokens",
232
+ data_type: TYPE_INT32,
233
+ dims: [ 1 ]
234
+ optional: true
235
+ },
236
+ {
237
+ name: "use_draft_logits",
238
+ data_type: TYPE_BOOL,
239
+ dims: [ 1 ]
240
+ reshape: { shape: [ ] }
241
+ optional: true
242
+ },
243
+ # the unique task ID for the given LoRA.
244
+ # To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights` and `lora_config` must all be given.
245
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
246
+ # If the cache is full, the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
247
+ {
248
+ name: "lora_task_id"
249
+ data_type: TYPE_UINT64
250
+ dims: [ 1 ]
251
+ reshape: { shape: [ ] }
252
+ optional: true
253
+ },
254
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
255
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
256
+ # each of the in / out tensors is first flattened and then concatenated together in the format above.
257
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
258
+ {
259
+ name: "lora_weights"
260
+ data_type: TYPE_FP16
261
+ dims: [ -1, -1 ]
262
+ optional: true
263
+ allow_ragged_batch: true
264
+ },
265
+ # module identifier (same size as the first dimension of lora_weights)
266
+ # See LoraModule::ModuleType for the module id mapping
267
+ #
268
+ # "attn_qkv": 0 # combined qkv adapter
269
+ # "attn_q": 1 # q adapter
270
+ # "attn_k": 2 # k adapter
271
+ # "attn_v": 3 # v adapter
272
+ # "attn_dense": 4 # adapter for the dense layer in attention
273
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
274
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
275
+ # "mlp_gate": 7 # for llama2 adapter for gated mlp layer after attention / RMSNorm: gate
276
+ #
277
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
278
+ {
279
+ name: "lora_config"
280
+ data_type: TYPE_INT32
281
+ dims: [ -1, 3 ]
282
+ optional: true
283
+ allow_ragged_batch: true
284
+ },
285
+ {
286
+ name: "return_kv_cache_reuse_stats"
287
+ data_type: TYPE_BOOL
288
+ dims: [ 1 ]
289
+ reshape: { shape: [ ] }
290
+ optional: true
291
+ },
292
+ {
293
+ name: "guided_decoding_guide_type"
294
+ data_type: TYPE_STRING
295
+ dims: [ 1 ]
296
+ optional: true
297
+ },
298
+ {
299
+ name: "guided_decoding_guide"
300
+ data_type: TYPE_STRING
301
+ dims: [ 1 ]
302
+ optional: true
303
+ }
304
+ ]
305
+ output [
306
+ {
307
+ name: "text_output"
308
+ data_type: TYPE_STRING
309
+ dims: [ -1 ]
310
+ },
311
+ {
312
+ name: "cum_log_probs"
313
+ data_type: TYPE_FP32
314
+ dims: [ -1 ]
315
+ },
316
+ {
317
+ name: "output_log_probs"
318
+ data_type: TYPE_FP32
319
+ dims: [ -1, -1 ]
320
+ },
321
+ {
322
+ name: "context_logits"
323
+ data_type: TYPE_FP16
324
+ dims: [ -1, -1 ]
325
+ },
326
+ {
327
+ name: "generation_logits"
328
+ data_type: TYPE_FP16
329
+ dims: [ -1, -1, -1 ]
330
+ },
331
+ {
332
+ name: "batch_index"
333
+ data_type: TYPE_INT32
334
+ dims: [ 1 ]
335
+ },
336
+ {
337
+ name: "sequence_index"
338
+ data_type: TYPE_INT32
339
+ dims: [ 1 ]
340
+ },
341
+ {
342
+ name: "kv_cache_alloc_new_blocks"
343
+ data_type: TYPE_INT32
344
+ dims: [ 1 ]
345
+ },
346
+ {
347
+ name: "kv_cache_reused_blocks"
348
+ data_type: TYPE_INT32
349
+ dims: [ 1 ]
350
+ },
351
+ {
352
+ name: "kv_cache_alloc_total_blocks"
353
+ data_type: TYPE_INT32
354
+ dims: [ 1 ]
355
+ }
356
+ ]
357
+
358
+ parameters: {
359
+ key: "accumulate_tokens"
360
+ value: {
361
+ string_value: "${accumulate_tokens}"
362
+ }
363
+ }
364
+ parameters: {
365
+ key: "tensorrt_llm_model_name"
366
+ value: {
367
+ string_value: "tensorrt_llm"
368
+ }
369
+ }
370
+ parameters: {
371
+ key: "tensorrt_llm_draft_model_name"
372
+ value: {
373
+ string_value: "${tensorrt_llm_draft_model_name}"
374
+ }
375
+ }
376
+ parameters: {
377
+ key: "multimodal_encoders_name"
378
+ value: {
379
+ string_value: "${multimodal_encoders_name}"
380
+ }
381
+ }
382
+
383
+ instance_group [
384
+ {
385
+ count: 1
386
+ kind : KIND_CPU
387
+ }
388
+ ]
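To make the lora_weights / lora_config layout described in the comments of the config above concrete, here is a minimal numpy sketch (hypothetical; every size, the module id usage and the task id are made-up example values, not taken from this repository):

import numpy as np

adapter_size = 8          # D, the LoRA rank / R value (example value)
hidden_in = 4096          # Hi (example value)
hidden_out = 4096         # Ho (example value)
num_layers = 2            # number of (module, layer) rows (example value)
attn_qkv_module_id = 0    # per the module-id mapping comment above

weight_rows = []
config_rows = []
for layer_idx in range(num_layers):
    # in-weights [D, Hi] and out-weights [Ho, D], each flattened, then concatenated
    w_in = np.zeros((adapter_size, hidden_in), dtype=np.float16)
    w_out = np.zeros((hidden_out, adapter_size), dtype=np.float16)
    weight_rows.append(np.concatenate([w_in.reshape(-1), w_out.reshape(-1)]))
    config_rows.append([attn_qkv_module_id, layer_idx, adapter_size])

# lora_weights: [ num_lora_modules_layers, D x Hi + Ho x D ], matching TYPE_FP16
lora_weights = np.stack(weight_rows).astype(np.float16)
# lora_config: [ num_lora_modules_layers, 3 ] = [ module_id, layer_idx, adapter_size ]
lora_config = np.array(config_rows, dtype=np.int32)
# lora_task_id: a single uint64; later requests for the same task can send only this
lora_task_id = np.array([1], dtype=np.uint64)

print(lora_weights.shape, lora_config.shape, lora_task_id.shape)

A real client request would typically also include the batch dimension implied by max_batch_size and send these arrays under the input names defined in the config above.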