otherhalf-dev committed
Commit 7567662 · verified · 1 Parent(s): f6f26d0

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tensorrt_llm/1/rank0.engine filter=lfs diff=lfs merge=lfs -text
+ tensorrt_llm/1/rank1.engine filter=lfs diff=lfs merge=lfs -text
ensemble/1/.tmp ADDED
File without changes
ensemble/config.pbtxt ADDED
@@ -0,0 +1,487 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "ensemble"
28
+ platform: "ensemble"
29
+ max_batch_size: 32
30
+ input [
31
+ {
32
+ name: "text_input"
33
+ data_type: TYPE_STRING
34
+ dims: [ 1 ]
35
+ },
36
+ {
37
+ name: "decoder_text_input"
38
+ data_type: TYPE_STRING
39
+ dims: [ 1 ]
40
+ optional: true
41
+ },
42
+ {
43
+ name: "max_tokens"
44
+ data_type: TYPE_INT32
45
+ dims: [ 1 ]
46
+ },
47
+ {
48
+ name: "bad_words"
49
+ data_type: TYPE_STRING
50
+ dims: [ -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "stop_words"
55
+ data_type: TYPE_STRING
56
+ dims: [ -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "end_id"
61
+ data_type: TYPE_INT32
62
+ dims: [ 1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "pad_id"
67
+ data_type: TYPE_INT32
68
+ dims: [ 1 ]
69
+ optional: true
70
+ },
71
+ {
72
+ name: "top_k"
73
+ data_type: TYPE_INT32
74
+ dims: [ 1 ]
75
+ optional: true
76
+ },
77
+ {
78
+ name: "top_p"
79
+ data_type: TYPE_FP32
80
+ dims: [ 1 ]
81
+ optional: true
82
+ },
83
+ {
84
+ name: "temperature"
85
+ data_type: TYPE_FP32
86
+ dims: [ 1 ]
87
+ optional: true
88
+ },
89
+ {
90
+ name: "length_penalty"
91
+ data_type: TYPE_FP32
92
+ dims: [ 1 ]
93
+ optional: true
94
+ },
95
+ {
96
+ name: "repetition_penalty"
97
+ data_type: TYPE_FP32
98
+ dims: [ 1 ]
99
+ optional: true
100
+ },
101
+ {
102
+ name: "min_length"
103
+ data_type: TYPE_INT32
104
+ dims: [ 1 ]
105
+ optional: true
106
+ },
107
+ {
108
+ name: "presence_penalty"
109
+ data_type: TYPE_FP32
110
+ dims: [ 1 ]
111
+ optional: true
112
+ },
113
+ {
114
+ name: "frequency_penalty"
115
+ data_type: TYPE_FP32
116
+ dims: [ 1 ]
117
+ optional: true
118
+ },
119
+ {
120
+ name: "random_seed"
121
+ data_type: TYPE_UINT64
122
+ dims: [ 1 ]
123
+ optional: true
124
+ },
125
+ {
126
+ name: "return_log_probs"
127
+ data_type: TYPE_BOOL
128
+ dims: [ 1 ]
129
+ optional: true
130
+ },
131
+ {
132
+ name: "return_context_logits"
133
+ data_type: TYPE_BOOL
134
+ dims: [ 1 ]
135
+ optional: true
136
+ },
137
+ {
138
+ name: "return_generation_logits"
139
+ data_type: TYPE_BOOL
140
+ dims: [ 1 ]
141
+ optional: true
142
+ },
143
+ {
144
+ name: "beam_width"
145
+ data_type: TYPE_INT32
146
+ dims: [ 1 ]
147
+ optional: true
148
+ },
149
+ {
150
+ name: "stream"
151
+ data_type: TYPE_BOOL
152
+ dims: [ 1 ]
153
+ optional: true
154
+ },
155
+ {
156
+ name: "prompt_embedding_table"
157
+ data_type: TYPE_FP16
158
+ dims: [ -1, -1 ]
159
+ optional: true
160
+ },
161
+ {
162
+ name: "prompt_vocab_size"
163
+ data_type: TYPE_INT32
164
+ dims: [ 1 ]
165
+ optional: true
166
+ },
167
+ {
168
+ name: "embedding_bias_words"
169
+ data_type: TYPE_STRING
170
+ dims: [ -1 ]
171
+ optional: true
172
+ },
173
+ {
174
+ name: "embedding_bias_weights"
175
+ data_type: TYPE_FP32
176
+ dims: [ -1 ]
177
+ optional: true
178
+ }
179
+ ]
180
+ output [
181
+ {
182
+ name: "text_output"
183
+ data_type: TYPE_STRING
184
+ dims: [ -1 ]
185
+ },
186
+ {
187
+ name: "cum_log_probs"
188
+ data_type: TYPE_FP32
189
+ dims: [ -1 ]
190
+ },
191
+ {
192
+ name: "output_log_probs"
193
+ data_type: TYPE_FP32
194
+ dims: [ -1, -1 ]
195
+ },
196
+ {
197
+ name: "context_logits"
198
+ data_type: TYPE_FP32
199
+ dims: [ -1, -1 ]
200
+ },
201
+ {
202
+ name: "generation_logits"
203
+ data_type: TYPE_FP32
204
+ dims: [ -1, -1, -1 ]
205
+ },
206
+ {
207
+ name: "batch_index"
208
+ data_type: TYPE_INT32
209
+ dims: [ 1 ]
210
+ }
211
+ ]
212
+ ensemble_scheduling {
213
+ step [
214
+ {
215
+ model_name: "preprocessing"
216
+ model_version: -1
217
+ input_map {
218
+ key: "QUERY"
219
+ value: "text_input"
220
+ }
221
+ input_map {
222
+ key: "DECODER_QUERY"
223
+ value: "decoder_text_input"
224
+ }
225
+ input_map {
226
+ key: "REQUEST_OUTPUT_LEN"
227
+ value: "max_tokens"
228
+ }
229
+ input_map {
230
+ key: "BAD_WORDS_DICT"
231
+ value: "bad_words"
232
+ }
233
+ input_map {
234
+ key: "STOP_WORDS_DICT"
235
+ value: "stop_words"
236
+ }
237
+ input_map {
238
+ key: "EMBEDDING_BIAS_WORDS"
239
+ value: "embedding_bias_words"
240
+ }
241
+ input_map {
242
+ key: "EMBEDDING_BIAS_WEIGHTS"
243
+ value: "embedding_bias_weights"
244
+ }
245
+ input_map {
246
+ key: "END_ID"
247
+ value: "end_id"
248
+ }
249
+ input_map {
250
+ key: "PAD_ID"
251
+ value: "pad_id"
252
+ }
253
+ output_map {
254
+ key: "REQUEST_INPUT_LEN"
255
+ value: "_REQUEST_INPUT_LEN"
256
+ }
257
+ output_map {
258
+ key: "INPUT_ID"
259
+ value: "_INPUT_ID"
260
+ }
261
+ output_map {
262
+ key: "REQUEST_DECODER_INPUT_LEN"
263
+ value: "_REQUEST_DECODER_INPUT_LEN"
264
+ }
265
+ output_map {
266
+ key: "DECODER_INPUT_ID"
267
+ value: "_DECODER_INPUT_ID"
268
+ }
269
+ output_map {
270
+ key: "REQUEST_OUTPUT_LEN"
271
+ value: "_REQUEST_OUTPUT_LEN"
272
+ }
273
+ output_map {
274
+ key: "STOP_WORDS_IDS"
275
+ value: "_STOP_WORDS_IDS"
276
+ }
277
+ output_map {
278
+ key: "BAD_WORDS_IDS"
279
+ value: "_BAD_WORDS_IDS"
280
+ }
281
+ output_map {
282
+ key: "EMBEDDING_BIAS"
283
+ value: "_EMBEDDING_BIAS"
284
+ }
285
+ output_map {
286
+ key: "OUT_END_ID"
287
+ value: "_PREPROCESSOR_END_ID"
288
+ }
289
+ output_map {
290
+ key: "OUT_PAD_ID"
291
+ value: "_PREPROCESSOR_PAD_ID"
292
+ }
293
+ },
294
+ {
295
+ model_name: "tensorrt_llm"
296
+ model_version: -1
297
+ input_map {
298
+ key: "input_ids"
299
+ value: "_INPUT_ID"
300
+ }
301
+ input_map {
302
+ key: "decoder_input_ids"
303
+ value: "_DECODER_INPUT_ID"
304
+ }
305
+ input_map {
306
+ key: "input_lengths"
307
+ value: "_REQUEST_INPUT_LEN"
308
+ }
309
+ input_map {
310
+ key: "decoder_input_lengths"
311
+ value: "_REQUEST_DECODER_INPUT_LEN"
312
+ }
313
+ input_map {
314
+ key: "request_output_len"
315
+ value: "_REQUEST_OUTPUT_LEN"
316
+ }
317
+ input_map {
318
+ key: "end_id"
319
+ value: "_PREPROCESSOR_END_ID"
320
+ }
321
+ input_map {
322
+ key: "pad_id"
323
+ value: "_PREPROCESSOR_PAD_ID"
324
+ }
325
+ input_map {
326
+ key: "embedding_bias"
327
+ value: "_EMBEDDING_BIAS"
328
+ }
329
+ input_map {
330
+ key: "runtime_top_k"
331
+ value: "top_k"
332
+ }
333
+ input_map {
334
+ key: "runtime_top_p"
335
+ value: "top_p"
336
+ }
337
+ input_map {
338
+ key: "temperature"
339
+ value: "temperature"
340
+ }
341
+ input_map {
342
+ key: "len_penalty"
343
+ value: "length_penalty"
344
+ }
345
+ input_map {
346
+ key: "repetition_penalty"
347
+ value: "repetition_penalty"
348
+ }
349
+ input_map {
350
+ key: "min_length"
351
+ value: "min_length"
352
+ }
353
+ input_map {
354
+ key: "presence_penalty"
355
+ value: "presence_penalty"
356
+ }
357
+ input_map {
358
+ key: "frequency_penalty"
359
+ value: "frequency_penalty"
360
+ }
361
+ input_map {
362
+ key: "random_seed"
363
+ value: "random_seed"
364
+ }
365
+ input_map {
366
+ key: "return_log_probs"
367
+ value: "return_log_probs"
368
+ }
369
+ input_map {
370
+ key: "return_context_logits"
371
+ value: "return_context_logits"
372
+ }
373
+ input_map {
374
+ key: "return_generation_logits"
375
+ value: "return_generation_logits"
376
+ }
377
+ input_map {
378
+ key: "beam_width"
379
+ value: "beam_width"
380
+ }
381
+ input_map {
382
+ key: "streaming"
383
+ value: "stream"
384
+ }
385
+ input_map {
386
+ key: "prompt_embedding_table"
387
+ value: "prompt_embedding_table"
388
+ }
389
+ input_map {
390
+ key: "prompt_vocab_size"
391
+ value: "prompt_vocab_size"
392
+ }
393
+ input_map {
394
+ key: "stop_words_list"
395
+ value: "_STOP_WORDS_IDS"
396
+ }
397
+ input_map {
398
+ key: "bad_words_list"
399
+ value: "_BAD_WORDS_IDS"
400
+ }
401
+ output_map {
402
+ key: "output_ids"
403
+ value: "_TOKENS_BATCH"
404
+ }
405
+ output_map {
406
+ key: "sequence_length"
407
+ value: "_SEQUENCE_LENGTH"
408
+ },
409
+ output_map {
410
+ key: "cum_log_probs"
411
+ value: "_CUM_LOG_PROBS"
412
+ }
413
+ output_map {
414
+ key: "output_log_probs"
415
+ value: "_OUTPUT_LOG_PROBS"
416
+ },
417
+ output_map {
418
+ key: "context_logits"
419
+ value: "_CONTEXT_LOGITS"
420
+ },
421
+ output_map {
422
+ key: "generation_logits"
423
+ value: "_GENERATION_LOGITS"
424
+ },
425
+ output_map {
426
+ key: "batch_index"
427
+ value: "_BATCH_INDEX"
428
+ }
429
+ },
430
+ {
431
+ model_name: "postprocessing"
432
+ model_version: -1
433
+ input_map {
434
+ key: "TOKENS_BATCH"
435
+ value: "_TOKENS_BATCH"
436
+ }
437
+ input_map {
438
+ key: "CUM_LOG_PROBS"
439
+ value: "_CUM_LOG_PROBS"
440
+ }
441
+ input_map {
442
+ key: "OUTPUT_LOG_PROBS"
443
+ value: "_OUTPUT_LOG_PROBS"
444
+ }
445
+ input_map {
446
+ key: "CONTEXT_LOGITS"
447
+ value: "_CONTEXT_LOGITS"
448
+ }
449
+ input_map {
450
+ key: "GENERATION_LOGITS"
451
+ value: "_GENERATION_LOGITS"
452
+ }
453
+ input_map {
454
+ key: "SEQUENCE_LENGTH"
455
+ value: "_SEQUENCE_LENGTH"
456
+ }
457
+ input_map {
458
+ key: "BATCH_INDEX"
459
+ value: "_BATCH_INDEX"
460
+ }
461
+ output_map {
462
+ key: "OUTPUT"
463
+ value: "text_output"
464
+ }
465
+ output_map {
466
+ key: "OUT_OUTPUT_LOG_PROBS"
467
+ value: "output_log_probs"
468
+ }
469
+ output_map {
470
+ key: "OUT_CUM_LOG_PROBS"
471
+ value: "cum_log_probs"
472
+ }
473
+ output_map {
474
+ key: "OUT_CONTEXT_LOGITS"
475
+ value: "context_logits"
476
+ }
477
+ output_map {
478
+ key: "OUT_GENERATION_LOGITS"
479
+ value: "generation_logits"
480
+ }
481
+ output_map {
482
+ key: "OUT_BATCH_INDEX"
483
+ value: "batch_index"
484
+ }
485
+ }
486
+ ]
487
+ }
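
The config above defines the client-facing "ensemble" pipeline (preprocessing -> tensorrt_llm -> postprocessing). As a rough illustration only, here is a minimal sketch of a request against it with Triton's Python HTTP client; the server URL and the prompt are assumptions, and the optional sampling tensors (top_k, temperature, ...) can be attached the same way.

# Minimal request sketch against the "ensemble" model above.
# Assumes a running Triton server reachable at localhost:8000 (URL is an assumption).
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

def build_input(name, data, dtype):
    # Every shape carries a leading batch dimension of 1 (max_batch_size: 32 above).
    tensor = httpclient.InferInput(name, list(data.shape), dtype)
    tensor.set_data_from_numpy(data)
    return tensor

inputs = [
    build_input("text_input", np.array([["What is Triton?"]], dtype=object), "BYTES"),
    build_input("max_tokens", np.array([[128]], dtype=np.int32), "INT32"),
    build_input("stream", np.array([[False]], dtype=bool), "BOOL"),
]

result = client.infer(model_name="ensemble", inputs=inputs)
print(result.as_numpy("text_output"))
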
postprocessing/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (5.61 kB).
 
postprocessing/1/model.py ADDED
@@ -0,0 +1,250 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+
29
+ import numpy as np
30
+ import triton_python_backend_utils as pb_utils
31
+ from transformers import AutoTokenizer
32
+
33
+
34
+ class TritonPythonModel:
35
+ """Your Python model must use the same class name. Every Python model
36
+ that is created must have "TritonPythonModel" as the class name.
37
+ """
38
+
39
+ def initialize(self, args):
40
+ """`initialize` is called only once when the model is being loaded.
41
+ Implementing `initialize` function is optional. This function allows
42
+ the model to initialize any state associated with this model.
43
+ Parameters
44
+ ----------
45
+ args : dict
46
+ Both keys and values are strings. The dictionary keys and values are:
47
+ * model_config: A JSON string containing the model configuration
48
+ * model_instance_kind: A string containing model instance kind
49
+ * model_instance_device_id: A string containing model instance device ID
50
+ * model_repository: Model repository path
51
+ * model_version: Model version
52
+ * model_name: Model name
53
+ """
54
+ # Parse model configs
55
+ model_config = json.loads(args['model_config'])
56
+ tokenizer_dir = model_config['parameters']['tokenizer_dir'][
57
+ 'string_value']
58
+
59
+ skip_special_tokens = model_config['parameters'].get(
60
+ 'skip_special_tokens')
61
+ if skip_special_tokens is not None:
62
+ skip_special_tokens_str = skip_special_tokens[
63
+ 'string_value'].lower()
64
+ if skip_special_tokens_str in [
65
+ 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
66
+ ]:
67
+ self.skip_special_tokens = skip_special_tokens_str in [
68
+ 'true', '1', 't', 'y', 'yes'
69
+ ]
70
+ else:
71
+ print(
72
+ f"[TensorRT-LLM][WARNING] Invalid value for 'skip_special_tokens' (got {skip_special_tokens['string_value']}); defaulting to True."
73
+ )
74
+ self.skip_special_tokens = True
75
+ else:
76
+ print(
77
+ "[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set; defaulting to True."
78
+ )
79
+ self.skip_special_tokens = True
80
+
81
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
82
+ legacy=False,
83
+ padding_side='left',
84
+ trust_remote_code=True)
85
+ if not self.tokenizer.pad_token:
86
+ self.tokenizer.pad_token = self.tokenizer.eos_token
87
+
88
+ # Parse model output configs
89
+ output_config = pb_utils.get_output_config_by_name(
90
+ model_config, "OUTPUT")
91
+
92
+ # Convert Triton types to numpy types
93
+ self.output_dtype = pb_utils.triton_string_to_numpy(
94
+ output_config['data_type'])
95
+
96
+ def execute(self, requests):
97
+ """`execute` must be implemented in every Python model. `execute`
98
+ function receives a list of pb_utils.InferenceRequest as the only
99
+ argument. This function is called when an inference is requested
100
+ for this model. Depending on the batching configuration (e.g. Dynamic
101
+ Batching) used, `requests` may contain multiple requests. Every
102
+ Python model must create one pb_utils.InferenceResponse for every
103
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
104
+ set the error argument when creating a pb_utils.InferenceResponse.
105
+ Parameters
106
+ ----------
107
+ requests : list
108
+ A list of pb_utils.InferenceRequest
109
+ Returns
110
+ -------
111
+ list
112
+ A list of pb_utils.InferenceResponse. The length of this list must
113
+ be the same as `requests`
114
+ """
115
+
116
+ responses = []
117
+
118
+ # Every Python backend must iterate over every one of the requests
119
+ # and create a pb_utils.InferenceResponse for each of them.
120
+ for idx, request in enumerate(requests):
121
+ # Get input tensors
122
+ tokens_batch = pb_utils.get_input_tensor_by_name(
123
+ request, 'TOKENS_BATCH').as_numpy()
124
+
125
+ # Get sequence length
126
+ sequence_lengths = pb_utils.get_input_tensor_by_name(
127
+ request, 'SEQUENCE_LENGTH').as_numpy()
128
+
129
+ # Get cum log probs
130
+ cum_log_probs = pb_utils.get_input_tensor_by_name(
131
+ request, 'CUM_LOG_PROBS')
132
+
133
+ # Get output log probs
134
+ output_log_probs = pb_utils.get_input_tensor_by_name(
135
+ request, 'OUTPUT_LOG_PROBS')
136
+
137
+ # Get context logits
138
+ context_logits = pb_utils.get_input_tensor_by_name(
139
+ request, 'CONTEXT_LOGITS')
140
+
141
+ # Get generation logits
142
+ generation_logits = pb_utils.get_input_tensor_by_name(
143
+ request, 'GENERATION_LOGITS')
144
+
145
+ # Get the batch index
146
+ batch_index = pb_utils.get_input_tensor_by_name(
147
+ request, 'BATCH_INDEX')
148
+
149
+ # Reshape Input
150
+ # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
151
+ # tokens_batch = tokens_batch.T
152
+
153
+ # Postprocessing output data.
154
+ outputs = self._postprocessing(tokens_batch, sequence_lengths)
155
+
156
+ # Create output tensors. You need pb_utils.Tensor
157
+ # objects to create pb_utils.InferenceResponse.
158
+ output_tensor = pb_utils.Tensor(
159
+ 'OUTPUT',
160
+ np.array(outputs).astype(self.output_dtype))
161
+
162
+ outputs = []
163
+ outputs.append(output_tensor)
164
+
165
+ if cum_log_probs:
166
+ out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
167
+ cum_log_probs.as_numpy())
168
+ outputs.append(out_cum_log_probs)
169
+ else:
170
+ out_cum_log_probs = pb_utils.Tensor(
171
+ 'OUT_CUM_LOG_PROBS', np.array([[0.0]], dtype=np.float32))
172
+ outputs.append(out_cum_log_probs)
173
+
174
+ if output_log_probs:
175
+ out_output_log_probs = pb_utils.Tensor(
176
+ 'OUT_OUTPUT_LOG_PROBS', output_log_probs.as_numpy())
177
+ outputs.append(out_output_log_probs)
178
+ else:
179
+ out_output_log_probs = pb_utils.Tensor(
180
+ 'OUT_OUTPUT_LOG_PROBS',
181
+ np.array([[[0.0]]], dtype=np.float32))
182
+ outputs.append(out_output_log_probs)
183
+
184
+ if context_logits:
185
+ out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
186
+ context_logits.as_numpy())
187
+ outputs.append(out_context_logits)
188
+ else:
189
+ out_context_logits = pb_utils.Tensor(
190
+ 'OUT_CONTEXT_LOGITS', np.array([[[0.0]]],
191
+ dtype=np.float32))
192
+ outputs.append(out_context_logits)
193
+
194
+ if generation_logits:
195
+ out_generation_logits = pb_utils.Tensor(
196
+ 'OUT_GENERATION_LOGITS', generation_logits.as_numpy())
197
+ outputs.append(out_generation_logits)
198
+ else:
199
+ out_generation_logits = pb_utils.Tensor(
200
+ 'OUT_GENERATION_LOGITS',
201
+ np.array([[[[0.0]]]], dtype=np.float32))
202
+ outputs.append(out_generation_logits)
203
+
204
+ if batch_index:
205
+ out_batch_index = pb_utils.Tensor('OUT_BATCH_INDEX',
206
+ batch_index.as_numpy())
207
+ outputs.append(out_batch_index)
208
+ else:
209
+ out_batch_index = pb_utils.Tensor(
210
+ 'OUT_BATCH_INDEX', np.array([[0]], dtype=np.int32))
211
+ outputs.append(out_batch_index)
212
+
213
+ # Create InferenceResponse. You can set an error here in case
214
+ # there was a problem with handling this inference request.
215
+ # Below is an example of how you can set errors in inference
216
+ # response:
217
+ #
218
+ # pb_utils.InferenceResponse(
219
+ # output_tensors=..., TritonError("An error occurred"))
220
+ inference_response = pb_utils.InferenceResponse(
221
+ output_tensors=outputs)
222
+ responses.append(inference_response)
223
+
224
+ # You should return a list of pb_utils.InferenceResponse. Length
225
+ # of this list must match the length of `requests` list.
226
+ return responses
227
+
228
+ def finalize(self):
229
+ """`finalize` is called only once when the model is being unloaded.
230
+ Implementing `finalize` function is optional. This function allows
231
+ the model to perform any necessary clean ups before exit.
232
+ """
233
+ print('Cleaning up...')
234
+
235
+ def _postprocessing(self, tokens_batch, sequence_lengths):
236
+ outputs = []
237
+ for batch_idx, beam_tokens in enumerate(tokens_batch):
238
+ for beam_idx, tokens in enumerate(beam_tokens):
239
+ seq_len = sequence_lengths[batch_idx][beam_idx]
240
+ # Exclude fake ids in multimodal models
241
+ fake_id_len = 0
242
+ for i in range(seq_len):
243
+ if tokens[i] < self.tokenizer.vocab_size:
244
+ fake_id_len = i
245
+ break
246
+ output = self.tokenizer.decode(
247
+ tokens[fake_id_len:seq_len],
248
+ skip_special_tokens=self.skip_special_tokens)
249
+ outputs.append(output.encode('utf8'))
250
+ return outputs
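
The _postprocessing loop above turns a [batch, beam, seq] token tensor into one decoded string per (batch, beam) pair: each row is trimmed to its sequence length, and leading out-of-vocabulary "fake" prompt ids (used by multimodal models) are skipped. A standalone sketch of that logic with a stand-in tokenizer, shown only to illustrate the shapes:

import numpy as np

class ToyTokenizer:
    # Stand-in for the AutoTokenizer loaded from tokenizer_dir; ids >= vocab_size
    # play the role of the "fake" multimodal prompt ids.
    vocab_size = 10

    def decode(self, ids, skip_special_tokens=True):
        return " ".join(str(int(i)) for i in ids)

tokenizer = ToyTokenizer()
tokens_batch = np.array([[[12, 3, 4, 0]], [[5, 6, 7, 0]]])  # [batch=2, beams=1, seq=4]
sequence_lengths = np.array([[3], [4]])                      # [batch, beams]

outputs = []
for batch_idx, beam_tokens in enumerate(tokens_batch):
    for beam_idx, tokens in enumerate(beam_tokens):
        seq_len = sequence_lengths[batch_idx][beam_idx]
        start = 0
        for i in range(seq_len):
            if tokens[i] < tokenizer.vocab_size:
                start = i
                break
        outputs.append(tokenizer.decode(tokens[start:seq_len]))

print(outputs)  # ['3 4', '5 6 7 0'] -- one string per (batch, beam) pair
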
postprocessing/config.pbtxt ADDED
@@ -0,0 +1,124 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "postprocessing"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+ input [
31
+ {
32
+ name: "TOKENS_BATCH"
33
+ data_type: TYPE_INT32
34
+ dims: [ -1, -1 ]
35
+ },
36
+ {
37
+ name: "SEQUENCE_LENGTH"
38
+ data_type: TYPE_INT32
39
+ dims: [ -1 ]
40
+ },
41
+ {
42
+ name: "CUM_LOG_PROBS"
43
+ data_type: TYPE_FP32
44
+ dims: [ -1 ]
45
+ optional: true
46
+ },
47
+ {
48
+ name: "OUTPUT_LOG_PROBS"
49
+ data_type: TYPE_FP32
50
+ dims: [ -1, -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "CONTEXT_LOGITS"
55
+ data_type: TYPE_FP32
56
+ dims: [ -1, -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "GENERATION_LOGITS"
61
+ data_type: TYPE_FP32
62
+ dims: [ -1, -1, -1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "BATCH_INDEX"
67
+ data_type: TYPE_INT32
68
+ dims: [ 1 ]
69
+ optional: true
70
+ }
71
+ ]
72
+ output [
73
+ {
74
+ name: "OUTPUT"
75
+ data_type: TYPE_STRING
76
+ dims: [ -1 ]
77
+ },
78
+ {
79
+ name: "OUT_CUM_LOG_PROBS"
80
+ data_type: TYPE_FP32
81
+ dims: [ -1 ]
82
+ },
83
+ {
84
+ name: "OUT_OUTPUT_LOG_PROBS"
85
+ data_type: TYPE_FP32
86
+ dims: [ -1, -1 ]
87
+ },
88
+ {
89
+ name: "OUT_CONTEXT_LOGITS"
90
+ data_type: TYPE_FP32
91
+ dims: [ -1, -1 ]
92
+ },
93
+ {
94
+ name: "OUT_GENERATION_LOGITS"
95
+ data_type: TYPE_FP32
96
+ dims: [ -1, -1, -1 ]
97
+ },
98
+ {
99
+ name: "OUT_BATCH_INDEX"
100
+ data_type: TYPE_INT32
101
+ dims: [ 1 ]
102
+ }
103
+ ]
104
+
105
+ parameters {
106
+ key: "tokenizer_dir"
107
+ value: {
108
+ string_value: "mlabonne/Llama-3.1-70B-Instruct-lorablated"
109
+ }
110
+ }
111
+
112
+ parameters {
113
+ key: "skip_special_tokens"
114
+ value: {
115
+ string_value: "True"
116
+ }
117
+ }
118
+
119
+ instance_group [
120
+ {
121
+ count: 1
122
+ kind: KIND_CPU
123
+ }
124
+ ]
preprocessing/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (11.1 kB).
 
preprocessing/1/model.py ADDED
@@ -0,0 +1,439 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+ import os
29
+ from typing import List
30
+
31
+ import numpy as np
32
+ import triton_python_backend_utils as pb_utils
33
+ from transformers import AutoTokenizer, T5Tokenizer
34
+
35
+
36
+ class TritonPythonModel:
37
+ """Your Python model must use the same class name. Every Python model
38
+ that is created must have "TritonPythonModel" as the class name.
39
+ """
40
+
41
+ def initialize(self, args):
42
+ """`initialize` is called only once when the model is being loaded.
43
+ Implementing `initialize` function is optional. This function allows
44
+ the model to initialize any state associated with this model.
45
+ Parameters
46
+ ----------
47
+ args : dict
48
+ Both keys and values are strings. The dictionary keys and values are:
49
+ * model_config: A JSON string containing the model configuration
50
+ * model_instance_kind: A string containing model instance kind
51
+ * model_instance_device_id: A string containing model instance device ID
52
+ * model_repository: Model repository path
53
+ * model_version: Model version
54
+ * model_name: Model name
55
+ """
56
+ # Parse model configs
57
+ model_config = json.loads(args['model_config'])
58
+ tokenizer_dir = model_config['parameters']['tokenizer_dir'][
59
+ 'string_value']
60
+
61
+ add_special_tokens = model_config['parameters'].get(
62
+ 'add_special_tokens')
63
+ visual_model_path = model_config['parameters']['visual_model_path'][
64
+ 'string_value']
65
+ if visual_model_path == "${visual_model_path}" or visual_model_path == "":
66
+ visual_model_path = None
67
+
68
+ if add_special_tokens is not None:
69
+ add_special_tokens_str = add_special_tokens['string_value'].lower()
70
+ if add_special_tokens_str in [
71
+ 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
72
+ ]:
73
+ self.add_special_tokens = add_special_tokens_str in [
74
+ 'true', '1', 't', 'y', 'yes'
75
+ ]
76
+ else:
77
+ print(
78
+ f"[TensorRT-LLM][WARNING] Invalid value for 'add_special_tokens' (got {add_special_tokens['string_value']}); defaulting to True."
79
+ )
80
+ self.add_special_tokens = True
81
+ else:
82
+ print(
83
+ "[TensorRT-LLM][WARNING] 'add_special_tokens' is not set; defaulting to True."
84
+ )
85
+ self.add_special_tokens = True
86
+
87
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
88
+ legacy=False,
89
+ padding_side='left',
90
+ trust_remote_code=True)
91
+ if isinstance(self.tokenizer, T5Tokenizer):
92
+ self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
93
+
94
+ if not self.tokenizer.pad_token:
95
+ self.tokenizer.pad_token = self.tokenizer.eos_token
96
+
97
+ self.tokenizer_end_id = self.tokenizer.encode(
98
+ self.tokenizer.eos_token, add_special_tokens=False)[0]
99
+ self.tokenizer_pad_id = self.tokenizer.encode(
100
+ self.tokenizer.pad_token, add_special_tokens=False)[0]
101
+
102
+ self.is_multimodal = False
103
+ if visual_model_path is not None:
104
+ self.is_multimodal = True
105
+ visual_model_path = os.path.join(visual_model_path, 'config.json')
106
+ with open(visual_model_path, 'r') as f:
107
+ visual_model_config = json.load(f)
108
+ self.model_type = visual_model_config['builder_config'][
109
+ 'model_type']
110
+
111
+ assert self.model_type in [
112
+ 'llava', 'blip2-opt'
113
+ ], "[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava and blip2-opt"
114
+
115
+ llm_model_path = model_config['parameters']['gpt_model_path'][
116
+ 'string_value']
117
+ llm_model_path = os.path.join(llm_model_path, 'config.json')
118
+ with open(llm_model_path, 'r') as f:
119
+ llm_model_config = json.load(f)
120
+ self.vocab_size = int(
121
+ llm_model_config["pretrained_config"]["vocab_size"])
122
+ self._setup_ptable_shape(llm_model_config)
123
+
124
+ # Parse model output configs and convert Triton types to numpy types
125
+ output_names = [
126
+ "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
127
+ "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
128
+ "OUT_END_ID", "OUT_PAD_ID"
129
+ ]
130
+ input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
131
+ for input_name in input_names:
132
+ setattr(
133
+ self,
134
+ input_name.lower() + "_dtype",
135
+ pb_utils.triton_string_to_numpy(
136
+ pb_utils.get_input_config_by_name(
137
+ model_config, input_name)['data_type']))
138
+
139
+ for output_name in output_names:
140
+ setattr(
141
+ self,
142
+ output_name.lower() + "_dtype",
143
+ pb_utils.triton_string_to_numpy(
144
+ pb_utils.get_output_config_by_name(
145
+ model_config, output_name)['data_type']))
146
+
147
+ def _setup_ptable_shape(self, llm_model_config):
148
+ max_prompt_embedding_table_size = llm_model_config['build_config'][
149
+ 'max_prompt_embedding_table_size']
150
+ max_batch_size = llm_model_config['build_config']['max_batch_size']
151
+
152
+ num_visual_features = max_prompt_embedding_table_size // max_batch_size
153
+ hidden_size = llm_model_config['pretrained_config']['hidden_size']
154
+
155
+ self.ptable_shape = (-1, num_visual_features, hidden_size)
156
+
157
+ def execute(self, requests):
158
+ """`execute` must be implemented in every Python model. `execute`
159
+ function receives a list of pb_utils.InferenceRequest as the only
160
+ argument. This function is called when an inference is requested
161
+ for this model. Depending on the batching configuration (e.g. Dynamic
162
+ Batching) used, `requests` may contain multiple requests. Every
163
+ Python model must create one pb_utils.InferenceResponse for every
164
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
165
+ set the error argument when creating a pb_utils.InferenceResponse.
166
+ Parameters
167
+ ----------
168
+ requests : list
169
+ A list of pb_utils.InferenceRequest
170
+ Returns
171
+ -------
172
+ list
173
+ A list of pb_utils.InferenceResponse. The length of this list must
174
+ be the same as `requests`
175
+ """
176
+
177
+ responses = []
178
+
179
+ # Every Python backend must iterate over every one of the requests
180
+ # and create a pb_utils.InferenceResponse for each of them.
181
+ for idx, request in enumerate(requests):
182
+ # Get input tensors
183
+ query = pb_utils.get_input_tensor_by_name(request,
184
+ 'QUERY').as_numpy()
185
+ batch_size = query.shape[0]
186
+
187
+ decoder_query = pb_utils.get_input_tensor_by_name(
188
+ request, 'DECODER_QUERY')
189
+ if decoder_query is not None:
190
+ decoder_query = decoder_query.as_numpy()
191
+
192
+ request_output_len = pb_utils.get_input_tensor_by_name(
193
+ request, 'REQUEST_OUTPUT_LEN').as_numpy()
194
+
195
+ bad_words_dict = pb_utils.get_input_tensor_by_name(
196
+ request, 'BAD_WORDS_DICT')
197
+ if bad_words_dict is not None:
198
+ bad_words_dict = bad_words_dict.as_numpy()
199
+
200
+ stop_words_dict = pb_utils.get_input_tensor_by_name(
201
+ request, 'STOP_WORDS_DICT')
202
+ if stop_words_dict is not None:
203
+ stop_words_dict = stop_words_dict.as_numpy()
204
+
205
+ embedding_bias_words = pb_utils.get_input_tensor_by_name(
206
+ request, 'EMBEDDING_BIAS_WORDS')
207
+ if embedding_bias_words is not None:
208
+ embedding_bias_words = embedding_bias_words.as_numpy()
209
+
210
+ embedding_bias_weights = pb_utils.get_input_tensor_by_name(
211
+ request, 'EMBEDDING_BIAS_WEIGHTS')
212
+ if embedding_bias_weights is not None:
213
+ embedding_bias_weights = embedding_bias_weights.as_numpy()
214
+
215
+ # Take the end_id from the input tensors
216
+ # If not specified, use tokenizer to get end_id
217
+ end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID')
218
+ if end_id is not None:
219
+ end_id = end_id.as_numpy()
220
+ else:
221
+ end_id = [[self.tokenizer_end_id]] * batch_size
222
+
223
+ # Take the pad_id from the input tensors
224
+ # If not specified, use tokenizer to get pad_id
225
+ pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID')
226
+ if pad_id is not None:
227
+ pad_id = pad_id.as_numpy()
228
+ else:
229
+ pad_id = [[self.tokenizer_pad_id]] * batch_size
230
+
231
+ # Preprocessing input data.
232
+ input_id, request_input_len = self._create_request(query)
233
+ if decoder_query is not None:
234
+ decoder_input_id, request_decoder_input_len = self._create_request(
235
+ decoder_query)
236
+ else:
237
+ decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32)
238
+ request_decoder_input_len = 1 * np.ones(
239
+ (batch_size, 1), np.int32)
240
+
241
+ bad_words = self._to_word_list_format(bad_words_dict, batch_size)
242
+ stop_words = self._to_word_list_format(stop_words_dict, batch_size)
243
+
244
+ embedding_bias = self._get_embedding_bias(
245
+ embedding_bias_words, embedding_bias_weights,
246
+ self.embedding_bias_weights_dtype, batch_size)
247
+
248
+ # Create output tensors. You need pb_utils.Tensor
249
+ # objects to create pb_utils.InferenceResponse.
250
+ input_id_tensor = pb_utils.Tensor(
251
+ 'INPUT_ID', input_id.astype(self.input_id_dtype))
252
+ request_input_len_tensor = pb_utils.Tensor(
253
+ 'REQUEST_INPUT_LEN',
254
+ request_input_len.astype(self.request_input_len_dtype))
255
+ decoder_input_id_tensor = pb_utils.Tensor(
256
+ 'DECODER_INPUT_ID',
257
+ decoder_input_id.astype(self.decoder_input_id_dtype))
258
+ request_decoder_input_len_tensor = pb_utils.Tensor(
259
+ 'REQUEST_DECODER_INPUT_LEN',
260
+ request_decoder_input_len.astype(
261
+ self.request_decoder_input_len_dtype))
262
+ request_output_len_tensor = pb_utils.Tensor(
263
+ 'REQUEST_OUTPUT_LEN', request_output_len)
264
+ bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
265
+ stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
266
+ stop_words)
267
+ embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
268
+ embedding_bias)
269
+ end_id_tensor = pb_utils.Tensor('OUT_END_ID',
270
+ np.array(end_id, dtype=np.int32))
271
+ pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
272
+ np.array(pad_id, dtype=np.int32))
273
+
274
+ inference_response = pb_utils.InferenceResponse(output_tensors=[
275
+ input_id_tensor, decoder_input_id_tensor, bad_words_ids_tensor,
276
+ stop_words_ids_tensor, request_input_len_tensor,
277
+ request_decoder_input_len_tensor, request_output_len_tensor,
278
+ embedding_bias_tensor, end_id_tensor, pad_id_tensor
279
+ ])
280
+ responses.append(inference_response)
281
+
282
+ # You should return a list of pb_utils.InferenceResponse. Length
283
+ # of this list must match the length of `requests` list.
284
+ return responses
285
+
286
+ def finalize(self):
287
+ """`finalize` is called only once when the model is being unloaded.
288
+ Implementing `finalize` function is optional. This function allows
289
+ the model to perform any necessary clean ups before exit.
290
+ """
291
+ print('Cleaning up...')
292
+
293
+ def _create_request(self, query):
294
+ """
295
+ query : batch string (2D numpy array)
296
+ """
297
+ if isinstance(self.tokenizer, T5Tokenizer):
298
+ start_ids = [
299
+ np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
300
+ s[0].decode(), add_special_tokens=self.add_special_tokens)
301
+ ).astype(int) for s in query
302
+ ]
303
+ else:
304
+ start_ids = [
305
+ np.array(
306
+ self.tokenizer.encode(
307
+ s[0].decode(),
308
+ add_special_tokens=self.add_special_tokens)).astype(
309
+ int) for s in query
310
+ ]
311
+
312
+ if self.is_multimodal:
313
+ if 'blip2' in self.model_type:
314
+ pre_prompt = None
315
+ post_prompt = None
316
+ elif 'llava' == self.model_type:
317
+ pre_prompt = "USER:\n"
318
+ post_prompt = " ASSISTANT:"
319
+
320
+ fake_prompt_id = np.arange(self.vocab_size,
321
+ self.vocab_size + self.ptable_shape[1])
322
+
323
+ if pre_prompt is not None:
324
+ pre_prompt_id = np.array(
325
+ self.tokenizer.encode(
326
+ pre_prompt,
327
+ add_special_tokens=self.add_special_tokens,
328
+ padding=True))
329
+
330
+ if post_prompt is not None:
331
+ post_prompt_id = np.array(
332
+ self.tokenizer.encode(
333
+ post_prompt,
334
+ add_special_tokens=self.add_special_tokens,
335
+ padding=True))
336
+
337
+ if post_prompt is None:
338
+ start_ids = [
339
+ np.concatenate((fake_prompt_id, ids), axis=0)
340
+ for ids in start_ids
341
+ ]
342
+ else:
343
+ start_ids = [
344
+ np.concatenate(
345
+ (pre_prompt_id, fake_prompt_id, ids, post_prompt_id),
346
+ axis=0) for ids in start_ids
347
+ ]
348
+ start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
349
+
350
+ max_len = 0
351
+ for seq in start_ids:
352
+ max_len = max(max_len, seq.shape[0])
353
+ start_ids = np.stack([
354
+ np.pad(seq, (0, max_len - seq.shape[0]),
355
+ 'constant',
356
+ constant_values=(0, self.tokenizer_pad_id))
357
+ for seq in start_ids
358
+ ])
359
+
360
+ return start_ids, start_lengths
361
+
362
+ def _to_word_list_format(self, word_lists: List[List[str | bytes]],
363
+ batch_size):
364
+ '''
365
+ word_lists format:
366
+ len(word_lists) == batch_size
367
+ word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
368
+ '''
369
+ assert self.tokenizer is not None, "need to set tokenizer"
370
+
371
+ if word_lists is None:
372
+ # Return an empty array of shape (1,2,0)
373
+ return np.empty([batch_size, 2, 0], dtype="int32")
374
+
375
+ flat_ids = []
376
+ offsets = []
377
+ for word_list in word_lists:
378
+ item_flat_ids = []
379
+ item_offsets = []
380
+
381
+ for word in word_list:
382
+ if isinstance(word, bytes):
383
+ word = word.decode()
384
+
385
+ ids = self.tokenizer.encode(word, add_special_tokens=False)
386
+ if len(ids) == 0:
387
+ continue
388
+
389
+ item_flat_ids += ids
390
+ item_offsets.append(len(ids))
391
+
392
+ flat_ids.append(np.array(item_flat_ids))
393
+ offsets.append(np.cumsum(np.array(item_offsets)))
394
+
395
+ pad_to = max(1, max(len(ids) for ids in flat_ids))
396
+
397
+ for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
398
+ flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
399
+ constant_values=0)
400
+ offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
401
+ constant_values=-1)
402
+
403
+ return np.array([flat_ids, offsets], dtype="int32").transpose(
404
+ (1, 0, 2))
405
+
406
+ def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
407
+ bias_dtype, batch_size):
408
+
409
+ assert self.tokenizer is not None, "need to set tokenizer"
410
+
411
+ if embedding_bias_words is None or embedding_bias_weights is None:
412
+ return np.empty([batch_size, 0],
413
+ dtype=self.embedding_bias_weights_dtype)
414
+
415
+ batch_embedding_bias = []
416
+ for words, weights in zip(embedding_bias_words,
417
+ embedding_bias_weights):
418
+
419
+ vocab_size = self.tokenizer.vocab_size
420
+ embedding_bias = [0.] * vocab_size
421
+
422
+ assert len(words) == len(
423
+ weights
424
+ ), "Embedding bias words must have same dimension as embedding bias weights"
425
+
426
+ for word, weight in zip(words, weights):
427
+ if isinstance(word, bytes):
428
+ word = word.decode()
429
+ ids = self.tokenizer.encode(word)
430
+
431
+ if len(ids) == 0:
432
+ continue
433
+
434
+ for id in ids:
435
+ embedding_bias[id] += weight
436
+
437
+ batch_embedding_bias.append(np.array(embedding_bias))
438
+
439
+ return np.array(batch_embedding_bias, dtype=bias_dtype)
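
_to_word_list_format above packs per-request stop/bad word lists into the [batch, 2, pad_to] int32 layout expected by the tensorrt_llm model: row 0 holds the concatenated token ids, row 1 the cumulative end offsets, padded with 0 and -1 respectively. A small sketch with a hypothetical toy tokenizer to show that layout:

import numpy as np

class ToyTokenizer:
    # Hypothetical tokenizer (one id per character), used only to show the layout.
    def encode(self, word, add_special_tokens=False):
        return [ord(c) % 100 for c in word]

tokenizer = ToyTokenizer()
word_lists = [[b"ab", b"c"], [b"de"]]  # stop/bad words for a batch of 2 requests

flat_ids, offsets = [], []
for word_list in word_lists:
    item_flat_ids, item_offsets = [], []
    for word in word_list:
        ids = tokenizer.encode(word.decode())
        item_flat_ids += ids
        item_offsets.append(len(ids))
    flat_ids.append(np.array(item_flat_ids))
    offsets.append(np.cumsum(np.array(item_offsets)))

pad_to = max(1, max(len(ids) for ids in flat_ids))
for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
    flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
    offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)

result = np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
print(result.shape)  # (2, 2, 3): [batch, ids/offsets, pad_to]
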
preprocessing/config.pbtxt ADDED
@@ -0,0 +1,170 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "preprocessing"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+ input [
31
+ {
32
+ name: "QUERY"
33
+ data_type: TYPE_STRING
34
+ dims: [ 1 ]
35
+ },
36
+ {
37
+ name: "DECODER_QUERY"
38
+ data_type: TYPE_STRING
39
+ dims: [ 1 ]
40
+ optional: true
41
+ },
42
+ {
43
+ name: "REQUEST_OUTPUT_LEN"
44
+ data_type: TYPE_INT32
45
+ dims: [ 1 ]
46
+ },
47
+ {
48
+ name: "BAD_WORDS_DICT"
49
+ data_type: TYPE_STRING
50
+ dims: [ -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "STOP_WORDS_DICT"
55
+ data_type: TYPE_STRING
56
+ dims: [ -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "EMBEDDING_BIAS_WORDS"
61
+ data_type: TYPE_STRING
62
+ dims: [ -1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "EMBEDDING_BIAS_WEIGHTS"
67
+ data_type: TYPE_FP32
68
+ dims: [ -1 ]
69
+ optional: true
70
+ },
71
+ {
72
+ name: "END_ID"
73
+ data_type: TYPE_INT32
74
+ dims: [ 1 ]
75
+ optional: true
76
+ },
77
+ {
78
+ name: "PAD_ID"
79
+ data_type: TYPE_INT32
80
+ dims: [ 1 ]
81
+ optional: true
82
+ }
83
+ ]
84
+ output [
85
+ {
86
+ name: "INPUT_ID"
87
+ data_type: TYPE_INT32
88
+ dims: [ -1 ]
89
+ },
90
+ {
91
+ name: "REQUEST_INPUT_LEN"
92
+ data_type: TYPE_INT32
93
+ dims: [ 1 ]
94
+ },
95
+ {
96
+ name: "DECODER_INPUT_ID"
97
+ data_type: TYPE_INT32
98
+ dims: [ -1 ]
99
+ },
100
+ {
101
+ name: "REQUEST_DECODER_INPUT_LEN"
102
+ data_type: TYPE_INT32
103
+ dims: [ 1 ]
104
+ },
105
+ {
106
+ name: "BAD_WORDS_IDS"
107
+ data_type: TYPE_INT32
108
+ dims: [ 2, -1 ]
109
+ },
110
+ {
111
+ name: "STOP_WORDS_IDS"
112
+ data_type: TYPE_INT32
113
+ dims: [ 2, -1 ]
114
+ },
115
+ {
116
+ name: "EMBEDDING_BIAS"
117
+ data_type: TYPE_FP32
118
+ dims: [ -1 ]
119
+ },
120
+ {
121
+ name: "REQUEST_OUTPUT_LEN"
122
+ data_type: TYPE_INT32
123
+ dims: [ -1 ]
124
+ },
125
+ {
126
+ name: "OUT_END_ID"
127
+ data_type: TYPE_INT32
128
+ dims: [ 1 ]
129
+ },
130
+ {
131
+ name: "OUT_PAD_ID"
132
+ data_type: TYPE_INT32
133
+ dims: [ 1 ]
134
+ }
135
+ ]
136
+
137
+ parameters {
138
+ key: "tokenizer_dir"
139
+ value: {
140
+ string_value: "mlabonne/Llama-3.1-70B-Instruct-lorablated"
141
+ }
142
+ }
143
+
144
+ parameters {
145
+ key: "add_special_tokens"
146
+ value: {
147
+ string_value: "False"
148
+ }
149
+ }
150
+
151
+ parameters {
152
+ key: "visual_model_path"
153
+ value: {
154
+ string_value: "${visual_model_path}"
155
+ }
156
+ }
157
+
158
+ parameters: {
159
+ key: "gpt_model_path"
160
+ value: {
161
+ string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
162
+ }
163
+ }
164
+
165
+ instance_group [
166
+ {
167
+ count: 1
168
+ kind: KIND_CPU
169
+ }
170
+ ]
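
Note that visual_model_path above is left as the literal template placeholder "${visual_model_path}", which model.py treats as unset (text-only mode). Below is a generic sketch (not the official tooling) for filling such ${...} placeholders before deployment; the file path and the substituted values are assumptions.

import re
from pathlib import Path

def fill_template(path, values):
    # Replace ${key} with the provided value; unknown placeholders are left untouched.
    text = Path(path).read_text()
    text = re.sub(r"\$\{(\w+)\}", lambda m: values.get(m.group(1), m.group(0)), text)
    Path(path).write_text(text)

# Example: blank out the placeholder so preprocessing stays in text-only mode.
fill_template("preprocessing/config.pbtxt", {"visual_model_path": ""})
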
tensorrt_llm/1/.gitkeep ADDED
File without changes
tensorrt_llm/1/config.json ADDED
@@ -0,0 +1,170 @@
1
+ {
2
+ "version": "0.13.0.dev2024082000",
3
+ "pretrained_config": {
4
+ "mlp_bias": false,
5
+ "attn_bias": false,
6
+ "rotary_base": 500000.0,
7
+ "rotary_scaling": {
8
+ "factor": 32.0,
9
+ "high_freq_factor": 4.0,
10
+ "low_freq_factor": 1.0,
11
+ "original_max_position_embeddings": 8192,
12
+ "rope_type": "llama3"
13
+ },
14
+ "residual_mlp": false,
15
+ "disable_weight_only_quant_plugin": false,
16
+ "moe": {
17
+ "num_experts": 0,
18
+ "top_k": 0,
19
+ "normalization_mode": 1,
20
+ "tp_mode": 0
21
+ },
22
+ "remove_duplicated_kv_heads": false,
23
+ "architecture": "LlamaForCausalLM",
24
+ "dtype": "float16",
25
+ "vocab_size": 128256,
26
+ "hidden_size": 2048,
27
+ "num_hidden_layers": 16,
28
+ "num_attention_heads": 32,
29
+ "hidden_act": "silu",
30
+ "logits_dtype": "float16",
31
+ "norm_epsilon": 1e-05,
32
+ "position_embedding_type": "rope_gpt_neox",
33
+ "max_position_embeddings": 131072,
34
+ "num_key_value_heads": 8,
35
+ "intermediate_size": 8192,
36
+ "mapping": {
37
+ "world_size": 2,
38
+ "gpus_per_node": 8,
39
+ "cp_size": 1,
40
+ "tp_size": 2,
41
+ "pp_size": 1,
42
+ "moe_tp_size": 2,
43
+ "moe_ep_size": 1
44
+ },
45
+ "quantization": {
46
+ "quant_algo": "FP8",
47
+ "kv_cache_quant_algo": "FP8",
48
+ "group_size": 128,
49
+ "smoothquant_val": 0.5,
50
+ "clamp_val": null,
51
+ "has_zero_point": false,
52
+ "pre_quant_scale": false,
53
+ "exclude_modules": null
54
+ },
55
+ "use_parallel_embedding": true,
56
+ "embedding_sharding_dim": 0,
57
+ "share_embedding_table": false,
58
+ "head_size": 64,
59
+ "qk_layernorm": false,
60
+ "producer": {
61
+ "name": "modelopt",
62
+ "version": "0.15.1"
63
+ },
64
+ "bias": false,
65
+ "rotary_pct": 1.0,
66
+ "rank": 0,
67
+ "decoder": "llama",
68
+ "rmsnorm": true,
69
+ "lm_head_bias": false
70
+ },
71
+ "build_config": {
72
+ "max_input_len": 124000,
73
+ "max_seq_len": 4194304,
74
+ "opt_batch_size": null,
75
+ "max_batch_size": 32,
76
+ "max_beam_width": 1,
77
+ "max_num_tokens": 128000,
78
+ "opt_num_tokens": null,
79
+ "max_prompt_embedding_table_size": 0,
80
+ "kv_cache_type": "PAGED",
81
+ "gather_context_logits": false,
82
+ "gather_generation_logits": false,
83
+ "strongly_typed": true,
84
+ "builder_opt": null,
85
+ "force_num_profiles": null,
86
+ "profiling_verbosity": "layer_names_only",
87
+ "enable_debug_output": false,
88
+ "max_draft_len": 0,
89
+ "speculative_decoding_mode": 1,
90
+ "use_refit": false,
91
+ "input_timing_cache": null,
92
+ "output_timing_cache": "model.cache",
93
+ "lora_config": {
94
+ "lora_dir": [],
95
+ "lora_ckpt_source": "hf",
96
+ "max_lora_rank": 64,
97
+ "lora_target_modules": [],
98
+ "trtllm_modules_to_hf_modules": {}
99
+ },
100
+ "auto_parallel_config": {
101
+ "world_size": 1,
102
+ "gpus_per_node": 8,
103
+ "cluster_key": "H100-PCIe",
104
+ "cluster_info": null,
105
+ "sharding_cost_model": "alpha_beta",
106
+ "comm_cost_model": "alpha_beta",
107
+ "enable_pipeline_parallelism": false,
108
+ "enable_shard_unbalanced_shape": false,
109
+ "enable_shard_dynamic_shape": false,
110
+ "enable_reduce_scatter": true,
111
+ "builder_flags": null,
112
+ "debug_mode": false,
113
+ "infer_shape": true,
114
+ "validation_mode": false,
115
+ "same_buffer_io": {
116
+ "past_key_value_(\\d+)": "present_key_value_\\1"
117
+ },
118
+ "same_spec_io": {},
119
+ "sharded_io_allowlist": [
120
+ "past_key_value_\\d+",
121
+ "present_key_value_\\d*"
122
+ ],
123
+ "fill_weights": false,
124
+ "parallel_config_cache": null,
125
+ "profile_cache": null,
126
+ "dump_path": null,
127
+ "debug_outputs": []
128
+ },
129
+ "weight_sparsity": false,
130
+ "weight_streaming": false,
131
+ "plugin_config": {
132
+ "dtype": "float16",
133
+ "bert_attention_plugin": "auto",
134
+ "gpt_attention_plugin": "float16",
135
+ "gemm_plugin": "fp8",
136
+ "gemm_swiglu_plugin": null,
137
+ "fp8_rowwise_gemm_plugin": null,
138
+ "smooth_quant_gemm_plugin": null,
139
+ "identity_plugin": null,
140
+ "layernorm_quantization_plugin": null,
141
+ "rmsnorm_quantization_plugin": null,
142
+ "nccl_plugin": "float16",
143
+ "lookup_plugin": null,
144
+ "lora_plugin": null,
145
+ "weight_only_groupwise_quant_matmul_plugin": null,
146
+ "weight_only_quant_matmul_plugin": null,
147
+ "quantize_per_token_plugin": false,
148
+ "quantize_tensor_plugin": false,
149
+ "moe_plugin": "auto",
150
+ "mamba_conv1d_plugin": "auto",
151
+ "context_fmha": true,
152
+ "bert_context_fmha_fp32_acc": false,
153
+ "paged_kv_cache": true,
154
+ "remove_input_padding": true,
155
+ "reduce_fusion": false,
156
+ "enable_xqa": true,
157
+ "tokens_per_block": 64,
158
+ "use_paged_context_fmha": true,
159
+ "use_fp8_context_fmha": true,
160
+ "multiple_profiles": true,
161
+ "paged_state": false,
162
+ "streamingllm": false,
163
+ "manage_weights": false,
164
+ "use_fused_mlp": true
165
+ },
166
+ "use_strip_plan": false,
167
+ "max_encoder_input_len": 1024,
168
+ "use_fused_mlp": true
169
+ }
170
+ }
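
The engine metadata above records a tensor-parallel build (mapping.tp_size: 2), which is why the repository tracks both rank0.engine and rank1.engine via LFS. A minimal sketch of reading the build limits from this file before sizing requests (the relative path assumes this repository layout):

import json

with open("tensorrt_llm/1/config.json") as f:
    cfg = json.load(f)

build = cfg["build_config"]
mapping = cfg["pretrained_config"]["mapping"]

print("tensor-parallel ranks:", mapping["tp_size"])       # 2 -> rank0/rank1 engines
print("max batch size:", build["max_batch_size"])         # 32
print("max input length:", build["max_input_len"])        # 124000
print("max tokens per batch:", build["max_num_tokens"])   # 128000
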
tensorrt_llm/1/model.py ADDED
@@ -0,0 +1,947 @@
1
+ import datetime
2
+ import json
3
+ import os
4
+ import sys
5
+ import time
6
+ from random import randint
7
+ from threading import Lock, Thread
8
+
9
+ import numpy as np
10
+ import torch
11
+ import triton_python_backend_utils as pb_utils
12
+ from torch import from_numpy
13
+ from torch.utils.dlpack import from_dlpack
14
+
15
+ import tensorrt_llm.bindings.executor as trtllm
16
+
17
+
18
+ def get_input_tensor_by_name(request,
19
+ name,
20
+ expected_batch_size=None,
21
+ batch_index=None):
22
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
23
+ if tensor is None:
24
+ return None
25
+
26
+ if tensor.is_cpu():
27
+ tensor = tensor.as_numpy()
28
+ else:
29
+ tensor = from_dlpack(tensor.to_dlpack())
30
+
31
+ if expected_batch_size is not None and tensor.shape[
32
+ 0] != expected_batch_size:
33
+ raise pb_utils.TritonModelException(
34
+ f"Expected batch size doesn't match batch size for tensor {name}. Expected {expected_batch_size} got {tensor.shape[0]}"
35
+ )
36
+
37
+ if batch_index is not None and expected_batch_size is not None and batch_index >= expected_batch_size:
38
+ raise pb_utils.TritonModelException(
39
+ f"Invalid batch index in get_input_tensor_by_name for {name}")
40
+
41
+ if batch_index is not None:
42
+ # Add leading 1 batch dimension
43
+ if isinstance(tensor, np.ndarray):
44
+ return np.expand_dims(tensor[batch_index], axis=0)
45
+ elif isinstance(tensor, torch.Tensor):
46
+ return torch.unsqueeze(tensor[batch_index], dim=0)
47
+ else:
48
+ return tensor
49
+
50
+
51
+ def get_input_scalar_by_name(request,
52
+ name,
53
+ expected_batch_size=1,
54
+ batch_index=0):
55
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
56
+ if tensor is None:
57
+ return None
58
+ tensor = tensor.as_numpy()
59
+
60
+ if tensor.size != expected_batch_size:
61
+ raise pb_utils.TritonModelException(
62
+ f"Expected a scalar tensor for tensor {name}")
63
+
64
+ return tensor.item(batch_index)
65
+
66
+
67
+ def read_parameter_as_type(value, name, pytype=str):
68
+ if value == "":
69
+ return None
70
+ if value.startswith("${") and value.endswith("}"):
71
+ return None
72
+ if pytype is bool:
73
+ return value.lower() in ["1", "true"]
74
+ try:
75
+ result = pytype(value)
76
+ return result
77
+ except Exception:
78
+ pb_utils.Logger.log_warning(
79
+ f"Could not read parameter '{name}' with value '{value}', will use default."
80
+ )
81
+ return None
82
+
83
+
84
+ def get_parameter(model_config, name, pytype=str):
85
+ if name not in model_config['parameters']:
86
+ return None
87
+ return read_parameter_as_type(
88
+ model_config['parameters'][name]['string_value'], name, pytype)
89
+
90
+
91
+ def convert_word_list(word_list):
92
+ if word_list is None:
93
+ return None
94
+ word_list = word_list.tolist()
95
+ if len(word_list) == 0 or len(word_list[0]) != 2:
96
+ raise pb_utils.TritonModelException(f"Invalid format for word list.")
97
+ words, indices = word_list[0]
98
+ result = []
99
+ current_index = 0
100
+ for i in indices:
101
+ if i == -1:
102
+ continue
103
+ if i > len(words):
104
+ raise pb_utils.TritonModelException(
105
+ f"Invalid format for word list.")
106
+ current_word = []
107
+ while current_index < i:
108
+ current_word.append(words[current_index])
109
+ current_index += 1
110
+ result.append(current_word)
111
+ return result
112
+
113
+
114
+ def parse_medusa_choices(medusa_choices):
115
+ if medusa_choices is None:
116
+ return None
117
+ try:
118
+ result = json.loads(
119
+ "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
120
+ assert isinstance(result, list) and len(result) > 0
121
+ assert all([isinstance(x, list) for x in result])
122
+ assert all([isinstance(y, int) for x in result for y in x])
123
+ except Exception:
124
+ raise pb_utils.TritonModelException(
125
+ "Invalid format for medusa_choices")
126
+ return result
127
+
128
+
129
+ def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
130
+ kwargs = {}
131
+ kwargs['beam_width'] = get_input_scalar_by_name(
132
+ request, 'beam_width', batch_size, batch_index) or 1
133
+ kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k',
134
+ batch_size, batch_index)
135
+ kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p',
136
+ batch_size, batch_index)
137
+ kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[
138
+ 'top_p'] <= 0 else kwargs['top_p']
139
+ kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed',
140
+ batch_size, batch_index)
141
+ kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature',
142
+ batch_size, batch_index)
143
+ kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length',
144
+ batch_size, batch_index)
145
+ kwargs['repetition_penalty'] = get_input_scalar_by_name(
146
+ request, 'repetition_penalty', batch_size, batch_index)
147
+ kwargs['presence_penalty'] = get_input_scalar_by_name(
148
+ request, 'presence_penalty', batch_size, batch_index)
149
+ kwargs['frequency_penalty'] = get_input_scalar_by_name(
150
+ request, 'frequency_penalty', batch_size, batch_index)
151
+ kwargs['length_penalty'] = get_input_scalar_by_name(
152
+ request, 'len_penalty', batch_size, batch_index)
153
+ kwargs['top_p_min'] = get_input_scalar_by_name(request,
154
+ 'runtime_top_p_min',
155
+ batch_size, batch_index)
156
+ kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
157
+ request, 'runtime_top_p_reset_ids', batch_size, batch_index)
158
+ kwargs['top_p_decay'] = get_input_scalar_by_name(request,
159
+ 'runtime_top_p_decay',
160
+ batch_size, batch_index)
161
+ kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
162
+ request, 'beam_search_diversity_rate', batch_size, batch_index)
163
+ kwargs['early_stopping'] = get_input_scalar_by_name(
164
+ request, 'early_stopping', batch_size, batch_index)
165
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
166
+ return trtllm.SamplingConfig(**kwargs)
167
+
168
+
169
+ def get_output_config_from_request(request,
170
+ exclude_input_from_output,
171
+ batch_size=1,
172
+ batch_index=0):
173
+ kwargs = {}
174
+ kwargs["return_log_probs"] = get_input_scalar_by_name(
175
+ request, 'return_log_probs', batch_size, batch_index)
176
+ kwargs["return_context_logits"] = get_input_scalar_by_name(
177
+ request, 'return_context_logits', batch_size, batch_index)
178
+ kwargs["return_generation_logits"] = get_input_scalar_by_name(
179
+ request, 'return_generation_logits', batch_size, batch_index)
180
+ kwargs["exclude_input_from_output"] = exclude_input_from_output
181
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
182
+ return trtllm.OutputConfig(**kwargs)
183
+
184
+
185
+ def get_external_draft_tokens_config_from_request(request,
186
+ batch_size=1,
187
+ batch_index=0):
188
+ kwargs = {}
189
+ draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids',
190
+ batch_size, batch_index)
191
+ if draft_input_ids is not None:
192
+ kwargs['tokens'] = draft_input_ids[0].tolist()
193
+ draft_logits = get_input_tensor_by_name(request, 'draft_logits',
194
+ batch_size, batch_index)
195
+ if draft_logits is not None:
196
+ kwargs['logits'] = from_numpy(draft_logits).squeeze()
197
+ kwargs['acceptance_threshold'] = get_input_scalar_by_name(
198
+ request, 'draft_acceptance_threshold', batch_size, batch_index)
199
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
200
+ if len(kwargs) > 0:
201
+ return trtllm.ExternalDraftTokensConfig(**kwargs)
202
+ return None
203
+
204
+
205
+ def get_prompt_tuning_config_from_request(request,
206
+ batch_size=1,
207
+ batch_index=0):
208
+ # prompt_vocab_size is unused by executor.
209
+ kwargs = {}
210
+ prompt_embedding_table = get_input_tensor_by_name(
211
+ request, 'prompt_embedding_table', batch_size, batch_index)
212
+ if prompt_embedding_table is not None:
213
+ if isinstance(prompt_embedding_table, np.ndarray):
214
+ kwargs["embedding_table"] = from_numpy(
215
+ prompt_embedding_table).squeeze()
216
+ elif isinstance(prompt_embedding_table, torch.Tensor):
217
+ kwargs["embedding_table"] = from_dlpack(
218
+ prompt_embedding_table.to_dlpack()).squeeze(dim=0)
219
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
220
+ if len(kwargs) > 0:
221
+ return trtllm.PromptTuningConfig(**kwargs)
222
+ return None
223
+
224
+
225
+ def get_lora_config_from_request(request, batch_size=1, batch_index=0):
226
+ kwargs = {}
227
+ kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id',
228
+ batch_size, batch_index)
229
+ lora_weights = get_input_tensor_by_name(request, 'lora_weights',
230
+ batch_size, batch_index)
231
+ if lora_weights is not None:
232
+ kwargs["weights"] = from_numpy(lora_weights).squeeze()
233
+ lora_config = get_input_tensor_by_name(request, 'lora_config', batch_size,
234
+ batch_index)
235
+ if lora_config is not None:
236
+ kwargs["config"] = from_numpy(lora_config).squeeze()
237
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
238
+ if len(kwargs) > 0:
239
+ return trtllm.LoraConfig(**kwargs)
240
+ return None
241
+
242
+
243
+ def convert_request(request, exclude_input_from_output, decoupled):
244
+ inputs = {}
245
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids')
246
+ if input_token_ids is None:
247
+ raise pb_utils.TritonModelException(
248
+ "A value is required for input_ids")
249
+ if len(input_token_ids.shape) != 2:
250
+ raise pb_utils.TritonModelException(f"Invalid format for input_ids")
251
+ batch_size = input_token_ids.shape[0]
252
+ requests = []
253
+ for batch_index in range(0, batch_size):
254
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids',
255
+ batch_size, batch_index)[0]
256
+ if input_token_ids is None:
257
+ raise pb_utils.TritonModelException(
258
+ "A value is required for input_ids")
259
+ input_token_ids = input_token_ids.tolist()
260
+ if len(input_token_ids) == 0:
261
+ raise pb_utils.TritonModelException(
262
+ f"Invalid format for input_ids")
263
+
264
+ input_length = get_input_scalar_by_name(request, 'input_lengths',
265
+ batch_size, batch_index)
266
+ if input_length is None:
267
+ input_length = len(input_token_ids)
268
+ # Trim input token ids with input_lengths
269
+ inputs['input_token_ids'] = input_token_ids[0:input_length]
270
+
271
+ inputs['max_new_tokens'] = get_input_scalar_by_name(
272
+ request, 'request_output_len', batch_size, batch_index)
273
+ if inputs['max_new_tokens'] is None:
274
+ raise pb_utils.TritonModelException(
275
+ "A value is required for request_output_len")
276
+ inputs['streaming'] = get_input_scalar_by_name(request, 'streaming',
277
+ batch_size, batch_index)
278
+ if inputs['streaming'] and not decoupled:
279
+ raise pb_utils.TritonModelException(
280
+ "Streaming is only supported in decoupled mode.")
281
+ inputs['end_id'] = get_input_scalar_by_name(request, 'end_id',
282
+ batch_size, batch_index)
283
+ inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id',
284
+ batch_size, batch_index)
285
+ inputs['stop_words'] = convert_word_list(
286
+ get_input_tensor_by_name(request, 'stop_words_list', batch_size,
287
+ batch_index))
288
+ inputs['bad_words'] = convert_word_list(
289
+ get_input_tensor_by_name(request, 'bad_words_list', batch_size,
290
+ batch_index))
291
+ embedding_bias = get_input_tensor_by_name(request, 'embedding_bias',
292
+ batch_size, batch_index)
293
+ if embedding_bias is not None and embedding_bias.size != 0:
294
+ inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze()
295
+
296
+ sampling_config = get_sampling_config_from_request(
297
+ request, batch_size, batch_index)
298
+ output_config = get_output_config_from_request(
299
+ request, exclude_input_from_output, batch_size, batch_index)
300
+ external_draft_tokens_config = get_external_draft_tokens_config_from_request(
301
+ request, batch_size, batch_index)
302
+ prompt_tuning_config = get_prompt_tuning_config_from_request(
303
+ request, batch_size, batch_index)
304
+ lora_config = get_lora_config_from_request(request, batch_size,
305
+ batch_index)
306
+
307
+ requests.append(
308
+ trtllm.Request(
309
+ **inputs,
310
+ sampling_config=sampling_config,
311
+ output_config=output_config,
312
+ external_draft_tokens_config=external_draft_tokens_config,
313
+ prompt_tuning_config=prompt_tuning_config,
314
+ lora_config=lora_config,
315
+ ))
316
+ return requests
317
+
318
+
319
+ def convert_response(response, batch_index):
320
+ if response.has_error():
321
+ return pb_utils.InferenceResponse(output_tensors=[],
322
+ error=pb_utils.TritonError(
323
+ response.error_msg)), True
324
+ result = response.result
325
+ beam_lengths = np.expand_dims(
326
+ np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
327
+ max_beam_length = max([len(beam) for beam in result.output_token_ids])
328
+ output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
329
+ -1, np.int32)
330
+ for idx, beam in enumerate(result.output_token_ids):
331
+ output_ids[0, idx, :len(beam)] = beam
332
+ output_tensors = [
333
+ pb_utils.Tensor("output_ids", output_ids),
334
+ pb_utils.Tensor("sequence_length", beam_lengths),
335
+ ]
336
+ output_tensors.append(
337
+ pb_utils.Tensor(
338
+ "cum_log_probs",
339
+ np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)
340
+ if result.cum_log_probs is not None else np.zeros(
341
+ (1, 1), np.float32)))
342
+ output_tensors.append(
343
+ pb_utils.Tensor(
344
+ "output_log_probs",
345
+ np.expand_dims(np.array(result.log_probs, np.float32), 0) if
346
+ result.log_probs is not None else np.zeros((1, 1, 1), np.float32)))
347
+ output_tensors.append(
348
+ pb_utils.Tensor(
349
+ "context_logits",
350
+ np.expand_dims(np.array(result.context_logits, np.float32), 0)
351
+ if result.context_logits is not None else np.zeros(
352
+ (1, 1, 1), np.float32)))
353
+ output_tensors.append(
354
+ pb_utils.Tensor(
355
+ "generation_logits",
356
+ np.expand_dims(np.array(result.generation_logits, np.float32), 0)
357
+ if result.generation_logits is not None else np.zeros(
358
+ (1, 1, 1, 1), np.float32)))
359
+ output_tensors.append(
360
+ pb_utils.Tensor("batch_index",
361
+ np.expand_dims(np.array([batch_index], np.int32), 0)))
362
+
363
+ return pb_utils.InferenceResponse(output_tensors), result.is_final
364
+
365
+
366
+ def convert_scheduler_policy(batch_scheduler_policy: str):
367
+ if batch_scheduler_policy.lower() == "max_utilization":
368
+ return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
369
+ elif batch_scheduler_policy.lower() == "guaranteed_no_evict":
370
+ return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
371
+ raise pb_utils.TritonModelException(
372
+ f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
373
+ )
374
+
375
+
376
+ def convert_batching_type(gpt_model_type: str):
377
+ if gpt_model_type is None:
378
+ return None
379
+ if gpt_model_type.lower(
380
+ ) == "inflight_fused_batching" or gpt_model_type.lower(
381
+ ) == "inflight_batching":
382
+ return trtllm.BatchingType.INFLIGHT
383
+ elif gpt_model_type.lower() == "v1":
384
+ return trtllm.BatchingType.STATIC
385
+ raise pb_utils.TritonModelException(
386
+ f"gpt_model_type value of '{gpt_model_type}' is not supported.")
387
+
388
+
389
+ def convert_decoding_mode(decoding_mode: str):
390
+ if decoding_mode is None:
391
+ return None
392
+ elif decoding_mode == "auto":
393
+ return trtllm.DecodingMode.Auto()
394
+ elif decoding_mode == "top_k":
395
+ return trtllm.DecodingMode.TopK()
396
+ elif decoding_mode == "top_p":
397
+ return trtllm.DecodingMode.TopP()
398
+ elif decoding_mode == "top_k_top_p":
399
+ return trtllm.DecodingMode.TopKTopP()
400
+ elif decoding_mode == "beam_search":
401
+ return trtllm.DecodingMode.BeamSearch()
402
+ elif decoding_mode == "medusa":
403
+ return trtllm.DecodingMode.Medusa()
404
+ raise pb_utils.TritonModelException(
405
+ f"decoding_mode value of '{decoding_mode}' is not supported.")
406
+
407
+
408
+ def convert_timestamp_to_seconds(timestamp: str):
409
+ return int(
410
+ datetime.datetime.strptime(timestamp,
411
+ "%m-%d-%Y %H:%M:%S.%f").timestamp())
412
+
413
+
414
+ class TritonPythonModel:
415
+ """Your Python model must use the same class name. Every Python model
416
+ that is created must have "TritonPythonModel" as the class name.
417
+ """
418
+
419
+ def get_scheduler_config(self, model_config):
420
+ batch_scheduler_policy = get_parameter(model_config,
421
+ "batch_scheduler_policy")
422
+ if batch_scheduler_policy is None:
423
+ return trtllm.SchedulerConfig()
424
+ return trtllm.SchedulerConfig(
425
+ convert_scheduler_policy(batch_scheduler_policy))
426
+
427
+ def get_kv_cache_config(self, model_config):
428
+ kwargs = {
429
+ "enable_block_reuse":
430
+ get_parameter(model_config, "enable_kv_cache_reuse", bool),
431
+ "max_tokens":
432
+ get_parameter(model_config, "max_tokens_in_paged_kv_cache", int),
433
+ "sink_token_length":
434
+ get_parameter(model_config, "sink_token_length", int),
435
+ "free_gpu_memory_fraction":
436
+ get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
437
+ float),
438
+ "host_cache_size":
439
+ get_parameter(model_config, "kv_cache_host_memory_bytes", int),
440
+ "onboard_blocks":
441
+ get_parameter(model_config, "kv_cache_onboard_blocks", bool),
442
+ }
443
+ max_attention_window_size = get_parameter(model_config,
444
+ "max_attention_window_size")
445
+ if max_attention_window_size:
446
+ kwargs["max_attention_window"] = [
447
+ int(x) for x in max_attention_window_size.split(",")
448
+ ]
449
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
450
+ return trtllm.KvCacheConfig(**kwargs)
451
+
452
+ def get_parallel_config(self, model_config):
453
+ kwargs = {}
454
+ gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
455
+ if gpu_device_ids:
456
+ kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
457
+ self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
458
+ "0") == "1"
459
+ if self.use_orchestrator_mode:
460
+ kwargs[
461
+ "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
462
+ worker_path = get_parameter(model_config, "worker_path")
463
+ if worker_path is not None:
464
+ raise pb_utils.TritonModelException(
465
+ "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
466
+ )
467
+ executor_worker_path = get_parameter(model_config,
468
+ "executor_worker_path")
469
+ kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
470
+ True, executor_worker_path)
471
+ if len(kwargs) > 0:
472
+ return trtllm.ParallelConfig(**kwargs)
473
+ return None
474
+
475
+ def get_peft_cache_config(self, model_config):
476
+ kwargs = {
477
+ "optimal_adapter_size":
478
+ get_parameter(model_config, "lora_cache_optimal_adapter_size",
479
+ int),
480
+ "max_adapter_size":
481
+ get_parameter(model_config, "lora_cache_max_adapter_size", int),
482
+ "device_cache_percent":
483
+ get_parameter(model_config, "lora_cache_gpu_memory_fraction",
484
+ float),
485
+ "host_cache_size":
486
+ get_parameter(model_config, "lora_cache_host_memory_bytes", int),
487
+ }
488
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
489
+ return trtllm.PeftCacheConfig(**kwargs)
490
+
491
+ def get_decoding_config(self, model_config):
492
+ kwargs = {
493
+ "medusa_choices":
494
+ parse_medusa_choices(get_parameter(model_config,
495
+ "medusa_choices")),
496
+ "decoding_mode":
497
+ convert_decoding_mode(get_parameter(model_config,
498
+ "decoding_mode")),
499
+ }
500
+ print(kwargs)
501
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
502
+ return trtllm.DecodingConfig(**kwargs)
503
+
504
+ def get_extended_runtime_perf_knob_config(self, model_config):
505
+ kwargs = {
506
+ "multi_block_mode":
507
+ get_parameter(model_config, "multi_block_mode", bool),
508
+ "enable_context_fmha_fp32_acc":
509
+ get_parameter(model_config, "enable_context_fmha_fp32_acc", bool)
510
+ }
511
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
512
+ return trtllm.ExtendedRuntimePerfKnobConfig(**kwargs)
513
+
514
+ def get_executor_config(self, model_config):
515
+ kwargs = {
516
+ "max_beam_width":
517
+ get_parameter(model_config, "max_beam_width", int),
518
+ "scheduler_config":
519
+ self.get_scheduler_config(model_config),
520
+ "kv_cache_config":
521
+ self.get_kv_cache_config(model_config),
522
+ "enable_chunked_context":
523
+ get_parameter(model_config, "enable_chunked_context", bool),
524
+ "normalize_log_probs":
525
+ get_parameter(model_config, "normalize_log_probs", bool),
526
+ "batching_type":
527
+ convert_batching_type(get_parameter(model_config,
528
+ "gpt_model_type")),
529
+ "parallel_config":
530
+ self.get_parallel_config(model_config),
531
+ "peft_cache_config":
532
+ self.get_peft_cache_config(model_config),
533
+ "decoding_config":
534
+ self.get_decoding_config(model_config),
535
+ "max_queue_size":
536
+ model_config.get(
537
+ "dynamic_batching",
538
+ {},
539
+ ).get(
540
+ "default_queue_policy",
541
+ {},
542
+ ).get("max_queue_size"),
543
+ "extended_runtime_perf_knob_config":
544
+ self.get_extended_runtime_perf_knob_config(model_config)
545
+ }
546
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
547
+ return trtllm.ExecutorConfig(**kwargs)
548
+
549
+ def create_metrics(self, model: str, version: str, is_v1_model: bool):
550
+ self.request_metric_family = pb_utils.MetricFamily(
551
+ name="nv_trt_llm_request_metrics",
552
+ description="TRT LLM request metrics",
553
+ kind=pb_utils.MetricFamily.GAUGE,
554
+ )
555
+ self.runtime_memory_metric_family = pb_utils.MetricFamily(
556
+ name="nv_trt_llm_runtime_memory_metrics",
557
+ description="TRT LLM runtime memory metrics",
558
+ kind=pb_utils.MetricFamily.GAUGE,
559
+ )
560
+ self.kv_cache_metric_family = pb_utils.MetricFamily(
561
+ name="nv_trt_llm_kv_cache_block_metrics",
562
+ description="TRT LLM KV cache block metrics",
563
+ kind=pb_utils.MetricFamily.GAUGE,
564
+ )
565
+ model_type = "v1" if is_v1_model else "inflight_batcher"
566
+ self.model_type_metric_family = pb_utils.MetricFamily(
567
+ name=f"nv_trt_llm_{model_type}_metrics",
568
+ description=f"TRT LLM {model_type}-specific metrics",
569
+ kind=pb_utils.MetricFamily.GAUGE,
570
+ )
571
+ self.general_metric_family = pb_utils.MetricFamily(
572
+ name="nv_trt_llm_general_metrics",
573
+ description="General TRT LLM metrics",
574
+ kind=pb_utils.MetricFamily.GAUGE,
575
+ )
576
+ common_labels = {"model": model, "version": version}
577
+ self.all_metrics = {
578
+ # Request metrics
579
+ "num_active_requests":
580
+ self.request_metric_family.Metric(labels={
581
+ "request_type": "active",
582
+ **common_labels
583
+ }),
584
+ "max_num_active_requests":
585
+ self.request_metric_family.Metric(labels={
586
+ "request_type": "max",
587
+ **common_labels
588
+ }),
589
+ "num_scheduled_requests":
590
+ self.request_metric_family.Metric(labels={
591
+ "request_type": "scheduled",
592
+ **common_labels
593
+ }),
594
+ "num_context_requests":
595
+ self.request_metric_family.Metric(labels={
596
+ "request_type": "context",
597
+ **common_labels
598
+ }),
599
+ # Runtime metrics
600
+ "cpu_mem_usage":
601
+ self.runtime_memory_metric_family.Metric(labels={
602
+ "memory_type": "cpu",
603
+ **common_labels
604
+ }),
605
+ "gpu_mem_usage":
606
+ self.runtime_memory_metric_family.Metric(labels={
607
+ "memory_type": "gpu",
608
+ **common_labels
609
+ }),
610
+ "pinned_mem_usage":
611
+ self.runtime_memory_metric_family.Metric(labels={
612
+ "memory_type": "pinned",
613
+ **common_labels
614
+ }),
615
+ # KV cache metrics
616
+ "max_num_blocks":
617
+ self.kv_cache_metric_family.Metric(labels={
618
+ "kv_cache_block_type": "max",
619
+ **common_labels
620
+ }),
621
+ "free_num_blocks":
622
+ self.kv_cache_metric_family.Metric(labels={
623
+ "kv_cache_block_type": "free",
624
+ **common_labels
625
+ }),
626
+ "used_num_blocks":
627
+ self.kv_cache_metric_family.Metric(labels={
628
+ "kv_cache_block_type": "used",
629
+ **common_labels
630
+ }),
631
+ "tokens_per_block":
632
+ self.kv_cache_metric_family.Metric(labels={
633
+ "kv_cache_block_type": "tokens_per",
634
+ **common_labels
635
+ }),
636
+ # General metrics
637
+ "timestamp":
638
+ self.general_metric_family.Metric(labels={
639
+ "general_type": "timestamp",
640
+ **common_labels
641
+ }),
642
+ "iter":
643
+ self.general_metric_family.Metric(labels={
644
+ "general_type": "iteration_counter",
645
+ **common_labels
646
+ }),
647
+ }
648
+ if is_v1_model:
649
+ self.all_metrics.update({
650
+ "num_ctx_tokens":
651
+ self.model_type_metric_family.Metric(labels={
652
+ "v1_specific_metric": "total_context_tokens",
653
+ **common_labels
654
+ }),
655
+ "num_gen_tokens":
656
+ self.model_type_metric_family.Metric(
657
+ labels={
658
+ "v1_specific_metric": "total_generation_tokens",
659
+ **common_labels
660
+ }),
661
+ "empty_gen_slots":
662
+ self.model_type_metric_family.Metric(
663
+ labels={
664
+ "v1_specific_metric": "empty_generation_slots",
665
+ **common_labels
666
+ }),
667
+ })
668
+ else:
669
+ self.all_metrics.update({
670
+ "num_ctx_tokens":
671
+ self.model_type_metric_family.Metric(
672
+ labels={
673
+ "inflight_batcher_specific_metric":
674
+ "total_context_tokens",
675
+ **common_labels
676
+ }),
677
+ "num_gen_requests":
678
+ self.model_type_metric_family.Metric(
679
+ labels={
680
+ "inflight_batcher_specific_metric":
681
+ "generation_requests",
682
+ **common_labels
683
+ }),
684
+ "micro_batch_id":
685
+ self.model_type_metric_family.Metric(
686
+ labels={
687
+ "inflight_batcher_specific_metric": "micro_batch_id",
688
+ **common_labels
689
+ }),
690
+ "num_paused_requests":
691
+ self.model_type_metric_family.Metric(
692
+ labels={
693
+ "inflight_batcher_specific_metric": "paused_requests",
694
+ **common_labels
695
+ }),
696
+ })
697
+
698
+ def initialize(self, args):
699
+ """`initialize` is called only once when the model is being loaded.
700
+ Implementing `initialize` function is optional. This function allows
701
+ the model to initialize any state associated with this model.
702
+
703
+ Parameters
704
+ ----------
705
+ args : dict
706
+ Both keys and values are strings. The dictionary keys and values are:
707
+ * model_config: A JSON string containing the model configuration
708
+ * model_instance_kind: A string containing model instance kind
709
+ * model_instance_device_id: A string containing model instance device ID
710
+ * model_repository: Model repository path
711
+ * model_version: Model version
712
+ * model_name: Model name
713
+ """
714
+ model_config = json.loads(args['model_config'])
715
+ gpt_model_path = get_parameter(model_config, "gpt_model_path")
716
+ if get_parameter(model_config, "enable_trt_overlap", bool):
717
+ raise pb_utils.TritonModelException(
718
+ f"enable_trt_overlap=true is not supported.")
719
+ self.exclude_input_from_output = get_parameter(
720
+ model_config, "exclude_input_in_output", bool)
721
+ executor_config = self.get_executor_config(model_config)
722
+ self.executor = trtllm.Executor(gpt_model_path,
723
+ trtllm.ModelType.DECODER_ONLY,
724
+ executor_config)
725
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
726
+ model_config)
727
+ self.cancellation_check_period_ms = get_parameter(
728
+ model_config, "cancellation_check_period_ms", int) or 100
729
+ self.stats_check_period_ms = get_parameter(
730
+ model_config, "stats_check_period_ms", int) or 100
731
+
732
+ if not self.decoupled:
733
+ raise pb_utils.TritonModelException(
734
+ "Please enable decoupled transaction policy in the model configuration to serve this model"
735
+ )
736
+
737
+ self.create_metrics(args["model_name"],
738
+ args["model_version"],
739
+ is_v1_model=executor_config.batching_type ==
740
+ trtllm.BatchingType.STATIC)
741
+ self.triton_user_id_to_req_ids = {}
742
+ self.triton_req_id_to_req_ids = {}
743
+ self.req_id_to_request_data = {}
744
+ self.lock = Lock()
745
+ self.running = False
746
+ self.awaiter_thread = Thread(target=self.awaiter_loop)
747
+ self.cancellation_thread = Thread(target=self.cancellation_loop)
748
+ self.metrics_thread = Thread(target=self.metrics_loop)
749
+ if self.executor.can_enqueue_requests():
750
+ self.running = True
751
+ self.awaiter_thread.start()
752
+ self.cancellation_thread.start()
753
+ self.metrics_thread.start()
754
+ else:
755
+ # In leader mode, worker ranks will wait here until leader is done.
756
+ self.executor.shutdown()
757
+
758
+ def handle_stop_request(self, triton_user_id, response_sender):
759
+ if triton_user_id is None or triton_user_id == "":
760
+ response_sender.send(
761
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
762
+ "A request id must be provided for request cancellation")),
763
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
764
+ return
765
+
766
+ with self.lock:
767
+ if triton_user_id in self.triton_user_id_to_req_ids:
768
+ req_ids = self.triton_user_id_to_req_ids[triton_user_id]
769
+ for req_id in req_ids:
770
+ self.executor.cancel_request(req_id)
771
+
772
+ response_sender.send(
773
+ pb_utils.InferenceResponse(),
774
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
775
+
776
+ def execute(self, requests):
777
+ """`execute` must be implemented in every Python model. `execute`
778
+ function receives a list of pb_utils.InferenceRequest as the only
779
+ argument. This function is called when an inference is requested
780
+ for this model.
781
+
782
+ Parameters
783
+ ----------
784
+ requests : list
785
+ A list of pb_utils.InferenceRequest
786
+
787
+ Returns
788
+ -------
789
+ list
790
+ A list of pb_utils.InferenceResponse. The length of this list must
791
+ be the same as `requests`
792
+ """
793
+ if not self.executor.can_enqueue_requests():
794
+ return
795
+
796
+ # Convert to executor requests.
797
+
798
+ triton_requests = []
799
+ executor_requests = []
800
+ batch_indices = []
801
+ triton_user_ids = []
802
+ triton_req_ids = []
803
+
804
+ for request in requests:
805
+
806
+ triton_user_id = request.request_id()
807
+
808
+ response_sender = request.get_response_sender()
809
+ stop = get_input_scalar_by_name(request, 'stop')
810
+
811
+ if stop:
812
+ self.handle_stop_request(triton_user_id, response_sender)
813
+ else:
814
+ # Unique request id used to identify each triton request
815
+ triton_req_id = str(randint(0, sys.maxsize))
816
+ self.triton_req_id_to_req_ids[triton_req_id] = set()
817
+ if triton_user_id is not None and triton_user_id != "":
818
+ self.triton_user_id_to_req_ids[triton_user_id] = set()
819
+
820
+ try:
821
+ converted_reqs = convert_request(
822
+ request, self.exclude_input_from_output,
823
+ self.decoupled)
824
+ except Exception as e:
825
+ response_sender.send(
826
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
827
+ f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
828
+ )),
829
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
830
+ else:
831
+ for batch_index, converted_req in enumerate(
832
+ converted_reqs):
833
+ triton_requests.append(request)
834
+ executor_requests.append(converted_req)
835
+ triton_user_ids.append(triton_user_id)
836
+ triton_req_ids.append(triton_req_id)
837
+ batch_indices.append(batch_index)
838
+
839
+ with self.lock:
840
+ request_ids = self.executor.enqueue_requests(executor_requests)
841
+ for req_id, triton_req_id, triton_user_id, triton_request, batch_index in zip(
842
+ request_ids, triton_req_ids, triton_user_ids,
843
+ triton_requests, batch_indices):
844
+ self.req_id_to_request_data[
845
+ req_id] = triton_req_id, triton_user_id, batch_index, triton_request.get_response_sender(
846
+ )
847
+ self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
848
+ if triton_user_id is not None and triton_user_id != "":
849
+ self.triton_user_id_to_req_ids[triton_user_id].add(req_id)
850
+
851
+ return None
852
+
853
+ def awaiter_loop(self):
854
+ """Gets responses from executor and returns the results."""
855
+ while self.running:
856
+ for response in self.executor.await_responses(
857
+ timeout=datetime.timedelta(milliseconds=1)):
858
+ req_id = response.request_id
859
+ with self.lock:
860
+ if req_id not in self.req_id_to_request_data:
861
+ continue
862
+ triton_req_id, triton_user_id, batch_index, response_sender = self.req_id_to_request_data[
863
+ req_id]
864
+
865
+ triton_response, is_final = convert_response(
866
+ response, batch_index)
867
+
868
+ triton_request_final = False
869
+ if is_final:
870
+ with self.lock:
871
+ # Check if all executor requests part of that triton request are finished
872
+ self.triton_req_id_to_req_ids[triton_req_id].remove(
873
+ req_id)
874
+ if len(self.triton_req_id_to_req_ids[triton_req_id]
875
+ ) == 0:
876
+ pb_utils.Logger.log_info(
877
+ f"DELETING Req id {req_id}, triton_req_id {triton_req_id} "
878
+ )
879
+ triton_request_final = True
880
+ del self.triton_req_id_to_req_ids[triton_req_id]
881
+ if triton_user_id is not None and triton_user_id != "":
882
+ del self.triton_user_id_to_req_ids[
883
+ triton_user_id]
884
+ del self.req_id_to_request_data[req_id]
885
+
886
+ response_sender.send(
887
+ triton_response,
888
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
889
+ if triton_request_final else 0)
890
+
891
+ # Remove local reference so response_sender can be cleaned properly.
892
+ del response_sender
893
+
894
+ def cancellation_loop(self):
895
+ """Checks if any pending requests have been cancelled."""
896
+ while self.running:
897
+ time.sleep(self.cancellation_check_period_ms / 1000.0)
898
+ with self.lock:
899
+ for req_id, (triton_req_id, triton_user_id, batch_index,
900
+ response_sender
901
+ ) in self.req_id_to_request_data.items():
902
+ if response_sender.is_cancelled():
903
+ self.executor.cancel_request(req_id)
904
+ # Remove local reference so response_sender can be cleaned properly.
905
+ del response_sender
906
+
907
+ def metrics_loop(self):
908
+ """Updates triton metrics using stats from the executor."""
909
+ while self.running:
910
+ time.sleep(self.stats_check_period_ms / 1000.0)
911
+ for stat in self.executor.get_latest_iteration_stats():
912
+ try:
913
+ for key, metric in self.all_metrics.items():
914
+ value = None
915
+ if hasattr(stat, key):
916
+ value = getattr(stat, key)
917
+ elif stat.kv_cache_stats is not None and hasattr(
918
+ stat.kv_cache_stats, key):
919
+ value = getattr(stat.kv_cache_stats, key)
920
+ elif stat.static_batching_stats is not None and hasattr(
921
+ stat.static_batching_stats, key):
922
+ value = getattr(stat.static_batching_stats, key)
923
+ elif stat.inflight_batching_stats is not None and hasattr(
924
+ stat.inflight_batching_stats, key):
925
+ value = getattr(stat.inflight_batching_stats, key)
926
+ if value is not None:
927
+ if key == "timestamp":
928
+ value = convert_timestamp_to_seconds(value)
929
+ metric.set(value)
930
+ else:
931
+ pb_utils.Logger.log_warn(
932
+ f"Metric \"{key}\" not found.")
933
+ except Exception as e:
934
+ pb_utils.Logger.log_warn(
935
+ f"Error while processing metrics: {e}")
936
+
937
+ def finalize(self):
938
+ """`finalize` is called only once when the model is being unloaded.
939
+ Implementing `finalize` function is optional. This function allows
940
+ the model to perform any necessary clean ups before exit.
941
+ """
942
+ if self.executor.can_enqueue_requests():
943
+ self.running = False
944
+ self.awaiter_thread.join()
945
+ self.cancellation_thread.join()
946
+ self.metrics_thread.join()
947
+ self.executor.shutdown()
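model.py above is essentially a bridge between Triton requests and the TensorRT-LLM executor bindings: initialize() builds a trtllm.Executor, execute() converts each Triton request into trtllm.Request objects, and awaiter_loop() polls await_responses() and forwards the results. The same flow can be exercised without Triton. The sketch below is not part of this repository; it only reuses the executor calls that appear in the file, and the engine path, prompt token ids and sampling values are placeholder assumptions:

import datetime

import tensorrt_llm.bindings.executor as trtllm

# Build an executor for the decoder-only engine, as initialize() does above.
# "tensorrt_llm/1" is assumed to be the engine directory from this repository.
executor = trtllm.Executor("tensorrt_llm/1", trtllm.ModelType.DECODER_ONLY,
                           trtllm.ExecutorConfig())

if executor.can_enqueue_requests():
    # A single request, shaped the way convert_request() builds them.
    # The token ids below are placeholders, not a real tokenized prompt.
    request = trtllm.Request(
        input_token_ids=[1, 2, 3, 4],
        max_new_tokens=16,
        sampling_config=trtllm.SamplingConfig(beam_width=1, temperature=0.7),
        output_config=trtllm.OutputConfig(exclude_input_from_output=True),
    )
    request_ids = executor.enqueue_requests([request])

    # Poll for responses the same way awaiter_loop() does.
    finished = set()
    while len(finished) < len(request_ids):
        for response in executor.await_responses(
                timeout=datetime.timedelta(milliseconds=1)):
            if response.has_error():
                raise RuntimeError(response.error_msg)
            result = response.result
            if result.is_final:
                finished.add(response.request_id)
                print(result.output_token_ids)

executor.shutdown()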
tensorrt_llm/1/rank0.engine ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b8418460f6786395ac4ace17e6dafad6c2b60a021fb247da853718db2c4fd13
3
+ size 1065214420
tensorrt_llm/1/rank1.engine ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40598916cbd21bcfa434ae02004e9e8b1d6f50445a3f5bdd4bb3971634072cf
3
+ size 1065215172
tensorrt_llm/config.pbtxt ADDED
@@ -0,0 +1,556 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm"
28
+ backend: "tensorrtllm"
29
+ max_batch_size: 32
30
+
31
+ model_transaction_policy {
32
+ decoupled: True
33
+ }
34
+
35
+ input [
36
+ {
37
+ name: "input_ids"
38
+ data_type: TYPE_INT32
39
+ dims: [ -1 ]
40
+ allow_ragged_batch: true
41
+ },
42
+ {
43
+ name: "input_lengths"
44
+ data_type: TYPE_INT32
45
+ dims: [ 1 ]
46
+ reshape: { shape: [ ] }
47
+ },
48
+ {
49
+ name: "request_output_len"
50
+ data_type: TYPE_INT32
51
+ dims: [ 1 ]
52
+ reshape: { shape: [ ] }
53
+ },
54
+ {
55
+ name: "draft_input_ids"
56
+ data_type: TYPE_INT32
57
+ dims: [ -1 ]
58
+ optional: true
59
+ allow_ragged_batch: true
60
+ },
61
+ {
62
+ name: "decoder_input_ids"
63
+ data_type: TYPE_INT32
64
+ dims: [ -1 ]
65
+ optional: true
66
+ allow_ragged_batch: true
67
+ },
68
+ {
69
+ name: "decoder_input_lengths"
70
+ data_type: TYPE_INT32
71
+ dims: [ 1 ]
72
+ optional: true
73
+ reshape: { shape: [ ] }
74
+ },
75
+ {
76
+ name: "draft_logits"
77
+ data_type: TYPE_FP32
78
+ dims: [ -1, -1 ]
79
+ optional: true
80
+ allow_ragged_batch: true
81
+ },
82
+ {
83
+ name: "draft_acceptance_threshold"
84
+ data_type: TYPE_FP32
85
+ dims: [ 1 ]
86
+ reshape: { shape: [ ] }
87
+ optional: true
88
+ },
89
+ {
90
+ name: "end_id"
91
+ data_type: TYPE_INT32
92
+ dims: [ 1 ]
93
+ reshape: { shape: [ ] }
94
+ optional: true
95
+ },
96
+ {
97
+ name: "pad_id"
98
+ data_type: TYPE_INT32
99
+ dims: [ 1 ]
100
+ reshape: { shape: [ ] }
101
+ optional: true
102
+ },
103
+ {
104
+ name: "stop_words_list"
105
+ data_type: TYPE_INT32
106
+ dims: [ 2, -1 ]
107
+ optional: true
108
+ allow_ragged_batch: true
109
+ },
110
+ {
111
+ name: "bad_words_list"
112
+ data_type: TYPE_INT32
113
+ dims: [ 2, -1 ]
114
+ optional: true
115
+ allow_ragged_batch: true
116
+ },
117
+ {
118
+ name: "embedding_bias"
119
+ data_type: TYPE_FP32
120
+ dims: [ -1 ]
121
+ optional: true
122
+ allow_ragged_batch: true
123
+ },
124
+ {
125
+ name: "beam_width"
126
+ data_type: TYPE_INT32
127
+ dims: [ 1 ]
128
+ reshape: { shape: [ ] }
129
+ optional: true
130
+ },
131
+ {
132
+ name: "temperature"
133
+ data_type: TYPE_FP32
134
+ dims: [ 1 ]
135
+ reshape: { shape: [ ] }
136
+ optional: true
137
+ },
138
+ {
139
+ name: "runtime_top_k"
140
+ data_type: TYPE_INT32
141
+ dims: [ 1 ]
142
+ reshape: { shape: [ ] }
143
+ optional: true
144
+ },
145
+ {
146
+ name: "runtime_top_p"
147
+ data_type: TYPE_FP32
148
+ dims: [ 1 ]
149
+ reshape: { shape: [ ] }
150
+ optional: true
151
+ },
152
+ {
153
+ name: "runtime_top_p_min"
154
+ data_type: TYPE_FP32
155
+ dims: [ 1 ]
156
+ reshape: { shape: [ ] }
157
+ optional: true
158
+ },
159
+ {
160
+ name: "runtime_top_p_decay"
161
+ data_type: TYPE_FP32
162
+ dims: [ 1 ]
163
+ reshape: { shape: [ ] }
164
+ optional: true
165
+ },
166
+ {
167
+ name: "runtime_top_p_reset_ids"
168
+ data_type: TYPE_INT32
169
+ dims: [ 1 ]
170
+ reshape: { shape: [ ] }
171
+ optional: true
172
+ },
173
+ {
174
+ name: "len_penalty"
175
+ data_type: TYPE_FP32
176
+ dims: [ 1 ]
177
+ reshape: { shape: [ ] }
178
+ optional: true
179
+ },
180
+ {
181
+ name: "early_stopping"
182
+ data_type: TYPE_BOOL
183
+ dims: [ 1 ]
184
+ reshape: { shape: [ ] }
185
+ optional: true
186
+ },
187
+ {
188
+ name: "repetition_penalty"
189
+ data_type: TYPE_FP32
190
+ dims: [ 1 ]
191
+ reshape: { shape: [ ] }
192
+ optional: true
193
+ },
194
+ {
195
+ name: "min_length"
196
+ data_type: TYPE_INT32
197
+ dims: [ 1 ]
198
+ reshape: { shape: [ ] }
199
+ optional: true
200
+ },
201
+ {
202
+ name: "beam_search_diversity_rate"
203
+ data_type: TYPE_FP32
204
+ dims: [ 1 ]
205
+ reshape: { shape: [ ] }
206
+ optional: true
207
+ },
208
+ {
209
+ name: "presence_penalty"
210
+ data_type: TYPE_FP32
211
+ dims: [ 1 ]
212
+ reshape: { shape: [ ] }
213
+ optional: true
214
+ },
215
+ {
216
+ name: "frequency_penalty"
217
+ data_type: TYPE_FP32
218
+ dims: [ 1 ]
219
+ reshape: { shape: [ ] }
220
+ optional: true
221
+ },
222
+ {
223
+ name: "random_seed"
224
+ data_type: TYPE_UINT64
225
+ dims: [ 1 ]
226
+ reshape: { shape: [ ] }
227
+ optional: true
228
+ },
229
+ {
230
+ name: "return_log_probs"
231
+ data_type: TYPE_BOOL
232
+ dims: [ 1 ]
233
+ reshape: { shape: [ ] }
234
+ optional: true
235
+ },
236
+ {
237
+ name: "return_context_logits"
238
+ data_type: TYPE_BOOL
239
+ dims: [ 1 ]
240
+ reshape: { shape: [ ] }
241
+ optional: true
242
+ },
243
+ {
244
+ name: "return_generation_logits"
245
+ data_type: TYPE_BOOL
246
+ dims: [ 1 ]
247
+ reshape: { shape: [ ] }
248
+ optional: true
249
+ },
250
+ {
251
+ name: "stop"
252
+ data_type: TYPE_BOOL
253
+ dims: [ 1 ]
254
+ reshape: { shape: [ ] }
255
+ optional: true
256
+ },
257
+ {
258
+ name: "streaming"
259
+ data_type: TYPE_BOOL
260
+ dims: [ 1 ]
261
+ reshape: { shape: [ ] }
262
+ optional: true
263
+ },
264
+ {
265
+ name: "prompt_embedding_table"
266
+ data_type: TYPE_FP16
267
+ dims: [ -1, -1 ]
268
+ optional: true
269
+ allow_ragged_batch: true
270
+ },
271
+ {
272
+ name: "prompt_vocab_size"
273
+ data_type: TYPE_INT32
274
+ dims: [ 1 ]
275
+ reshape: { shape: [ ] }
276
+ optional: true
277
+ },
278
+ # the unique task ID for the given LoRA.
279
+ # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
280
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
281
+ # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
282
+ {
283
+ name: "lora_task_id"
284
+ data_type: TYPE_UINT64
285
+ dims: [ 1 ]
286
+ reshape: { shape: [ ] }
287
+ optional: true
288
+ },
289
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
290
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
291
+ # each of the in / out tensors are first flattened and then concatenated together in the format above.
292
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
293
+ {
294
+ name: "lora_weights"
295
+ data_type: TYPE_FP16
296
+ dims: [ -1, -1 ]
297
+ optional: true
298
+ allow_ragged_batch: true
299
+ },
300
+ # module identifier (same size as the first dimension of lora_weights)
301
+ # See LoraModule::ModuleType for model id mapping
302
+ #
303
+ # "attn_qkv": 0 # compbined qkv adapter
304
+ # "attn_q": 1 # q adapter
305
+ # "attn_k": 2 # k adapter
306
+ # "attn_v": 3 # v adapter
307
+ # "attn_dense": 4 # adapter for the dense layer in attention
308
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
309
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
310
+ # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
311
+ #
312
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
313
+ {
314
+ name: "lora_config"
315
+ data_type: TYPE_INT32
316
+ dims: [ -1, 3 ]
317
+ optional: true
318
+ allow_ragged_batch: true
319
+ }
320
+ ]
321
+ output [
322
+ {
323
+ name: "output_ids"
324
+ data_type: TYPE_INT32
325
+ dims: [ -1, -1 ]
326
+ },
327
+ {
328
+ name: "sequence_length"
329
+ data_type: TYPE_INT32
330
+ dims: [ -1 ]
331
+ },
332
+ {
333
+ name: "cum_log_probs"
334
+ data_type: TYPE_FP32
335
+ dims: [ -1 ]
336
+ },
337
+ {
338
+ name: "output_log_probs"
339
+ data_type: TYPE_FP32
340
+ dims: [ -1, -1 ]
341
+ },
342
+ {
343
+ name: "context_logits"
344
+ data_type: TYPE_FP32
345
+ dims: [ -1, -1 ]
346
+ },
347
+ {
348
+ name: "generation_logits"
349
+ data_type: TYPE_FP32
350
+ dims: [ -1, -1, -1 ]
351
+ },
352
+ {
353
+ name: "batch_index"
354
+ data_type: TYPE_INT32
355
+ dims: [ 1 ]
356
+ }
357
+ ]
358
+ instance_group [
359
+ {
360
+ count: 1
361
+ kind : KIND_CPU
362
+ }
363
+ ]
364
+ parameters: {
365
+ key: "max_beam_width"
366
+ value: {
367
+ string_value: "1"
368
+ }
369
+ }
370
+ parameters: {
371
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
372
+ value: {
373
+ string_value: "no"
374
+ }
375
+ }
376
+ parameters: {
377
+ key: "gpt_model_type"
378
+ value: {
379
+ string_value: "inflight_fused_batching"
380
+ }
381
+ }
382
+ parameters: {
383
+ key: "gpt_model_path"
384
+ value: {
385
+ string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
386
+ }
387
+ }
388
+ parameters: {
389
+ key: "encoder_model_path"
390
+ value: {
391
+ string_value: "${encoder_engine_dir}"
392
+ }
393
+ }
394
+ parameters: {
395
+ key: "max_tokens_in_paged_kv_cache"
396
+ value: {
397
+ string_value: "${max_tokens_in_paged_kv_cache}"
398
+ }
399
+ }
400
+ parameters: {
401
+ key: "max_attention_window_size"
402
+ value: {
403
+ string_value: "${max_attention_window_size}"
404
+ }
405
+ }
406
+ parameters: {
407
+ key: "sink_token_length"
408
+ value: {
409
+ string_value: "${sink_token_length}"
410
+ }
411
+ }
412
+ parameters: {
413
+ key: "batch_scheduler_policy"
414
+ value: {
415
+ string_value: "guaranteed_no_evict"
416
+ }
417
+ }
418
+ parameters: {
419
+ key: "kv_cache_free_gpu_mem_fraction"
420
+ value: {
421
+ string_value: "0.1"
422
+ }
423
+ }
424
+ parameters: {
425
+ key: "kv_cache_host_memory_bytes"
426
+ value: {
427
+ string_value: "${kv_cache_host_memory_bytes}"
428
+ }
429
+ }
430
+ parameters: {
431
+ key: "kv_cache_onboard_blocks"
432
+ value: {
433
+ string_value: "${kv_cache_onboard_blocks}"
434
+ }
435
+ }
436
+ # enable_trt_overlap is deprecated and doesn't have any effect on the runtime
437
+ # parameters: {
438
+ # key: "enable_trt_overlap"
439
+ # value: {
440
+ # string_value: "${enable_trt_overlap}"
441
+ # }
442
+ # }
443
+ parameters: {
444
+ key: "exclude_input_in_output"
445
+ value: {
446
+ string_value: "True"
447
+ }
448
+ }
449
+ parameters: {
450
+ key: "cancellation_check_period_ms"
451
+ value: {
452
+ string_value: "${cancellation_check_period_ms}"
453
+ }
454
+ }
455
+ parameters: {
456
+ key: "stats_check_period_ms"
457
+ value: {
458
+ string_value: "${stats_check_period_ms}"
459
+ }
460
+ }
461
+ parameters: {
462
+ key: "iter_stats_max_iterations"
463
+ value: {
464
+ string_value: "${iter_stats_max_iterations}"
465
+ }
466
+ }
467
+ parameters: {
468
+ key: "request_stats_max_iterations"
469
+ value: {
470
+ string_value: "${request_stats_max_iterations}"
471
+ }
472
+ }
473
+ parameters: {
474
+ key: "enable_kv_cache_reuse"
475
+ value: {
476
+ string_value: "True"
477
+ }
478
+ }
479
+ parameters: {
480
+ key: "normalize_log_probs"
481
+ value: {
482
+ string_value: "${normalize_log_probs}"
483
+ }
484
+ }
485
+ parameters: {
486
+ key: "enable_chunked_context"
487
+ value: {
488
+ string_value: "${enable_chunked_context}"
489
+ }
490
+ }
491
+ parameters: {
492
+ key: "gpu_device_ids"
493
+ value: {
494
+ string_value: "0,1"
495
+ }
496
+ }
497
+ parameters: {
498
+ key: "lora_cache_optimal_adapter_size"
499
+ value: {
500
+ string_value: "${lora_cache_optimal_adapter_size}"
501
+ }
502
+ }
503
+ parameters: {
504
+ key: "lora_cache_max_adapter_size"
505
+ value: {
506
+ string_value: "${lora_cache_max_adapter_size}"
507
+ }
508
+ }
509
+ parameters: {
510
+ key: "lora_cache_gpu_memory_fraction"
511
+ value: {
512
+ string_value: "${lora_cache_gpu_memory_fraction}"
513
+ }
514
+ }
515
+ parameters: {
516
+ key: "lora_cache_host_memory_bytes"
517
+ value: {
518
+ string_value: "${lora_cache_host_memory_bytes}"
519
+ }
520
+ }
521
+ parameters: {
522
+ key: "decoding_mode"
523
+ value: {
524
+ string_value: "top_k_top_p"
525
+ }
526
+ }
527
+ parameters: {
528
+ key: "executor_worker_path"
529
+ value: {
530
+ string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
531
+ }
532
+ }
533
+ parameters: {
534
+ key: "medusa_choices"
535
+ value: {
536
+ string_value: "${medusa_choices}"
537
+ }
538
+ }
539
+ parameters: {
540
+ key: "gpu_weights_percent"
541
+ value: {
542
+ string_value: "${gpu_weights_percent}"
543
+ }
544
+ }
545
+ parameters: {
546
+ key: "enable_context_fmha_fp32_acc"
547
+ value: {
548
+ string_value: "${enable_context_fmha_fp32_acc}"
549
+ }
550
+ }
551
+ parameters: {
552
+ key: "multi_block_mode"
553
+ value: {
554
+ string_value: "${multi_block_mode}"
555
+ }
556
+ }
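Because model_transaction_policy sets decoupled: True, this tensorrt_llm model has to be driven through Triton's streaming gRPC API rather than a plain synchronous infer call. The client-side sketch below is not part of this commit: it assumes the tritonclient[grpc] package and a server on localhost:8001, and uses placeholder token ids (in practice requests normally go through the BLS model below, which handles pre- and post-processing); only the tensor names and dtypes come from the config above.

from functools import partial
import queue

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

def make_input(name, data):
    tensor = grpcclient.InferInput(name, list(data.shape),
                                   np_to_triton_dtype(data.dtype))
    tensor.set_data_from_numpy(data)
    return tensor

responses = queue.Queue()

def callback(result_queue, result, error):
    # Every streamed response (or error) from the decoupled model lands here.
    result_queue.put(error if error is not None else result)

# Placeholder token ids; a real client would tokenize its prompt first.
input_ids = np.array([[1, 2, 3, 4]], dtype=np.int32)
inputs = [
    make_input("input_ids", input_ids),
    make_input("input_lengths", np.array([[input_ids.shape[1]]], dtype=np.int32)),
    make_input("request_output_len", np.array([[16]], dtype=np.int32)),
    make_input("streaming", np.array([[True]], dtype=bool)),
]

client = grpcclient.InferenceServerClient("localhost:8001")
client.start_stream(callback=partial(callback, responses))
client.async_stream_infer("tensorrt_llm", inputs)

# A real client would keep reading until the final-response flag; this sketch
# only looks at the first streamed chunk.
first = responses.get()
if not isinstance(first, Exception):
    print(first.as_numpy("output_ids"))

client.stop_stream()
client.close()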
tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (3.15 kB).
 
tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc ADDED
Binary file (10.3 kB).
 
tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc ADDED
Binary file (11.5 kB).
 
tensorrt_llm_bls/1/lib/decode.py ADDED
@@ -0,0 +1,386 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ from collections.abc import Generator
28
+ from dataclasses import dataclass
29
+ from typing import Optional
30
+
31
+ import numpy as np
32
+ import torch
33
+
34
+
35
+ class RequestValidationError(Exception):
36
+ pass
37
+
38
+
39
+ def _validate_that(condition: bool, msg: str):
40
+ if not condition:
41
+ raise RequestValidationError(msg)
42
+
43
+
44
+ def _validate_non_empty(data, msg: str):
45
+ if isinstance(data, torch.Tensor):
46
+ _validate_that(data is not None and data.numel() > 0, msg)
47
+ else:
48
+ _validate_that(data is not None and data.size > 0, msg)
49
+
50
+
51
+ def _validate_single_gt_0(data, msg: str):
52
+ _validate_non_empty(data, msg)
53
+ _validate_that(data.flatten()[0] > 0, msg)
54
+
55
+
56
+ def _single_value(data: Optional[np.ndarray]):
57
+ if data is None:
58
+ return None
59
+ return data.flatten()[0]
60
+
61
+
62
+ @dataclass
63
+ class Request:
64
+ text_input: np.ndarray = np.array([])
65
+ decoder_text_input: np.ndarray = None
66
+ image_input: Optional[np.ndarray] = None
67
+ max_tokens: Optional[np.ndarray] = None
68
+ bad_words: Optional[np.ndarray] = None
69
+ stop_words: Optional[np.ndarray] = None
70
+ end_id: Optional[np.ndarray] = None
71
+ pad_id: Optional[np.ndarray] = None
72
+ top_k: Optional[np.ndarray] = None
73
+ top_p: Optional[np.ndarray] = None
74
+ temperature: Optional[np.ndarray] = None
75
+ length_penalty: Optional[np.ndarray] = None
76
+ repetition_penalty: Optional[np.ndarray] = None
77
+ min_length: Optional[np.ndarray] = None
78
+ return_log_probs: Optional[np.ndarray] = None
79
+ prompt_embedding_table: Optional[np.ndarray] = None
80
+ prompt_vocab_size: Optional[np.ndarray] = None
81
+ embedding_bias_words: Optional[np.ndarray] = None
82
+ embedding_bias_weights: Optional[np.ndarray] = None
83
+ num_draft_tokens: Optional[np.ndarray] = None
84
+ use_draft_logits: Optional[np.ndarray] = None
85
+ stream: Optional[np.ndarray] = None
86
+ beam_width: Optional[np.ndarray] = None
87
+ return_context_logits: Optional[np.ndarray] = None
88
+ return_generation_logits: Optional[np.ndarray] = None
89
+ random_seed: Optional[np.ndarray] = None
90
+ presence_penalty: Optional[np.ndarray] = None
91
+ frequency_penalty: Optional[np.ndarray] = None
92
+
93
+ def validate(self):
94
+ _validate_non_empty(self.text_input, "text_input is required")
95
+ _validate_single_gt_0(self.max_tokens,
96
+ "max_tokens must be a single value > 0")
97
+
98
+ num_draft_tokens = _single_value(self.num_draft_tokens)
99
+ _single_value(self.return_generation_logits)
100
+ context_logits = _single_value(self.return_context_logits)
101
+
102
+ if num_draft_tokens:
103
+ _validate_that(
104
+ not self.stream.any(),
105
+ "streaming is not supported with speculative decoding")
106
+ _validate_that(
107
+ not context_logits,
108
+ "context logits are not supported with speculative decoding")
109
+
110
+
111
+ @dataclass
112
+ class DraftRequest:
113
+ draft_input_ids: Optional[np.ndarray] = None
114
+ draft_logits: Optional[np.ndarray] = None
115
+
116
+
117
+ @dataclass
118
+ class PreprocResponse:
119
+ input_ids: np.ndarray = np.array([])
120
+ decoder_input_ids: np.ndarray = None
121
+ input_lengths: np.ndarray = np.array([])
122
+ decoder_input_lengths: np.ndarray = None
123
+ bad_words_list: Optional[np.ndarray] = None
124
+ stop_words_list: Optional[np.ndarray] = None
125
+ embedding_bias: Optional[np.ndarray] = None
126
+ end_id: Optional[np.ndarray] = None
127
+ pad_id: Optional[np.ndarray] = None
128
+
129
+ @classmethod
130
+ def with_new_inputs(cls,
131
+ other,
132
+ input_ids: Optional[np.ndarray] = None,
133
+ input_lengths: Optional[np.ndarray] = None):
134
+ return cls(input_ids=(input_ids
135
+ if input_ids is not None else other.input_ids),
136
+ input_lengths=(input_lengths if input_lengths is not None
137
+ else other.input_lengths),
138
+ decoder_input_ids=other.decoder_input_ids,
139
+ decoder_input_lengths=other.decoder_input_lengths,
140
+ bad_words_list=other.bad_words_list,
141
+ stop_words_list=other.stop_words_list,
142
+ end_id=other.end_id,
143
+ pad_id=other.pad_id)
144
+
145
+
146
+ @dataclass
147
+ class MultimodalEncResponse:
148
+ prompt_embedding_table: Optional[torch.Tensor] = None
149
+ prompt_vocab_size: Optional[np.ndarray] = None
150
+
151
+
152
+ @dataclass
153
+ class GenerationResponse:
154
+ output_ids: np.ndarray = np.array([])
155
+ sequence_length: np.ndarray = np.array([])
156
+ cum_log_probs: Optional[np.ndarray] = None
157
+ output_log_probs: Optional[np.ndarray] = None
158
+ context_logits: Optional[np.ndarray] = None
159
+ generation_logits: Optional[np.ndarray] = None
160
+ batch_index: Optional[np.ndarray] = None
161
+
162
+
163
+ @dataclass
164
+ class Response:
165
+ text_output: np.ndarray = np.array([])
166
+ cum_log_probs: Optional[np.ndarray] = None
167
+ output_log_probs: Optional[np.ndarray] = None
168
+ context_logits: Optional[np.ndarray] = None
169
+ generation_logits: Optional[np.ndarray] = None
170
+ batch_index: Optional[np.ndarray] = None
171
+
172
+ def __eq__(self, o) -> bool:
173
+ """Just for testing"""
174
+ if not isinstance(o, Response):
175
+ return False
176
+ return (np.array_equal(self.text_output, o.text_output)
177
+ and np.array_equal(self.cum_log_probs, o.cum_log_probs)
178
+ and np.array_equal(self.output_log_probs, o.output_log_probs)
179
+ and np.array_equal(self.context_logits, o.context_logits)
180
+ and np.array_equal(self.generation_logits, o.generation_logits)
181
+ and np.array_equal(self.batch_index, o.batch_index))
182
+
183
+
184
+ class Decoder:
185
+
186
+ def __init__(self, streaming=False, accumulate=False):
187
+ self._streaming = streaming
188
+ self._accumulate = accumulate
189
+
190
+ self._accumulated_tokens = []
191
+
192
+ def decode(self,
193
+ request: Request,
194
+ speculative_decoding=False,
195
+ is_multimodal=False) -> Generator[Response, None, None]:
196
+
197
+ batch_size = request.text_input.shape[0]
198
+ self._accumulated_tokens = [None] * batch_size
199
+ preproc_response = self.preprocess(request)
200
+
201
+ multimodal_enc_response = None
202
+ if is_multimodal:
203
+ multimodal_enc_response = self._multimodal_enc_generate(request)
204
+
205
+ if speculative_decoding:
206
+ if batch_size > 1:
207
+ raise Exception(
208
+ "speculative decoding is not supported with batch size > 1"
209
+ )
210
+ for gen_response in self._spec_generate(preproc_response, request):
211
+ yield self.postprocess(gen_response, batch_size)
212
+ else:
213
+ if not self._streaming and batch_size == 1:
214
+ gen_response = self._generate_non_streaming(
215
+ preproc_response,
216
+ request,
217
+ multimodal_enc_response=multimodal_enc_response)
218
+ yield self.postprocess(gen_response, batch_size)
219
+ else:
220
+ for gen_response in self._generate(
221
+ preproc_response,
222
+ request,
223
+ multimodal_enc_response=multimodal_enc_response):
224
+ yield self.postprocess(gen_response, batch_size)
225
+
226
+ def encountered_stop_words(self, input_ids, stop_words_ids):
227
+ for stop_word_ids in stop_words_ids:
228
+ if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids):
229
+ return True
230
+ return False
231
+
232
+ def _spec_generate(
233
+ self, preproc: PreprocResponse,
234
+ request: Request) -> Generator[GenerationResponse, None, None]:
235
+
236
+ if preproc.input_ids.shape[0] > 1:
237
+ raise Exception(
238
+ "Speculative decoding does not support batch size > 1.")
239
+
240
+ prompt_input_ids: np.ndarray = preproc.input_ids[0]
241
+ input_ids: np.ndarray = prompt_input_ids
242
+ output_len: int = request.max_tokens[0][0]
243
+ last_input_ids: np.ndarray = None
244
+ draft_output_ids: np.ndarray = None
245
+ draft_logits: np.ndarray = None
246
+
247
+ target_response: GenerationResponse = None
248
+
249
+ cur_preproc = preproc
250
+
251
+ counter = 0
252
+ while True:
253
+ counter += 1
254
+ num_draft_tokens = min(
255
+ request.num_draft_tokens[0][0],
256
+ len(prompt_input_ids) + output_len - len(input_ids) - 1)
257
+
258
+ draft_request = None
259
+ if num_draft_tokens > 0:
260
+ draft_response: GenerationResponse = self._draft_generate_non_streaming(
261
+ cur_preproc, request, num_draft_tokens)
262
+ seq_len: int = draft_response.sequence_length[0][0]
263
+ # [1, beamWidth, outputLength] -> [outputLen]
264
+ draft_output_ids = draft_response.output_ids[0][0]
265
+ # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded]
266
+ if request.use_draft_logits is not None and request.use_draft_logits[
267
+ 0]:
268
+ if draft_response.generation_logits is not None:
269
+ draft_logits = draft_response.generation_logits[0][0]
270
+
271
+ input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
272
+ draft_request = DraftRequest(
273
+ draft_input_ids=np.expand_dims(input_draft_tokens, 0))
274
+ if request.use_draft_logits is not None and request.use_draft_logits[
275
+ 0]:
276
+ draft_request.draft_logits = np.expand_dims(
277
+ draft_logits[-len(input_draft_tokens):], 0)
278
+ else:
279
+ draft_request = DraftRequest()
280
+ target_response = self._generate_non_streaming(
281
+ cur_preproc, request, draft_request)
282
+ last_input_ids = input_ids
283
+ input_ids = target_response.output_ids[0][0]
284
+ cur_preproc = PreprocResponse.with_new_inputs(
285
+ cur_preproc, np.expand_dims(input_ids, 0),
286
+ np.array([[len(input_ids)]], dtype=np.int32))
287
+
288
+ # Evaluate criteria to stop generation loop.
289
+ # If we've hit or exceeded the max output length, should stop
290
+ length_stop = (len(input_ids) >=
291
+ len(prompt_input_ids) + output_len)
292
+ if length_stop:
293
+ break
294
+ # If the draft and target outputs are the same, stop; normally the target returns one more token.
295
+ # If they are the same length, they should differ at the last token
296
+ target_draft_equal = draft_output_ids is not None and np.array_equal(
297
+ draft_output_ids, input_ids)
298
+ if target_draft_equal:
299
+ break
300
+ # If the tokens no longer change, stop: early stopping has been reached
301
+ last_current_equal = np.array_equal(last_input_ids, input_ids)
302
+ if last_current_equal:
303
+ break
304
+ # Check whether any of the stop words were encountered
305
+ hit_stop_words = self.encountered_stop_words(
306
+ input_ids, preproc.stop_words_list[0])
307
+ if hit_stop_words:
308
+ break
309
+
310
+ yield target_response
311
+
312
+ def _draft_generate_non_streaming(
313
+ self, preproc: PreprocResponse, request: Request,
314
+ num_draft_tokens: int) -> GenerationResponse:
315
+ raise NotImplementedError()
316
+
317
+ def _multimodal_enc_generate(
318
+ self,
319
+ request: Request,
320
+ ) -> MultimodalEncResponse:
321
+ raise NotImplementedError()
322
+
323
+ def _generate(
324
+ self,
325
+ preproc: PreprocResponse,
326
+ request: Request,
327
+ draft_request: Optional[DraftRequest] = None,
328
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None,
329
+ ) -> Generator[GenerationResponse, None, None]:
330
+ raise NotImplementedError()
331
+
332
+ def _generate_non_streaming(
333
+ self,
334
+ preproc: PreprocResponse,
335
+ request: Request,
336
+ draft_request: Optional[DraftRequest] = None,
337
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None,
338
+ ) -> GenerationResponse:
339
+ raise NotImplementedError()
340
+
341
+ def postprocess(self, gen_response: GenerationResponse,
342
+ batch_size) -> Response:
343
+ if self._accumulate and self._streaming:
344
+ new_tokens: np.ndarray = gen_response.output_ids
345
+ if new_tokens.ndim != 3:
346
+ raise Exception("Expected output_ids tensor to have 3 dims.")
347
+ if new_tokens.shape[0] != 1:
348
+ raise Exception("Expected batch size of 1")
349
+ if new_tokens.shape[1] != 1:
350
+ raise Exception(
351
+ "Accumulation of tokens is only implemented for beam width = 1"
352
+ )
353
+
354
+ batch_index = gen_response.batch_index
355
+ if batch_index.ndim != 2:
356
+ raise Exception("Expected batch_index tensor to have 2 dims.")
357
+ if batch_index.shape[0] != 1:
358
+ raise Exception("Expected batch size of 1")
359
+ if batch_index.shape[1] != 1:
360
+ raise Exception("Expected only one batch_index")
361
+
362
+ batch_index = batch_index[0][0]
363
+
364
+ self._accumulated_tokens[batch_index] = new_tokens if (
365
+ self._accumulated_tokens[batch_index] is None
366
+ ) else np.concatenate(
367
+ (self._accumulated_tokens[batch_index], new_tokens), axis=2)
368
+ sequence_lengths = np.array(
369
+ [[self._accumulated_tokens[batch_index].shape[2]]],
370
+ dtype=np.int32)
371
+ return self._postprocess(self._accumulated_tokens[batch_index],
372
+ sequence_lengths, gen_response)
373
+ else:
374
+ return self._postprocess(gen_response.output_ids, None,
375
+ gen_response)
376
+
377
+ def _postprocess(self, tokens: np.ndarray,
378
+ sequence_lengths: Optional[np.ndarray],
379
+ gen_response: GenerationResponse) -> Response:
380
+ raise NotImplementedError()
381
+
382
+ def preprocess(self, request: Request) -> PreprocResponse:
383
+ raise NotImplementedError()
384
+
385
+ def reset_decoder(self):
386
+ self._accumulated_tokens = []
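A quick orientation note, not part of the committed file: the Request dataclass above expects batched numpy inputs of shape [batch_size, 1], and validate() raises RequestValidationError when a constraint is violated. A minimal sketch, assuming decode.py is importable as lib.decode (the same path triton_decoder.py uses below):

import numpy as np

from lib.decode import Request, RequestValidationError

# A well-formed single-request batch: non-empty text_input and max_tokens > 0.
ok = Request(
    text_input=np.array([["What is Triton Inference Server?"]], dtype=object),
    max_tokens=np.array([[64]], dtype=np.int32),
)
ok.validate()  # passes

# max_tokens of 0 trips _validate_single_gt_0.
bad = Request(
    text_input=np.array([["hello"]], dtype=object),
    max_tokens=np.array([[0]], dtype=np.int32),
)
try:
    bad.validate()
except RequestValidationError as err:
    print(err)  # "max_tokens must be a single value > 0"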
tensorrt_llm_bls/1/lib/triton_decoder.py ADDED
@@ -0,0 +1,523 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ from collections.abc import Callable
28
+ from typing import Dict, Optional
29
+
30
+ import numpy as np
31
+ import triton_python_backend_utils as pb_utils
32
+ from lib.decode import *
33
+ from torch.utils.dlpack import from_dlpack, to_dlpack
34
+ from typing_extensions import override
35
+
36
+
37
+ class TritonDecoder(Decoder):
38
+
39
+ def __init__(self,
40
+ streaming=False,
41
+ accumulate=False,
42
+ preproc_model_name="preprocessing",
43
+ postproc_model_name="postprocessing",
44
+ llm_model_name="tensorrt_llm",
45
+ draft_llm_model_name: Optional[str] = None,
46
+ multimodal_encoders_name: Optional[str] = None):
47
+ super().__init__(streaming=streaming, accumulate=accumulate)
48
+ self.preproc_model_name = preproc_model_name
49
+ self.postproc_model_name = postproc_model_name
50
+ self.llm_model_name = llm_model_name
51
+ self.draft_llm_model_name = draft_llm_model_name
52
+ self.multimodal_encoders_name = multimodal_encoders_name
53
+
54
+ self._preproc_outputs = [
55
+ "INPUT_ID",
56
+ "DECODER_INPUT_ID",
57
+ "REQUEST_INPUT_LEN",
58
+ "REQUEST_DECODER_INPUT_LEN",
59
+ "BAD_WORDS_IDS",
60
+ "STOP_WORDS_IDS",
61
+ "EMBEDDING_BIAS",
62
+ "OUT_PAD_ID",
63
+ "OUT_END_ID",
64
+ ]
65
+
66
+ self._multimodal_enc_outputs = [
67
+ "OUT_PROMPT_EMBEDDING_TABLE", "OUT_PROMPT_VOCAB_SIZE"
68
+ ]
69
+
70
+ self._llm_outputs = [
71
+ "output_ids", "sequence_length", "cum_log_probs",
72
+ "output_log_probs", "context_logits", "generation_logits",
73
+ "batch_index"
74
+ ]
75
+
76
+ self._postproc_outputs = [
77
+ "OUTPUT",
78
+ ]
79
+
80
+ self.input_names = [
81
+ "text_input",
82
+ "decoder_text_input",
83
+ "image_input",
84
+ "max_tokens",
85
+ "bad_words",
86
+ "stop_words",
87
+ "end_id",
88
+ "pad_id",
89
+ "top_k",
90
+ "top_p",
91
+ "temperature",
92
+ "length_penalty",
93
+ "repetition_penalty",
94
+ "min_length",
95
+ "presence_penalty",
96
+ "frequency_penalty",
97
+ "random_seed",
98
+ "return_log_probs",
99
+ "return_context_logits",
100
+ "return_generation_logits",
101
+ "beam_width",
102
+ "stream",
103
+ "prompt_embedding_table",
104
+ "prompt_vocab_size",
105
+ "embedding_bias_words",
106
+ "embedding_bias_weights",
107
+ "num_draft_tokens",
108
+ "use_draft_logits",
109
+ ]
110
+
111
+ self.__undo_reshape_whitelist = {
112
+ "max_tokens",
113
+ "end_id",
114
+ "pad_id",
115
+ "top_k",
116
+ "top_p",
117
+ "temperature",
118
+ "length_penalty",
119
+ "repetition_penalty",
120
+ "min_length",
121
+ "presence_penalty",
122
+ "frequency_penalty",
123
+ "random_seed",
124
+ "return_log_probs",
125
+ "return_context_logits",
126
+ "return_generation_logits",
127
+ "beam_width",
128
+ "stream",
129
+ "prompt_vocab_size",
130
+ "num_draft_tokens",
131
+ "use_draft_logits",
132
+ }
133
+
134
+ def _exec_triton_request(self, request):
135
+ responses = request.exec(decoupled=True)
136
+ for r in responses:
137
+ if r.has_error():
138
+ raise pb_utils.TritonModelException(r.error().message())
139
+ yield r
140
+
141
+ def _exec_triton_request_single(self, request):
142
+ responses = request.exec(decoupled=False)
143
+ if responses.has_error():
144
+ raise pb_utils.TritonModelException(responses.error().message())
145
+ return responses
146
+
147
+ def create_triton_response(self, response: Response):
148
+ name_map = {
149
+ "text_output": "text_output",
150
+ "cum_log_probs": "cum_log_probs",
151
+ "output_log_probs": "output_log_probs",
152
+ "context_logits": "context_logits",
153
+ "generation_logits": "generation_logits",
154
+ "batch_index": "batch_index"
155
+ }
156
+ tensors = self.create_triton_tensors(response, name_map)
157
+ return pb_utils.InferenceResponse(output_tensors=tensors)
158
+
159
+ def convert_triton_request(self, triton_request) -> Request:
160
+ request = Request()
161
+ for triton_name in self.input_names:
162
+ tensor = pb_utils.get_input_tensor_by_name(triton_request,
163
+ triton_name)
164
+ target_name = triton_name
165
+ if tensor is None:
166
+ continue
167
+ if not hasattr(request, target_name):
168
+ raise AttributeError(
169
+ f"Request has no attribute '{target_name}'")
170
+ setattr(request, target_name, tensor.as_numpy())
171
+ return request
172
+
173
+ def convert_triton_response(self,
174
+ triton_response,
175
+ response_factory: Callable,
176
+ name_map=None):
177
+ response = response_factory()
178
+ for tensor in triton_response.output_tensors():
179
+ if tensor is None:
180
+ continue
181
+ triton_name = tensor.name()
182
+ if tensor.is_cpu():
183
+ value = tensor.as_numpy()
184
+ else:
185
+ # If the tensor is in GPU memory, convert it to a torch.Tensor via DLPack
186
+ value = from_dlpack(tensor.to_dlpack())
187
+ target_name = triton_name
188
+ if name_map and triton_name in name_map:
189
+ target_name = name_map[triton_name]
190
+ if name_map and triton_name not in name_map:
191
+ continue
192
+ if target_name is None:
193
+ # explicitly ignore this triton input
194
+ continue
195
+ if not hasattr(response, target_name):
196
+ raise AttributeError(
197
+ f"response object has no attribute '{target_name}'")
198
+ setattr(response, target_name, value)
199
+ return response
200
+
201
+ def __undo_reshape(self, x, name):
202
+ if name in self.__undo_reshape_whitelist and len(x.shape) == 1:
203
+ # handle reshapes
204
+ return np.expand_dims(x, 0)
205
+ else:
206
+ return x
207
+
208
+ def create_triton_tensors(self, obj, name_map: dict):
209
+ tensors = []
210
+ for name, triton_name in name_map.items():
211
+ if triton_name is None:
212
+ continue
213
+ value = getattr(obj, name)
214
+ if value is None:
215
+ continue
216
+ if isinstance(value, np.ndarray):
217
+ t = pb_utils.Tensor(triton_name,
218
+ self.__undo_reshape(value, name))
219
+ elif isinstance(value, torch.Tensor):
220
+ t = pb_utils.Tensor.from_dlpack(
221
+ triton_name, to_dlpack(self.__undo_reshape(value, name)))
222
+ tensors.append(t)
223
+ return tensors
224
+
225
+ @override
226
+ def preprocess(self, request: Request) -> PreprocResponse:
227
+ input_tensors = self._get_preproc_tensors(request)
228
+ triton_req = pb_utils.InferenceRequest(
229
+ model_name=self.preproc_model_name,
230
+ inputs=input_tensors,
231
+ requested_output_names=self._preproc_outputs)
232
+ triton_output = self._exec_triton_request_single(triton_req)
233
+ return self._get_preproc_response(triton_output)
234
+
235
+ def _get_preproc_tensors(self, request: Request):
236
+ name_map = {
237
+ "text_input": "QUERY",
238
+ "decoder_text_input": "DECODER_QUERY",
239
+ "max_tokens": "REQUEST_OUTPUT_LEN",
240
+ "bad_words": "BAD_WORDS_DICT",
241
+ "stop_words": "STOP_WORDS_DICT",
242
+ "embedding_bias_words": "EMBEDDING_BIAS_WORDS",
243
+ "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS",
244
+ "pad_id": "PAD_ID",
245
+ "end_id": "END_ID",
246
+ }
247
+ return self.create_triton_tensors(request, name_map)
248
+
249
+ def _get_preproc_response(self, triton_output):
250
+ name_map = {
251
+ "INPUT_ID": "input_ids",
252
+ "DECODER_INPUT_ID": "decoder_input_ids",
253
+ "REQUEST_INPUT_LEN": "input_lengths",
254
+ "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
255
+ "BAD_WORDS_IDS": "bad_words_list",
256
+ "STOP_WORDS_IDS": "stop_words_list",
257
+ "EMBEDDING_BIAS": "embedding_bias",
258
+ "OUT_PAD_ID": "pad_id",
259
+ "OUT_END_ID": "end_id",
260
+ }
261
+ return self.convert_triton_response(triton_output, PreprocResponse,
262
+ name_map)
263
+
264
+ @override
265
+ def _multimodal_enc_generate(self,
266
+ request: Request) -> MultimodalEncResponse:
267
+ input_tensors = self._get_multimodal_enc_tensors(request)
268
+ triton_req = pb_utils.InferenceRequest(
269
+ model_name=self.multimodal_encoders_name,
270
+ inputs=input_tensors,
271
+ requested_output_names=self._multimodal_enc_outputs)
272
+ triton_output = self._exec_triton_request_single(triton_req)
273
+ return self._get_multimodal_enc_response(triton_output)
274
+
275
+ def _get_multimodal_enc_tensors(self, preproc: PreprocResponse):
276
+ name_map = {
277
+ "image_input": "IMAGE",
278
+ }
279
+ return self.create_triton_tensors(preproc, name_map)
280
+
281
+ def _get_multimodal_enc_response(self, triton_output):
282
+ name_map = {
283
+ "OUT_PROMPT_EMBEDDING_TABLE": "prompt_embedding_table",
284
+ "OUT_PROMPT_VOCAB_SIZE": "prompt_vocab_size",
285
+ }
286
+ return self.convert_triton_response(triton_output,
287
+ MultimodalEncResponse, name_map)
288
+
289
+ @override
290
+ def _draft_generate_non_streaming(
291
+ self, preproc: PreprocResponse, request: Request,
292
+ num_draft_tokens: int) -> GenerationResponse:
293
+ input_tensors = self._get_llm_tensors(preproc, request,
294
+ num_draft_tokens, None, True)
295
+ triton_req = pb_utils.InferenceRequest(
296
+ model_name=self.draft_llm_model_name,
297
+ inputs=input_tensors,
298
+ requested_output_names=self._llm_outputs)
299
+ triton_response = self._exec_triton_request_single(triton_req)
300
+ llm_response = self._get_llm_response(triton_response)
301
+ return llm_response
302
+
303
+ @override
304
+ def _generate(
305
+ self,
306
+ preproc: PreprocResponse,
307
+ request: Request,
308
+ draft_request: Optional[DraftRequest] = None,
309
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None
310
+ ) -> Generator[GenerationResponse, None, None]:
311
+ input_tensors = self._get_llm_tensors(
312
+ preproc,
313
+ request,
314
+ None,
315
+ draft_request,
316
+ multimodal_enc_response=multimodal_enc_response)
317
+ triton_req = pb_utils.InferenceRequest(
318
+ model_name=self.llm_model_name,
319
+ inputs=input_tensors,
320
+ requested_output_names=self._llm_outputs)
321
+ for r in self._exec_triton_request(triton_req):
322
+ yield self._get_llm_response(r)
323
+
324
+ @override
325
+ def _generate_non_streaming(
326
+ self,
327
+ preproc: PreprocResponse,
328
+ request: Request,
329
+ draft_request: Optional[DraftRequest] = None,
330
+ multimodal_enc_response: Optional[MultimodalEncResponse] = None
331
+ ) -> GenerationResponse:
332
+ input_tensors = self._get_llm_tensors(
333
+ preproc,
334
+ request,
335
+ None,
336
+ draft_request,
337
+ multimodal_enc_response=multimodal_enc_response)
338
+ triton_req = pb_utils.InferenceRequest(
339
+ model_name=self.llm_model_name,
340
+ inputs=input_tensors,
341
+ requested_output_names=self._llm_outputs)
342
+ r = self._exec_triton_request_single(triton_req)
343
+ return self._get_llm_response(r)
344
+
345
+ def _get_llm_tensors(
346
+ self,
347
+ preproc: PreprocResponse,
348
+ request: Request,
349
+ num_output_tokens: Optional[int] = None,
350
+ draft_request: Optional[DraftRequest] = None,
351
+ is_draft_model_request: bool = False,
352
+ multimodal_enc_response: MultimodalEncResponse = None):
353
+ tensors = []
354
+ tensors.extend(self._get_tensors_from_preproc(preproc))
355
+ if multimodal_enc_response is not None:
356
+ tensors.extend(
357
+ self._get_tensors_from_multimodal_enc(multimodal_enc_response))
358
+ tensors.extend(
359
+ self._get_llm_tensors_from_request(request, num_output_tokens,
360
+ draft_request,
361
+ is_draft_model_request))
362
+ return tensors
363
+
364
+ def _get_tensors_from_preproc(self, preproc: PreprocResponse):
365
+ name_map = {
366
+ "input_ids": "input_ids",
367
+ "decoder_input_ids": "decoder_input_ids",
368
+ "input_lengths": "input_lengths",
369
+ "bad_words_list": "bad_words_list",
370
+ "stop_words_list": "stop_words_list",
371
+ "embedding_bias": "embedding_bias",
372
+ "pad_id": "pad_id",
373
+ "end_id": "end_id",
374
+ }
375
+ return self.create_triton_tensors(preproc, name_map)
376
+
377
+ def _get_tensors_from_multimodal_enc(
378
+ self, multimodal_enc_response: MultimodalEncResponse):
379
+ name_map = {
380
+ "prompt_embedding_table": "prompt_embedding_table",
381
+ "prompt_vocab_size": "prompt_vocab_size",
382
+ }
383
+ return self.create_triton_tensors(multimodal_enc_response, name_map)
384
+
385
+ def _get_llm_tensors_from_request(
386
+ self,
387
+ request: Request,
388
+ num_output_tokens: Optional[int] = None,
389
+ draft_request: Optional[DraftRequest] = None,
390
+ is_draft_model_request: bool = False):
391
+ name_map: Dict[str, Optional[str]] = {
392
+ "beam_width": "beam_width",
393
+ "top_k": "runtime_top_k",
394
+ "top_p": "runtime_top_p",
395
+ "temperature": "temperature",
396
+ "length_penalty": "len_penalty",
397
+ "repetition_penalty": "repetition_penalty",
398
+ "min_length": "min_length",
399
+ "presence_penalty": "presence_penalty",
400
+ "frequency_penalty": "frequency_penalty",
401
+ "random_seed": "random_seed",
402
+ "return_log_probs": "return_log_probs",
403
+ "stream": "streaming",
404
+ "prompt_embedding_table": "prompt_embedding_table",
405
+ "prompt_vocab_size": "prompt_vocab_size",
406
+ }
407
+ batch_size = request.text_input.shape[0]
408
+ tensors = self.create_triton_tensors(request, name_map)
409
+ out_len_tensor = None
410
+ if request.max_tokens is not None:
411
+ out_len_tensor = request.max_tokens
412
+
413
+ out_len = None
414
+ if num_output_tokens is not None:
415
+ out_len = num_output_tokens
416
+ elif draft_request:
417
+ out_len = len(
418
+ draft_request.draft_input_ids[0]
419
+ ) + 1 if draft_request.draft_input_ids is not None else 1
420
+
421
+ if out_len is not None:
422
+ out_len_tensor = [[out_len]] * batch_size
423
+
424
+ if out_len_tensor is None:
425
+ raise Exception("Could not determine request_output_len")
426
+ else:
427
+ tensors.append(
428
+ pb_utils.Tensor("request_output_len",
429
+ np.array(out_len_tensor, dtype=np.int32)))
430
+
431
+ if draft_request:
432
+ if draft_request.draft_input_ids is not None:
433
+ tensors.append(
434
+ pb_utils.Tensor("draft_input_ids",
435
+ draft_request.draft_input_ids))
436
+ if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[
437
+ 0]:
438
+ tensors.append(
439
+ pb_utils.Tensor("draft_logits",
440
+ draft_request.draft_logits))
441
+
442
+ return_context_logits_data = [False]
443
+ return_generation_logits_data = [False]
444
+ if draft_request is None:
445
+ if is_draft_model_request:
446
+ return_generation_logits_data = request.use_draft_logits if request.use_draft_logits is not None else [
447
+ False
448
+ ]
449
+ else:
450
+ return_context_logits_data = request.return_context_logits if request.return_context_logits is not None else [
451
+ False
452
+ ]
453
+ return_generation_logits_data = request.return_generation_logits if request.return_generation_logits is not None else [
454
+ False
455
+ ]
456
+ return_context_logits = np.array([return_context_logits_data] *
457
+ batch_size,
458
+ dtype=bool)
459
+ return_generation_logits = np.array([return_generation_logits_data] *
460
+ batch_size,
461
+ dtype=bool)
462
+
463
+ assert len(return_context_logits.shape) == 2
464
+ assert len(return_generation_logits.shape) == 2
465
+
466
+ tensors.append(
467
+ pb_utils.Tensor("return_context_logits", return_context_logits))
468
+ tensors.append(
469
+ pb_utils.Tensor("return_generation_logits",
470
+ return_generation_logits))
471
+ return tensors
472
+
473
+ def _get_llm_response(self, triton_output):
474
+ name_map = {
475
+ "output_ids": "output_ids",
476
+ "sequence_length": "sequence_length",
477
+ "cum_log_probs": "cum_log_probs",
478
+ "output_log_probs": "output_log_probs",
479
+ "context_logits": "context_logits",
480
+ "generation_logits": "generation_logits",
481
+ "batch_index": "batch_index",
482
+ }
483
+ return self.convert_triton_response(triton_output, GenerationResponse,
484
+ name_map)
485
+
486
+ def _postprocess(self, tokens: np.ndarray,
487
+ sequence_lengths: Optional[np.ndarray],
488
+ gen_response: GenerationResponse) -> Response:
489
+ input_tensors = self._get_postproc_tensors(tokens, sequence_lengths,
490
+ gen_response)
491
+ triton_req = pb_utils.InferenceRequest(
492
+ model_name=self.postproc_model_name,
493
+ inputs=input_tensors,
494
+ requested_output_names=self._postproc_outputs)
495
+ r = self._exec_triton_request_single(triton_req)
496
+ response = self._get_response(r, gen_response)
497
+ return response
498
+
499
+ def _get_postproc_tensors(self, tokens: np.ndarray,
500
+ sequence_lengths: Optional[np.ndarray],
501
+ gen_response: GenerationResponse):
502
+ tensors = [
503
+ pb_utils.Tensor("TOKENS_BATCH", tokens),
504
+ pb_utils.Tensor(
505
+ "SEQUENCE_LENGTH", sequence_lengths
506
+ if sequence_lengths is not None else gen_response.sequence_length)
507
+ ]
508
+ return tensors
509
+
510
+ def _get_response(self, triton_output, gen_res: GenerationResponse):
511
+ tensors = triton_output.output_tensors()
512
+ t_map = {}
513
+ for named_t in tensors:
514
+ name = named_t.name()
515
+ t = named_t.as_numpy()
516
+ t_map[name] = t
517
+ response = Response(text_output=t_map["OUTPUT"],
518
+ cum_log_probs=gen_res.cum_log_probs,
519
+ output_log_probs=gen_res.output_log_probs,
520
+ context_logits=gen_res.context_logits,
521
+ generation_logits=gen_res.generation_logits,
522
+ batch_index=gen_res.batch_index)
523
+ return response
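One detail worth a note here, not part of the committed file: __undo_reshape above compensates for inputs declared with reshape: { shape: [ ] } in the BLS config.pbtxt further down, which arrive as 1-D arrays and need their leading batch axis restored before being forwarded to the tensorrt_llm model. A plain-numpy illustration of that transformation, assuming a batch size of 1:

import numpy as np

# "return_log_probs" is declared with reshape: { shape: [ ] }, so for batch
# size 1 it reaches the BLS model with shape (1,).
return_log_probs = np.array([True])

# __undo_reshape re-adds the leading axis for names in the whitelist,
# yielding the [batch_size, 1] layout the tensorrt_llm model expects.
restored = np.expand_dims(return_log_probs, 0)
assert restored.shape == (1, 1)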
tensorrt_llm_bls/1/model.py ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+ import traceback
29
+
30
+ import triton_python_backend_utils as pb_utils
31
+ from lib.triton_decoder import TritonDecoder
32
+
33
+
34
+ def get_valid_param_value(param, default_value=''):
35
+ value = param.get('string_value', '')
36
+ return default_value if value.startswith('${') or value == '' else value
37
+
38
+
39
+ class TritonPythonModel:
40
+
41
+ def initialize(self, args):
42
+
43
+ # Parse model configs
44
+ model_config = json.loads(args['model_config'])
45
+
46
+ params = model_config['parameters']
47
+
48
+ accumulate_tokens_str = get_valid_param_value(
49
+ params.get('accumulate_tokens', {}))
50
+ self.accumulate_tokens = accumulate_tokens_str.lower() in [
51
+ 'true', 'yes', '1', 't'
52
+ ]
53
+
54
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
55
+ model_config)
56
+
57
+ self.logger = pb_utils.Logger
58
+
59
+ default_tensorrt_llm_model_name = 'tensorrt_llm'
60
+ self.llm_model_name = get_valid_param_value(
61
+ params.get('tensorrt_llm_model_name', {}),
62
+ default_tensorrt_llm_model_name)
63
+
64
+ self.draft_llm_model_name = get_valid_param_value(
65
+ params.get('tensorrt_llm_draft_model_name', {}), None)
66
+
67
+ self.multimodal_encoders_name = get_valid_param_value(
68
+ params.get('multimodal_encoders_name', {}), None)
69
+
70
+ self.decoder = TritonDecoder(
71
+ streaming=self.decoupled,
72
+ accumulate=self.accumulate_tokens,
73
+ preproc_model_name="preprocessing",
74
+ postproc_model_name="postprocessing",
75
+ llm_model_name=self.llm_model_name,
76
+ draft_llm_model_name=self.draft_llm_model_name,
77
+ multimodal_encoders_name=self.multimodal_encoders_name)
78
+
79
+ def execute(self, requests):
80
+
81
+ responses = []
82
+
83
+ for request in requests:
84
+ if self.decoupled:
85
+ response_sender = request.get_response_sender()
86
+ try:
87
+
88
+ req = self.decoder.convert_triton_request(request)
89
+ req.validate()
90
+ speculative_decode = (req.num_draft_tokens is not None
91
+ and req.num_draft_tokens[0][0] > 0)
92
+ if speculative_decode and (self.draft_llm_model_name is None
93
+ or self.draft_llm_model_name == ""):
94
+ raise Exception(
95
+ "cannot perform speculative decoding without draft model"
96
+ )
97
+ is_multimodal = req.image_input is not None
98
+
99
+ if speculative_decode and is_multimodal:
100
+ raise Exception(
101
+ "Multimodal and speculative decoding is not currently supported"
102
+ )
103
+ res_gen = self.decoder.decode(
104
+ req,
105
+ speculative_decoding=speculative_decode,
106
+ is_multimodal=is_multimodal)
107
+
108
+ for res in res_gen:
109
+ triton_response = self.decoder.create_triton_response(res)
110
+ if self.decoupled:
111
+ response_sender.send(triton_response)
112
+ else:
113
+ responses.append(triton_response)
114
+
115
+ if self.decoupled:
116
+ response_sender.send(
117
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
118
+
119
+ except Exception:
120
+ self.logger.log_error(traceback.format_exc())
121
+ # If an error is encountered, send a response containing the error message
122
+ error_response = pb_utils.InferenceResponse(
123
+ output_tensors=[],
124
+ error=pb_utils.TritonError(traceback.format_exc()))
125
+
126
+ if self.decoupled:
127
+ response_sender.send(error_response)
128
+ response_sender.send(
129
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
130
+ else:
131
+ responses.append(error_response)
132
+
133
+ self.decoder.reset_decoder()
134
+ if self.decoupled:
135
+ return None
136
+ else:
137
+ assert len(responses) == len(requests)
138
+ return responses
139
+
140
+ def finalize(self):
141
+ """`finalize` is called only once when the model is being unloaded.
142
+ Implementing `finalize` function is optional. This function allows
143
+ the model to perform any necessary cleanup before exit.
144
+ """
145
+ print('Cleaning up...')
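For context, and not part of the committed model.py: because the config.pbtxt below marks the model as decoupled, clients typically reach it through Triton's generate endpoints. A client-side sketch, assuming the HTTP generate extension is enabled, the server listens on the default localhost:8000, and the requests package is installed; the field names mirror the inputs declared in the config below:

import json

import requests

payload = {
    "text_input": "What is machine learning?",
    "max_tokens": 64,
    "stream": False,
}
# Non-streaming call; for token-by-token output the generate_stream endpoint
# (with "stream": True) would be used instead.
resp = requests.post(
    "http://localhost:8000/v2/models/tensorrt_llm_bls/generate",
    data=json.dumps(payload),
)
print(resp.json()["text_output"])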
tensorrt_llm_bls/config.pbtxt ADDED
@@ -0,0 +1,270 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm_bls"
28
+ backend: "python"
29
+ max_batch_size: 32
30
+
31
+ model_transaction_policy {
32
+ decoupled: True
33
+ }
34
+
35
+ input [
36
+ {
37
+ name: "text_input"
38
+ data_type: TYPE_STRING
39
+ dims: [ 1 ]
40
+ },
41
+ {
42
+ name: "decoder_text_input"
43
+ data_type: TYPE_STRING
44
+ dims: [ 1 ]
45
+ optional: true
46
+ },
47
+ {
48
+ name: "image_input"
49
+ data_type: TYPE_FP16
50
+ dims: [ 3, -1, -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "max_tokens"
55
+ data_type: TYPE_INT32
56
+ dims: [ 1 ]
57
+ },
58
+ {
59
+ name: "bad_words"
60
+ data_type: TYPE_STRING
61
+ dims: [ -1 ]
62
+ optional: true
63
+ },
64
+ {
65
+ name: "stop_words"
66
+ data_type: TYPE_STRING
67
+ dims: [ -1 ]
68
+ optional: true
69
+ },
70
+ {
71
+ name: "end_id"
72
+ data_type: TYPE_INT32
73
+ dims: [ 1 ]
74
+ optional: true
75
+ },
76
+ {
77
+ name: "pad_id"
78
+ data_type: TYPE_INT32
79
+ dims: [ 1 ]
80
+ optional: true
81
+ },
82
+ {
83
+ name: "top_k"
84
+ data_type: TYPE_INT32
85
+ dims: [ 1 ]
86
+ optional: true
87
+ },
88
+ {
89
+ name: "top_p"
90
+ data_type: TYPE_FP32
91
+ dims: [ 1 ]
92
+ optional: true
93
+ },
94
+ {
95
+ name: "temperature"
96
+ data_type: TYPE_FP32
97
+ dims: [ 1 ]
98
+ optional: true
99
+ },
100
+ {
101
+ name: "length_penalty"
102
+ data_type: TYPE_FP32
103
+ dims: [ 1 ]
104
+ optional: true
105
+ },
106
+ {
107
+ name: "repetition_penalty"
108
+ data_type: TYPE_FP32
109
+ dims: [ 1 ]
110
+ optional: true
111
+ },
112
+ {
113
+ name: "min_length"
114
+ data_type: TYPE_INT32
115
+ dims: [ 1 ]
116
+ optional: true
117
+ },
118
+ {
119
+ name: "presence_penalty"
120
+ data_type: TYPE_FP32
121
+ dims: [ 1 ]
122
+ optional: true
123
+ },
124
+ {
125
+ name: "frequency_penalty"
126
+ data_type: TYPE_FP32
127
+ dims: [ 1 ]
128
+ optional: true
129
+ },
130
+ {
131
+ name: "random_seed"
132
+ data_type: TYPE_UINT64
133
+ dims: [ 1 ]
134
+ optional: true
135
+ },
136
+ {
137
+ name: "return_log_probs"
138
+ data_type: TYPE_BOOL
139
+ dims: [ 1 ]
140
+ reshape: { shape: [ ] }
141
+ optional: true
142
+ },
143
+ {
144
+ name: "return_context_logits"
145
+ data_type: TYPE_BOOL
146
+ dims: [ 1 ]
147
+ reshape: { shape: [ ] }
148
+ optional: true
149
+ },
150
+ {
151
+ name: "return_generation_logits"
152
+ data_type: TYPE_BOOL
153
+ dims: [ 1 ]
154
+ reshape: { shape: [ ] }
155
+ optional: true
156
+ },
157
+ {
158
+ name: "beam_width"
159
+ data_type: TYPE_INT32
160
+ dims: [ 1 ]
161
+ optional: true
162
+ },
163
+ {
164
+ name: "stream"
165
+ data_type: TYPE_BOOL
166
+ dims: [ 1 ]
167
+ optional: true
168
+ },
169
+ {
170
+ name: "prompt_embedding_table"
171
+ data_type: TYPE_FP16
172
+ dims: [ -1, -1 ]
173
+ optional: true
174
+ },
175
+ {
176
+ name: "prompt_vocab_size"
177
+ data_type: TYPE_INT32
178
+ dims: [ 1 ]
179
+ optional: true
180
+ },
181
+ {
182
+ name: "embedding_bias_words"
183
+ data_type: TYPE_STRING
184
+ dims: [ -1 ]
185
+ optional: true
186
+ },
187
+ {
188
+ name: "embedding_bias_weights"
189
+ data_type: TYPE_FP32
190
+ dims: [ -1 ]
191
+ optional: true
192
+ },
193
+ {
194
+ name: "num_draft_tokens",
195
+ data_type: TYPE_INT32,
196
+ dims: [ 1 ]
197
+ optional: true
198
+ },
199
+ {
200
+ name: "use_draft_logits",
201
+ data_type: TYPE_BOOL,
202
+ dims: [ 1 ]
203
+ reshape: { shape: [ ] }
204
+ optional: true
205
+ }
206
+ ]
207
+ output [
208
+ {
209
+ name: "text_output"
210
+ data_type: TYPE_STRING
211
+ dims: [ -1 ]
212
+ },
213
+ {
214
+ name: "cum_log_probs"
215
+ data_type: TYPE_FP32
216
+ dims: [ -1 ]
217
+ },
218
+ {
219
+ name: "output_log_probs"
220
+ data_type: TYPE_FP32
221
+ dims: [ -1, -1 ]
222
+ },
223
+ {
224
+ name: "context_logits"
225
+ data_type: TYPE_FP32
226
+ dims: [ -1, -1 ]
227
+ },
228
+ {
229
+ name: "generation_logits"
230
+ data_type: TYPE_FP32
231
+ dims: [ -1, -1, -1 ]
232
+ },
233
+ {
234
+ name: "batch_index"
235
+ data_type: TYPE_INT32
236
+ dims: [ 1 ]
237
+ }
238
+ ]
239
+
240
+ parameters: {
241
+ key: "accumulate_tokens"
242
+ value: {
243
+ string_value: "${accumulate_tokens}"
244
+ }
245
+ }
246
+ parameters: {
247
+ key: "tensorrt_llm_model_name"
248
+ value: {
249
+ string_value: "tensorrt_llm"
250
+ }
251
+ }
252
+ parameters: {
253
+ key: "tensorrt_llm_draft_model_name"
254
+ value: {
255
+ string_value: "${tensorrt_llm_draft_model_name}"
256
+ }
257
+ }
258
+ parameters: {
259
+ key: "multimodal_encoders_name"
260
+ value: {
261
+ string_value: "${multimodal_encoders_name}"
262
+ }
263
+ }
264
+
265
+ instance_group [
266
+ {
267
+ count: 1
268
+ kind : KIND_CPU
269
+ }
270
+ ]
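A closing note that is not part of the committed config: the ${...} placeholders above (accumulate_tokens, tensorrt_llm_draft_model_name, multimodal_encoders_name) are treated as unset by get_valid_param_value in model.py until they are substituted at deployment time, so speculative decoding and multimodal input stay disabled by default. A hypothetical request body for speculative decoding, assuming a draft model name has been filled in; field names mirror the inputs declared above:

# Illustrative payload only.
payload = {
    "text_input": "Summarize the plot of Hamlet.",
    "max_tokens": 128,
    "num_draft_tokens": 5,      # > 0 routes the request through the draft/target loop
    "use_draft_logits": False,  # compare draft and target outputs by token ids only
    "stream": False,            # Request.validate() rejects streaming with speculative decoding
}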