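# Model configuration (protobuf text format) for the "tensorrt_llm_bls" model.
# NOTE(review): the original header lines held only a stray "|" each; whatever
# text they carried (possibly a license block) was lost - restore it from the
# upstream copy of this file if one is required.
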
# Model identity: served under the name "tensorrt_llm_bls" and executed by
# the Python backend.
name: "tensorrt_llm_bls"
backend: "python"

# Batching is enabled (max_batch_size > 0). Per Triton's model-configuration
# documentation, the dims of the tensors declared in this file then exclude
# the leading batch dimension.
max_batch_size: 32

# Decoupled transaction policy. Per Triton's model-configuration
# documentation, a decoupled model may send zero, one, or many responses per
# request instead of exactly one.
model_transaction_policy {
  decoupled: true
}

# Request inputs. Only "text_input" and "max_tokens" are required; every
# other tensor below is marked `optional: true`.
input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
  },
  {
    name: "decoder_text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
    optional: true
  },
  {
    # Three fixed leading channels followed by two variable-size dimensions.
    name: "image_input"
    data_type: TYPE_FP16
    dims: [ 3, -1, -1 ]
    optional: true
  },
  {
    name: "max_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "bad_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "stop_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "frequency_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    # The boolean flags that carry `reshape: { shape: [ ] }` are declared
    # with dims [ 1 ] and reshaped to an empty shape for the backend.
    name: "return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_context_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_generation_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    # Unlike the other boolean flags, "stream" has no reshape entry.
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "prompt_embedding_table"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "prompt_vocab_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    # NOTE(review): presumably paired element-wise with
    # "embedding_bias_weights" - confirm against the model code.
    name: "embedding_bias_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "embedding_bias_weights"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "num_draft_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "use_draft_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  }
]

# Response outputs. All are variable-length (-1 dims) except "batch_index",
# which is a single INT32 value.
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "generation_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1, -1 ]
  },
  {
    name: "batch_index"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
]

# Model parameters, supplied as string key/value pairs.
# NOTE(review): values written as ${...} look like unsubstituted template
# placeholders - confirm they are filled in before this file is loaded.
parameters: {
  key: "accumulate_tokens"
  value: {
    string_value: "${accumulate_tokens}"
  }
}
parameters: {
  key: "tensorrt_llm_model_name"
  value: {
    string_value: "tensorrt_llm"
  }
}
parameters: {
  key: "tensorrt_llm_draft_model_name"
  value: {
    string_value: "${tensorrt_llm_draft_model_name}"
  }
}
parameters: {
  key: "multimodal_encoders_name"
  value: {
    string_value: "${multimodal_encoders_name}"
  }
}

# One instance of the model, of kind KIND_CPU.
instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]