diff --git a/model_repository/postprocessing/1/__pycache__/model.cpython-310.pyc b/model_repository/postprocessing/1/__pycache__/model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa0ac1382a7864add3a9bb04e6b328fa6995f67d
Binary files /dev/null and b/model_repository/postprocessing/1/__pycache__/model.cpython-310.pyc differ
diff --git a/model_repository/postprocessing/1/model.py b/model_repository/postprocessing/1/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..20de97595195da5dedc044a31c6086c1f49892da
--- /dev/null
+++ b/model_repository/postprocessing/1/model.py
@@ -0,0 +1,129 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+from pathlib import Path
+
+import numpy as np
+import triton_python_backend_utils as pb_utils
+
+# This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served
+# by triton inference server, it has to be converted first by running
+# `python lmdeploy/serve/turbomind/deploy.py`. Then
+# `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py`
+from .tokenizer.tokenizer import Tokenizer
+
+
+class TritonPythonModel:
+    """Triton Python-backend model that decodes token ids into text.
+
+    Every Python model that is created must have "TritonPythonModel" as the
+    class name.
+    """
+
+    def initialize(self, args):
+        """Load the output dtype and the tokenizer.
+
+        `initialize` is called only once when the model is being loaded.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values
+          are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance
+            device ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+        # Parse model configs
+        self.model_config = model_config = json.loads(args['model_config'])
+
+        # Convert the Triton type of the 'OUTPUT' tensor to a numpy type
+        output_config = pb_utils.get_output_config_by_name(
+            model_config, 'OUTPUT')
+        self.output_dtype = pb_utils.triton_string_to_numpy(
+            output_config['data_type'])
+
+        # `tokenizer_path` is resolved relative to the folder of this file.
+        cur_folder = Path(__file__).parent
+        tokenizer_path = model_config['parameters']['tokenizer_path'][
+            'string_value']
+        self.tokenizer = Tokenizer(osp.join(cur_folder, tokenizer_path))
+
+    def execute(self, requests):
+        """Decode the token ids carried by every request.
+
+        Each request must provide the input tensors 'TOKENS_BATCH' and
+        'sequence_length'. One pb_utils.InferenceResponse is created per
+        request; when handling a request fails, its response carries a
+        pb_utils.TritonError instead of aborting the whole batch.
+
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list is
+          the same as `requests`
+        """
+        responses = []
+
+        for request in requests:
+            try:
+                # Get input tensors
+                tokens_batch = pb_utils.get_input_tensor_by_name(
+                    request, 'TOKENS_BATCH').as_numpy()
+                sequence_length = pb_utils.get_input_tensor_by_name(
+                    request, 'sequence_length').as_numpy()
+
+                # Postprocessing output data.
+                outputs = self._postprocessing(tokens_batch.tolist(),
+                                               sequence_length)
+
+                # pb_utils.Tensor objects are needed to create a
+                # pb_utils.InferenceResponse.
+                output_tensor = pb_utils.Tensor(
+                    'OUTPUT',
+                    np.array(outputs).astype(self.output_dtype))
+                inference_response = pb_utils.InferenceResponse(
+                    output_tensors=[output_tensor])
+            except Exception as err:  # boundary: report per request
+                inference_response = pb_utils.InferenceResponse(
+                    output_tensors=[],
+                    error=pb_utils.TritonError(str(err)))
+            responses.append(inference_response)
+
+        # Length of this list must match the length of `requests` list.
+        return responses
+
+    def finalize(self):
+        """`finalize` is called only once when the model is being unloaded.
+
+        Implementing `finalize` function is optional. This function allows the
+        model to perform any necessary clean ups before exit.
+        """
+        print('Cleaning up...')
+
+    def _postprocessing(self, tokens_batch, sequence_length):
+        """Decode token ids into utf-8 encoded texts.
+
+        `tokens_batch` and `sequence_length` are iterated in lockstep on two
+        levels (batch item, then beam); every `(tokens, length)` pair is
+        passed to `self.tokenizer.decode` and the decoded text is encoded to
+        bytes.
+
+        Returns
+        -------
+        list
+          A flat list of `bytes`, one entry per decoded sequence.
+        """
+        outputs = []
+        for beam_tokens, beam_len in zip(tokens_batch, sequence_length):
+            for tokens, _len in zip(beam_tokens, beam_len):
+                text = self.tokenizer.decode(tokens, _len)
+                outputs.append(text.encode('utf8'))
+        return outputs
diff --git a/model_repository/postprocessing/1/tokenizer/config.json b/model_repository/postprocessing/1/tokenizer/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..84235b8a1a9618cc0ac265caf61ea4088780e3b1
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/config.json
@@ -0,0 +1,37 @@
+{
+ "_name_or_path": "/root/psy/internlm2-7b/work_dirs/internlm2_chat_7b_qlora_oasst1_512_e3_copy/hf_2/merge",
+ "architectures": [
+ "InternLM2ForCausalLM"
+ ],
+ "attn_implementation": "eager",
+ "auto_map": {
+ "AutoConfig": "configuration_internlm.InternLMConfig",
+ "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
+ "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
+ },
+ "bias": false,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "fp16": true,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 32768,
+ "model_type": "internlm",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 8,
+ "pad_token_id": 2,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 2.0,
+ "type": "dynamic"
+ },
+ "rope_theta": 1000000,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.37.2",
+ "use_cache": false,
+ "vocab_size": 92544
+}
diff --git a/model_repository/postprocessing/1/tokenizer/configuration_internlm.py b/model_repository/postprocessing/1/tokenizer/configuration_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d013582feaa1f9970a4256c4a0f77000fa645de
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/configuration_internlm.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" InternLM model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+
+# Archive map of pretrained configurations; left empty here and not referenced
+# anywhere else in this file.
+INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class InternLMConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
+    an InternLM model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 103168):
+            Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`InternLMModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Id of the padding token, forwarded to [`PretrainedConfig`].
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Id of the beginning-of-sequence token, forwarded to [`PretrainedConfig`].
+        eos_token_id (`int`, *optional*, defaults to 2):
+            Id of the end-of-sequence token, forwarded to [`PretrainedConfig`].
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        bias (`bool`, *optional*, defaults to `True`):
+            Stored as `config.bias`; the attention layers use it as the `bias` flag of their linear projections.
+        rope_theta (`float`, *optional*, defaults to 10000):
+            Stored as `config.rope_theta`; the attention layers use it as the base of the rotary embedding.
+        rope_scaling (`dict`, *optional*):
+            Either `None` (no scaling) or a dictionary with exactly two fields: `type` (`"linear"` or `"dynamic"`)
+            and `factor` (a number >= 1).
+        attn_implementation (`str`, *optional*, defaults to `"eager"`):
+            Name of the attention implementation; `None` is replaced by `"eager"`.
+    Example:
+
+    ```python
+    >>> from transformers import InternLMModel, InternLMConfig
+
+    >>> # Initializing a InternLM internlm-7b style configuration
+    >>> configuration = InternLMConfig()
+
+    >>> # Initializing a model from the internlm-7b style configuration
+    >>> model = InternLMModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "internlm"
+    _auto_class = "AutoConfig"
+
+    def __init__(  # pylint: disable=W0102
+        self,
+        vocab_size=103168,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        bias=True,
+        rope_theta=10000,
+        rope_scaling=None,
+        attn_implementation="eager",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.bias = bias
+
+        # Without an explicit value every attention head gets its own key/value head.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+
+        self.attn_implementation = attn_implementation
+        if self.attn_implementation is None:
+            self.attn_implementation = "eager"
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+
+        Raises:
+            ValueError: if `rope_scaling` is neither `None` nor a two-field dictionary whose `type` is
+                `"linear"` or `"dynamic"` and whose `factor` is a number (int or float, not bool) >= 1.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        # Integer factors such as 2 are as valid as 2.0; bool is an int subclass and stays rejected.
+        factor_is_number = isinstance(rope_scaling_factor, (int, float)) and not isinstance(
+            rope_scaling_factor, bool
+        )
+        if not factor_is_number or rope_scaling_factor < 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
diff --git a/model_repository/postprocessing/1/tokenizer/generation_config.json b/model_repository/postprocessing/1/tokenizer/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc5efeadd3bf2caa4462a3be79d580690f410668
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/generation_config.json
@@ -0,0 +1,7 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 2,
+ "transformers_version": "4.37.2"
+}
diff --git a/model_repository/postprocessing/1/tokenizer/modeling_internlm2.py b/model_repository/postprocessing/1/tokenizer/modeling_internlm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..39d6f71d2933385988ec05f845d3f6386c97f74b
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/modeling_internlm2.py
@@ -0,0 +1,1385 @@
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/modeling_llama.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch InternLM2 model."""
+import math
+import queue
+import threading
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from einops import rearrange
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+ SequenceClassifierOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+
+try:
+    from transformers.generation.streamers import BaseStreamer
+except Exception:  # noqa # pylint: disable=broad-except
+    # Best-effort import: fall back to None when the streamer class cannot be
+    # imported. `Exception` (not a bare `except`) lets KeyboardInterrupt and
+    # SystemExit propagate.
+    BaseStreamer = None
+
+from .configuration_internlm import InternLMConfig as InternLM2Config
+
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+
+# Name of the configuration class as a string (presumably consumed by the
+# docstring decorators imported above -- confirm against later uses).
+_CONFIG_FOR_DOC = "InternLM2Config"
+
+# Placeholders for the flash-attn entry points; they stay None until
+# `_import_flash_attn()` succeeds.
+flash_attn_func, flash_attn_varlen_func = None, None
+pad_input, index_first_axis, unpad_input = None, None, None
+
+
+def _import_flash_attn():
+    """Import flash_attn lazily and bind its functions to the module globals.
+
+    Raises:
+        ImportError: if `flash_attn` (or `flash_attn.bert_padding`) cannot be imported.
+    """
+    global flash_attn_func, flash_attn_varlen_func
+    global pad_input, index_first_axis, unpad_input
+    try:
+        from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func
+        from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input
+    except ImportError as err:
+        # Chain explicitly so the underlying import failure stays visible.
+        raise ImportError("flash_attn is not installed.") from err
+    flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
+    pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    """
+    Derive the indexing data needed to drop padding from a batch.
+
+    Args:
+        attention_mask (`torch.Tensor`):
+            Mask that is summed over its last dimension to obtain per-sequence lengths; non-zero entries mark
+            positions to keep.
+
+    Returns:
+        `tuple`: `(indices, cu_seqlens, max_seqlen_in_batch)` where `indices` are the positions of the non-zero
+        entries of the flattened mask, `cu_seqlens` is the int32 cumulative sum of the sequence lengths with a
+        leading 0, and `max_seqlen_in_batch` is the largest sequence length as a Python number.
+    """
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    # Prepend a 0 so that cu_seqlens[i]:cu_seqlens[i + 1] spans sequence i.
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Build the additive causal mask of shape `[bsz, 1, tgt_len, tgt_len + past_key_values_length]`.
+
+    Entries a query position may attend to (itself, earlier positions and all past key/value slots) are 0;
+    every later position holds the minimum value produced for `dtype`.
+    """
+    batch_size, target_len = input_ids_shape
+    fill_value = torch.tensor(torch.finfo(dtype).min, device=device)
+    causal = torch.full((target_len, target_len), fill_value, device=device)
+    positions = torch.arange(causal.size(-1), device=device)
+    # Column j is visible from row i when j < i + 1.
+    visible = positions < (positions + 1).view(causal.size(-1), 1)
+    causal.masked_fill_(visible, 0)
+    causal = causal.to(dtype)
+
+    if past_key_values_length > 0:
+        # Cached positions are always visible: prepend a block of zeros.
+        past_block = torch.zeros(target_len, past_key_values_length, dtype=dtype, device=device)
+        causal = torch.cat([past_block, causal], dim=-1)
+    total_len = target_len + past_key_values_length
+    return causal[None, None, :, :].expand(batch_size, 1, target_len, total_len)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Turn a `[bsz, seq_len]` attention mask into an additive `[bsz, 1, tgt_seq_len, src_seq_len]` mask.
+
+    The mask is inverted (`1 - mask`); every non-zero entry of the inverted mask is replaced by the minimum
+    value of `dtype`. `tgt_len` defaults to the source length.
+    """
+    batch_size, source_len = mask.size()
+    if tgt_len is None:
+        tgt_len = source_len
+
+    broadcast = mask[:, None, None, :].expand(batch_size, 1, tgt_len, source_len).to(dtype)
+    inverted = 1.0 - broadcast
+    blocked = inverted.to(torch.bool)
+
+    return inverted.masked_fill(blocked, torch.finfo(dtype).min)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2
+class InternLM2RMSNorm(nn.Module):
+    """Root-mean-square layer normalization with a learnable per-feature scale."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        InternLM2RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        """Normalize over the last dimension in float32, then scale in the input dtype."""
+        original_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(original_dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2
+class InternLM2RotaryEmbedding(nn.Module):
+    """Rotary position embedding that caches cos/sin tables and grows them on demand."""
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        """
+        Args:
+            dim: size of the rotary dimension; frequencies are built for every second index.
+            max_position_embeddings: number of positions pre-computed at construction time.
+            base: base of the inverse-frequency geometric progression.
+            device: device on which the inverse frequencies are created.
+        """
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """(Re)build the `cos_cached` / `sin_cached` buffers for `seq_len` positions."""
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        """
+        Return the first `seq_len` rows of the cos/sin tables, cast to `x.dtype`.
+
+        Args:
+            x: tensor of shape `[bs, num_attention_heads, seq_len, head_size]`; supplies dtype and device.
+            seq_len: number of positions required. When omitted it is taken from `x.shape[-2]`.
+        """
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len is None:
+            # The declared default used to fail in the comparison below.
+            seq_len = x.shape[-2]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2
+class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
+    """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        # Must be set first: the parent constructor already calls
+        # `_set_cos_sin_cache`, which reads `scaling_factor`.
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """Rebuild the cos/sin buffers with position indices divided by `scaling_factor`."""
+        self.max_seq_len_cached = seq_len
+        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        scaled_positions = positions / self.scaling_factor
+
+        angles = torch.einsum("i,j->ij", scaled_positions, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        table = torch.cat((angles, angles), dim=-1)
+        cos_table = table.cos().to(dtype)
+        sin_table = table.sin().to(dtype)
+        self.register_buffer("cos_cached", cos_table, persistent=False)
+        self.register_buffer("sin_cached", sin_table, persistent=False)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2
+class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
+    """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
+    Credits to the Reddit users /u/bloc97 and /u/emozilla.
+    """
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        # Must be set first: the parent constructor already calls
+        # `_set_cos_sin_cache`, which may read `scaling_factor`.
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """Rebuild the cos/sin buffers, rescaling the base once `seq_len` exceeds `max_position_embeddings`."""
+        self.max_seq_len_cached = seq_len
+
+        if seq_len > self.max_position_embeddings:
+            # Enlarge the base and replace the `inv_freq` buffer accordingly.
+            growth = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            exponent = self.dim / (self.dim - 2)
+            base = self.base * growth ** exponent
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        table = torch.cat((angles, angles), dim=-1)
+        cos_table = table.cos().to(dtype)
+        sin_table = table.sin().to(dtype)
+        self.register_buffer("cos_cached", cos_table, persistent=False)
+        self.register_buffer("sin_cached", sin_table, persistent=False)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Split the last dimension in two halves and return `cat(-second_half, first_half)`."""
+    half = x.shape[-1] // 2
+    first_half = x[..., :half]
+    second_half = x[..., half:]
+    return torch.cat((-second_half, first_half), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    `cos` and `sin` are indexed with `position_ids` and get an extra dimension inserted at `unsqueeze_dim`
+    before being combined with `q` and `k`. Returns the tuple `(q_embed, k_embed)`.
+    """
+    cos_selected = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin_selected = sin[position_ids].unsqueeze(unsqueeze_dim)
+
+    def _rotate(states):
+        # Same rotation for queries and keys.
+        return (states * cos_selected) + (rotate_half(states) * sin_selected)
+
+    q_embed = _rotate(q)
+    k_embed = _rotate(k)
+    return q_embed, k_embed
+
+
+class InternLM2MLP(nn.Module):
+    """Gated feed-forward block: `w2(act_fn(w1(x)) * w3(x))`, all projections without bias."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        hidden, intermediate = self.hidden_size, self.intermediate_size
+        self.w1 = nn.Linear(hidden, intermediate, bias=False)
+        self.w3 = nn.Linear(hidden, intermediate, bias=False)
+        self.w2 = nn.Linear(intermediate, hidden, bias=False)
+        # Activation looked up by the name stored in `config.hidden_act`.
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        """Project up through the gated pair (`w1`, `w3`) and back down through `w2`."""
+        gate = self.act_fn(self.w1(x))
+        up = self.w3(x)
+        return self.w2(gate * up)
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along the head dimension, which is the equivalent of
+    torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, num_key_value_heads,
+    seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim). With `n_rep == 1` the input is
+    returned unchanged.
+    """
+    batch, kv_heads, seq_len, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the heads.
+    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
+    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaAttention
+class InternLM2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper
+
+    Queries, keys and values are produced by one fused projection (`wqkv`); keys and values use
+    `num_key_value_heads` heads that are repeated to match `num_heads`. Rotary position embeddings are
+    applied to queries and keys.
+    """
+
+    def __init__(self, config: InternLM2Config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        # Number of query heads that share one key/value head.
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.is_causal = True
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        # Fused projection: num_heads query heads plus num_key_value_heads key and value heads each.
+        self.wqkv = nn.Linear(
+            self.hidden_size,
+            (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
+            bias=config.bias,
+        )
+
+        # Output projection back to the hidden size.
+        self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
+        self._init_rope()
+
+    def _init_rope(self):
+        """Create `self.rotary_emb` according to `config.rope_scaling` and return it.
+
+        Raises:
+            ValueError: if the scaling type is neither 'dynamic' nor 'linear'.
+        """
+        if self.config.rope_scaling is None:
+            self.rotary_emb = InternLM2RotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.config.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "dynamic":
+                self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    base=self.config.rope_theta,
+                    scaling_factor=scaling_factor,
+                )
+            elif scaling_type == "linear":
+                self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    base=self.config.rope_theta,
+                    scaling_factor=scaling_factor,
+                )
+            else:
+                raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.")
+        return self.rotary_emb
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """View `tensor` as (bsz, seq_len, num_heads, head_dim) and move the head axis before the sequence axis."""
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Compute attention with an explicit softmax over the score matrix.
+
+        Args:
+            hidden_states: input of shape (bsz, q_len, hidden_size).
+            attention_mask: optional additive mask; must have shape (bsz, 1, q_len, kv_seq_len).
+            position_ids: indices used to select rows of the rotary cos/sin tables.
+            past_key_value: optional `(key, value)` pair that is prepended along the sequence axis.
+            output_attentions: when False the returned attention weights are None.
+            use_cache: when True the concatenated `(key, value)` pair is returned, otherwise None.
+            **kwargs: only inspected for the deprecated `padding_mask` key, which triggers a warning.
+
+        Returns:
+            Tuple `(attn_output, attn_weights, past_key_value)`; `attn_output` has shape
+            (bsz, q_len, hidden_size).
+
+        Raises:
+            ValueError: if the attention weights, the attention mask or the attention output have an
+                unexpected shape.
+        """
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+                "Please make sure use `attention_mask` instead.`"
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        qkv_states = self.wqkv(hidden_states)
+
+        # Split the fused projection into `h` groups of (num_key_value_groups + 2) slots of size head_dim.
+        qkv_states = rearrange(
+            qkv_states,
+            "b q (h gs d) -> b q h gs d",
+            gs=2 + self.num_key_value_groups,
+            d=self.head_dim,
+        )
+
+        # Within each group: the leading slots are queries, the last two slots are key and value.
+        query_states = qkv_states[..., : self.num_key_value_groups, :]
+        query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
+        key_states = qkv_states[..., -2, :]
+        value_states = qkv_states[..., -1, :]
+
+        # Move the head axis in front of the sequence axis.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        # The rotary tables must cover cached positions as well as the new ones.
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+        # The cache is captured before the key/value heads are repeated.
+        past_key_value = (key_states, value_states) if use_cache else None
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        # Scaled dot-product scores.
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            # The mask is additive.
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        # Merge the heads back into the hidden dimension.
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.wo(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaFlashAttention2
+class InternLM2FlashAttention2(InternLM2Attention):
+    """
+    Flash-attention variant of the InternLM2 attention layer.
+
+    The class defines no `__init__` of its own, so it reuses whatever `InternLM2Attention` builds; only the forward
+    pass differs. Projected query/key/value tensors are handed to the flash-attn kernels and, when a padding mask
+    is supplied, padded positions are stripped before the kernel call and restored afterwards.
+    """
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """
+        Run self-attention through the flash-attn kernels.
+
+        Args:
+            hidden_states: Layer input; unpacked as `(batch, q_len, hidden)`.
+            attention_mask: Padding mask forwarded to `_flash_attention_forward`.
+            position_ids: Positions forwarded to `apply_rotary_pos_emb`.
+            past_key_value: Cached `(key, value)` pair that is concatenated in front of the new keys/values.
+            output_attentions: Ignored; this path never produces attention weights.
+            use_cache: Whether the updated `(key, value)` pair is returned.
+            **kwargs: Only the deprecated `padding_mask` entry is read.
+
+        Returns:
+            `(attn_output, None, present_key_value)`, where `present_key_value` is `None` unless `use_cache`
+            is set.
+        """
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+                "Please make sure use `attention_mask` instead.`"
+            )
+            # The deprecated argument wins over whatever was passed as `attention_mask`.
+            attention_mask = kwargs.pop("padding_mask")
+
+        batch, num_new_tokens, _ = hidden_states.size()
+
+        # The fused projection is viewed as one group per key/value head: `num_key_value_groups` query slots
+        # followed by one key slot and one value slot.
+        fused = rearrange(
+            self.wqkv(hidden_states),
+            "b q (h gs d) -> b q h gs d",
+            gs=2 + self.num_key_value_groups,
+            d=self.head_dim,
+        )
+        queries = rearrange(fused[..., : self.num_key_value_groups, :], "b q h gs d -> b q (h gs) d")
+        keys = fused[..., -2, :]
+        values = fused[..., -1, :]
+
+        # Swap the sequence and head axes for the rotary embedding and the cache concatenation.
+        queries, keys, values = (tensor.transpose(1, 2) for tensor in (queries, keys, values))
+
+        total_kv_len = keys.shape[-2]
+        if past_key_value is not None:
+            total_kv_len += past_key_value[0].shape[-2]
+
+        cos, sin = self.rotary_emb(values, seq_len=total_kv_len)
+        queries, keys = apply_rotary_pos_emb(queries, keys, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            # Prepend the cached keys/values along the sequence axis.
+            keys = torch.cat([past_key_value[0], keys], dim=2)
+            values = torch.cat([past_key_value[1], values], dim=2)
+
+        present_key_value = (keys, values) if use_cache else None
+
+        # The kernels are called with the sequence axis back in front of the head axis.
+        attn_output = self._flash_attention_forward(
+            queries.transpose(1, 2),
+            keys.transpose(1, 2),
+            values.transpose(1, 2),
+            attention_mask,
+            num_new_tokens,
+        )
+        attn_output = attn_output.reshape(batch, num_new_tokens, self.hidden_size).contiguous()
+        attn_output = self.wo(attn_output)
+
+        # Attention weights are never materialised on this path.
+        return attn_output, None, present_key_value
+
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Dispatch to the dense or the variable-length flash-attn kernel.
+
+        Without a mask the dense kernel is called directly. With a mask the inputs are unpadded, passed to the
+        variable-length kernel, and the result is padded back to `(batch_size, query_length, ...)`.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Query tensor handed to the flash-attn API.
+            key_states (`torch.Tensor`):
+                Key tensor handed to the flash-attn API.
+            value_states (`torch.Tensor`):
+                Value tensor handed to the flash-attn API.
+            attention_mask (`torch.Tensor`):
+                Padding mask, or `None` when nothing has to be unpadded.
+            query_length (`int`):
+                Number of query positions; a single-position query disables causal masking.
+            dropout (`int`, *optional*):
+                Attention dropout probability.
+            softmax_scale (`float`, *optional*):
+                Scale applied to QK^T before the softmax; passed through to the kernel unchanged.
+        """
+        causal = self.is_causal and query_length != 1
+
+        if attention_mask is None:
+            return flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+
+        batch_size = query_states.shape[0]
+        packed_q, packed_k, packed_v, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
+            query_states, key_states, value_states, attention_mask, query_length
+        )
+        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+        attn_output_unpad = flash_attn_varlen_func(
+            packed_q,
+            packed_k,
+            packed_v,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            max_seqlen_q=max_seqlen_in_batch_q,
+            max_seqlen_k=max_seqlen_in_batch_k,
+            dropout_p=dropout,
+            softmax_scale=softmax_scale,
+            causal=causal,
+        )
+        return pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+
+    def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        """
+        Drop masked positions from the query/key/value tensors.
+
+        Returns:
+            `(query, key, value, indices_q, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k))`, with
+            `indices_q` cast to `torch.int64`.
+        """
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+        # Keys and values share one flattened (batch * seq) layout before the index selection.
+        flat_kv_shape = (batch_size * kv_seq_len, num_key_value_heads, head_dim)
+        key_layer = index_first_axis(key_layer.reshape(flat_kv_shape), indices_k)
+        value_layer = index_first_axis(value_layer.reshape(flat_kv_shape), indices_k)
+
+        if query_length == kv_seq_len:
+            # Queries line up with the keys, so the key-side bookkeeping is reused as is.
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            indices_q = indices_k
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+        elif query_length == 1:
+            # One query position per sequence.
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # Taking the trailing `query_length` mask columns assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q.to(torch.int64),
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+# Lookup from an attention-implementation name to the attention class that implements it.
+INTERNLM2_ATTENTION_CLASSES = dict(
+    eager=InternLM2Attention,
+    flash_attention_2=InternLM2FlashAttention2,
+)
+
+# Modified from transformers.models.llama.modeling_llama.LlamaDecoderLayer
+class InternLM2DecoderLayer(nn.Module):
+    """One decoder block: normalised self-attention and a normalised feed-forward, each with a residual add."""
+
+    def __init__(self, config: InternLM2Config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        # The attention class is selected by `config.attn_implementation`.
+        attention_cls = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation]
+        self.attention = attention_cls(config=config)
+
+        self.feed_forward = InternLM2MLP(config)
+        self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Apply the attention sub-layer and then the feed-forward sub-layer.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): layer input.
+            attention_mask (`torch.FloatTensor`, *optional*): mask handed to the attention module unchanged.
+            position_ids (`torch.LongTensor`, *optional*): positions handed to the attention module unchanged.
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached key/value states for this layer.
+            output_attentions (`bool`, *optional*): append the attention weights to the returned tuple.
+            use_cache (`bool`, *optional*): append the present key/value states to the returned tuple.
+            **kwargs: forwarded to the attention module; a `padding_mask` entry triggers a deprecation warning.
+
+        Returns:
+            A tuple starting with the new hidden states, followed by the attention weights (if requested) and
+            the present key/value states (if requested), in that order.
+        """
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+                "Please make sure use `attention_mask` instead.`"
+            )
+
+        # Attention sub-layer: normalise first, add the input back afterwards.
+        attn_input = hidden_states
+        attn_out, self_attn_weights, present_key_value = self.attention(
+            hidden_states=self.attention_norm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = attn_input + attn_out
+
+        # Feed-forward sub-layer with the same normalise-then-residual pattern.
+        ffn_input = hidden_states
+        hidden_states = ffn_input + self.feed_forward(self.ffn_norm(hidden_states))
+
+        extras = []
+        if output_attentions:
+            extras.append(self_attn_weights)
+        if use_cache:
+            extras.append(present_key_value)
+
+        return (hidden_states, *extras)
+
+
+InternLM2_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`InternLM2Config`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2
+@add_start_docstrings(
+    "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
+    InternLM2_START_DOCSTRING,
+)
+class InternLM2PreTrainedModel(PreTrainedModel):
+    # Class-level settings read by the `PreTrainedModel` machinery.
+    config_class = InternLM2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["InternLM2DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+
+    def _init_weights(self, module):
+        # Linear and embedding weights are drawn from N(0, initializer_range); every other module is untouched.
+        std = self.config.initializer_range
+        is_linear = isinstance(module, nn.Linear)
+        if not is_linear and not isinstance(module, nn.Embedding):
+            return
+
+        module.weight.data.normal_(mean=0.0, std=std)
+
+        if is_linear:
+            # Biases, when present, start at zero.
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif module.padding_idx is not None:
+            # The padding row of an embedding is zeroed.
+            module.weight.data[module.padding_idx].zero_()
+
+
+InternLM2_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
+ when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaModel
+@add_start_docstrings(
+    "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
+    InternLM2_START_DOCSTRING,
+)
+class InternLM2Model(InternLM2PreTrainedModel):
+    """
+    Stack of *config.num_hidden_layers* [`InternLM2DecoderLayer`] blocks on top of a token embedding, followed by a
+    final RMS norm.
+
+    Args:
+        config: InternLM2Config
+    """
+
+    _auto_class = "AutoModel"
+
+    def __init__(self, config: InternLM2Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.config = config
+
+        self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+
+        decoder_blocks = [InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)]
+        self.layers = nn.ModuleList(decoder_blocks)
+        self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the token embedding module."""
+        return self.tok_embeddings
+
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module with `value`."""
+        self.tok_embeddings = value
+
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        """
+        Build the mask for the eager attention path.
+
+        A causal mask is created when more than one position is being processed; a supplied `attention_mask` is
+        expanded and added to it. Returns `None` when neither part applies.
+        """
+        causal_mask = None
+        if input_shape[-1] > 1:
+            causal_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+
+        if attention_mask is None:
+            return causal_mask
+
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        expanded_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+        expanded_mask = expanded_mask.to(inputs_embeds.device)
+
+        if causal_mask is None:
+            return expanded_mask
+        return expanded_mask + causal_mask
+
+    @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        # Flags left as `None` fall back to the values stored on the config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if use_cache is None:
+            use_cache = self.config.use_cache
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        if self.config.attn_implementation == "flash_attention_2":
+            _import_flash_attn()
+
+        # Exactly one of `input_ids` / `inputs_embeds` must be supplied.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        given_input = input_ids if input_ids is not None else inputs_embeds
+        batch_size, seq_length = given_input.shape[:2]
+
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[2]
+        seq_length_with_past = seq_length + past_key_values_length
+
+        if position_ids is None:
+            # Default positions continue from the end of the cache.
+            position_ids = torch.arange(
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=given_input.device,
+            ).unsqueeze(0)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.tok_embeddings(input_ids)
+
+        if self.config.attn_implementation == "flash_attention_2":
+            # The 2d mask is passed through the layers, and dropped entirely when it contains no zero.
+            if attention_mask is not None and 0 not in attention_mask:
+                attention_mask = None
+        else:
+            if attention_mask is None:
+                attention_mask = torch.ones(
+                    (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+                )
+            attention_mask = self._prepare_decoder_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
+
+        hidden_states = inputs_embeds
+
+        checkpointing = self.gradient_checkpointing and self.training
+        if checkpointing and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+            )
+            use_cache = False
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        def create_custom_forward(module):
+            # Wrap a layer so the checkpoint call only has to carry the tensor arguments.
+            def custom_forward(*inputs):
+                # None for past_key_value
+                return module(*inputs, output_attentions, None)
+
+            return custom_forward
+
+        for layer_index, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_past = None if past_key_values is None else past_key_values[layer_index]
+
+            if checkpointing:
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=layer_past,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                # The cache entry sits after the attention weights when those are returned too.
+                cache_slot = 2 if output_attentions else 1
+                next_decoder_cache += (layer_outputs[cache_slot],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # The output of the final norm is recorded as the last hidden state.
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+
+        if return_dict:
+            return BaseModelOutputWithPast(
+                last_hidden_state=hidden_states,
+                past_key_values=next_cache,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attns,
+            )
+        # Tuple output skips every entry that was not requested.
+        candidates = [hidden_states, next_cache, all_hidden_states, all_self_attns]
+        return tuple(v for v in candidates if v is not None)
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaForCausalLM
+class InternLM2ForCausalLM(InternLM2PreTrainedModel):
+    """InternLM2 decoder with a linear language-modeling head, plus chat-style generation helpers."""
+
+    _auto_class = "AutoModelForCausalLM"
+
+    _tied_weights_keys = ["output.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = InternLM2Model(config)
+        self.vocab_size = config.vocab_size
+        self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the token embedding module of the wrapped decoder."""
+        return self.model.tok_embeddings
+
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module of the wrapped decoder."""
+        self.model.tok_embeddings = value
+
+    def get_output_embeddings(self):
+        """Return the language-modeling head."""
+        return self.output
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the language-modeling head."""
+        self.output = new_embeddings
+
+    def set_decoder(self, decoder):
+        """Replace the wrapped decoder."""
+        self.model = decoder
+
+    def get_decoder(self):
+        """Return the wrapped decoder."""
+        return self.model
+
+    # NOTE: `replace_return_docstrings` is applied to the docstring below; keep the bare `Returns:` line.
+    @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, InternLM2ForCausalLM
+
+        >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        logits = self.output(hidden_states)
+        # Logits are always returned (and the loss computed) in float32.
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        """
+        Assemble the keyword arguments for one generation step.
+
+        With a cache present, `input_ids` is cut down to the tokens the cache does not cover yet (or to the final
+        token when `input_ids` is not longer than the cache). `position_ids` are derived from `attention_mask`
+        when the caller did not pass them in `kwargs`.
+
+        Returns:
+            A dict with either `inputs_embeds` (first step only) or `input_ids`, plus `position_ids`,
+            `past_key_values`, `use_cache` and `attention_mask`.
+        """
+        if past_key_values is not None:
+            past_length = past_key_values[0][0].shape[2]
+
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+
+            input_ids = input_ids[:, remove_prefix_length:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            # Masked positions get the placeholder position 1.
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        """Select the `beam_idx` rows (axis 0) of every cached tensor, layer by layer, and return a new tuple."""
+        return tuple(
+            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
+            for layer_past in past_key_values
+        )
+
+    def build_inputs(
+        self, tokenizer, query: str, history: Optional[List[Tuple[str, str]]] = None, meta_instruction=""
+    ):
+        """
+        Render a conversation into the `<|im_start|>` / `<|im_end|>` chat format and tokenize it.
+
+        Args:
+            tokenizer: Callable invoked as `tokenizer([prompt], return_tensors="pt")`.
+            query: The new user message.
+            history: Earlier `(user, assistant)` turns; `None` means no earlier turns.
+            meta_instruction: Optional system message; omitted from the prompt when empty.
+
+        Returns:
+            Whatever the tokenizer call returns for the rendered prompt.
+        """
+        # `None` replaces the former shared mutable `[]` default.
+        history = [] if history is None else history
+
+        prompt = ""
+        if meta_instruction:
+            prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
+        for record in history:
+            prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
+        prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
+        return tokenizer([prompt], return_tensors="pt")
+
+    @torch.no_grad()
+    def chat(
+        self,
+        tokenizer,
+        query: str,
+        history: Optional[List[Tuple[str, str]]] = None,
+        streamer: Optional[BaseStreamer] = None,
+        max_new_tokens: int = 1024,
+        do_sample: bool = True,
+        temperature: float = 0.8,
+        top_p: float = 0.8,
+        meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+        "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+        "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.",
+        **kwargs,
+    ):
+        """
+        Generate one assistant reply for `query`.
+
+        Args:
+            tokenizer: Tokenizer used to build the prompt and decode the reply.
+            query: The new user message.
+            history: Earlier `(user, assistant)` turns; `None` means no earlier turns. The list is not modified.
+            streamer: Optional streamer forwarded to `generate`.
+            max_new_tokens, do_sample, temperature, top_p: Forwarded to `generate`.
+            meta_instruction: System message placed at the start of the prompt.
+            **kwargs: Extra keyword arguments forwarded to `generate`.
+
+        Returns:
+            `(response, new_history)`, where `new_history` is a new list equal to `history` plus the
+            `(query, response)` turn.
+        """
+        # `None` replaces the former shared mutable `[]` default.
+        history = [] if history is None else history
+
+        inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
+        inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
+        # also add end-of-assistant token in eos token id to avoid unnecessary generation
+        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
+        outputs = self.generate(
+            **inputs,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=do_sample,
+            temperature=temperature,
+            top_p=top_p,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+        # Keep only the tokens generated after the prompt.
+        outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
+        response = tokenizer.decode(outputs, skip_special_tokens=True)
+        response = response.split("<|im_end|>")[0]
+        history = history + [(query, response)]
+        return response, history
+
+    @torch.no_grad()
+    def stream_chat(
+        self,
+        tokenizer,
+        query: str,
+        history: Optional[List[Tuple[str, str]]] = None,
+        max_new_tokens: int = 1024,
+        do_sample: bool = True,
+        temperature: float = 0.8,
+        top_p: float = 0.8,
+        **kwargs,
+    ):
+        """
+        Return a generator yielding `(response_so_far, history_so_far)` while the reply is being generated.
+
+        Generation runs in a background thread that feeds a bounded queue; the returned generator drains it.
+        E.g. successive items may look like
+            ('Hello, how can I', [('Hi', 'Hello, how can I')])
+            ('Hello, how can I help', [('Hi', 'Hello, how can I help')])
+
+        Raises:
+            ModuleNotFoundError: If `BaseStreamer` is unavailable.
+            Exception: Any exception raised by the generation thread is re-raised from the generator, so a
+                failed generation no longer leaves the consumer blocked on the queue.
+        """
+        if BaseStreamer is None:
+            raise ModuleNotFoundError(
+                "The version of `transformers` is too low. Please make sure "
+                "that you have installed `transformers>=4.28.0`."
+            )
+
+        # `None` replaces the former shared mutable `[]` default.
+        history = [] if history is None else history
+
+        response_queue = queue.Queue(maxsize=20)
+
+        class ChatStreamer(BaseStreamer):
+            """Streamer that decodes each new token and pushes the growing response onto `response_queue`."""
+
+            def __init__(self, tokenizer) -> None:
+                super().__init__()
+                self.tokenizer = tokenizer
+                self.queue = response_queue
+                self.query = query
+                self.history = history
+                self.response = ""
+                self.received_inputs = False
+                # Emit an initial item carrying an empty response.
+                self.queue.put((self.response, history + [(self.query, self.response)]))
+
+            def put(self, value):
+                if len(value.shape) > 1 and value.shape[0] > 1:
+                    raise ValueError("ChatStreamer only supports batch size 1")
+                elif len(value.shape) > 1:
+                    value = value[0]
+
+                if not self.received_inputs:
+                    # The first received value is input_ids, ignore here
+                    self.received_inputs = True
+                    return
+
+                token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
+                if token.strip() != "<|im_end|>":
+                    self.response = self.response + token
+                    history = self.history + [(self.query, self.response)]
+                    self.queue.put((self.response, history))
+
+            def end(self):
+                # `None` is the end-of-stream sentinel for the consumer.
+                self.queue.put(None)
+
+        def stream_producer():
+            try:
+                return self.chat(
+                    tokenizer=tokenizer,
+                    query=query,
+                    streamer=ChatStreamer(tokenizer=tokenizer),
+                    history=history,
+                    max_new_tokens=max_new_tokens,
+                    do_sample=do_sample,
+                    temperature=temperature,
+                    top_p=top_p,
+                    **kwargs,
+                )
+            except Exception as exc:
+                # Without this the consumer would wait forever: a failing `generate` never calls `end()`.
+                # The exception is handed to the consumer, which re-raises it in the caller's thread.
+                response_queue.put(exc)
+                return None
+
+        def consumer():
+            producer = threading.Thread(target=stream_producer)
+            producer.start()
+            while True:
+                res = response_queue.get()
+                if res is None:
+                    return
+                if isinstance(res, Exception):
+                    raise res
+                yield res
+
+        return consumer()
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
+@add_start_docstrings(
+ """
+ The InternLM2 Model transformer with a sequence classification head on top (linear layer).
+
+ [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification,
+ as other causal models (e.g. GPT-2) do.
+
+ Since it does classification on the last token, it requires to know the position of the last token. If a
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+ each row of the batch).
+ """,
+    InternLM2_START_DOCSTRING,
+)
+class InternLM2ForSequenceClassification(InternLM2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = InternLM2Model(config)
+        # Bias-free linear head mapping each hidden state to `num_labels` scores.
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the token embedding module of the wrapped decoder."""
+        return self.model.tok_embeddings
+
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module of the wrapped decoder."""
+        self.model.tok_embeddings = value
+
+    @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = self.score(transformer_outputs[0])
+
+        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
+
+        pad_token_id = self.config.pad_token_id
+        if pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+
+        if pad_token_id is not None and input_ids is not None:
+            # Index of the position just before the first pad token in each row.
+            first_pad = torch.eq(input_ids, pad_token_id).int().argmax(-1)
+            sequence_lengths = (first_pad - 1).to(logits.device)
+        else:
+            # Without a pad id, or without token ids to inspect, fall back to the last position.
+            sequence_lengths = -1
+
+        row_index = torch.arange(batch_size, device=logits.device)
+        pooled_logits = logits[row_index, sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+
+            if self.config.problem_type is None:
+                # Infer the problem type once and store it on the config.
+                if self.num_labels == 1:
+                    inferred = "regression"
+                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
+                    inferred = "single_label_classification"
+                else:
+                    inferred = "multi_label_classification"
+                self.config.problem_type = inferred
+
+            problem_type = self.config.problem_type
+            if problem_type == "regression":
+                criterion = MSELoss()
+                if self.num_labels == 1:
+                    loss = criterion(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = criterion(pooled_logits, labels)
+            elif problem_type == "single_label_classification":
+                criterion = CrossEntropyLoss()
+                loss = criterion(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif problem_type == "multi_label_classification":
+                criterion = BCEWithLogitsLoss()
+                loss = criterion(pooled_logits, labels)
+
+        if return_dict:
+            return SequenceClassifierOutputWithPast(
+                loss=loss,
+                logits=pooled_logits,
+                past_key_values=transformer_outputs.past_key_values,
+                hidden_states=transformer_outputs.hidden_states,
+                attentions=transformer_outputs.attentions,
+            )
+
+        output = (pooled_logits,) + transformer_outputs[1:]
+        if loss is None:
+            return output
+        return (loss,) + output
diff --git a/model_repository/postprocessing/1/tokenizer/placeholder b/model_repository/postprocessing/1/tokenizer/placeholder
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/model_repository/postprocessing/1/tokenizer/pytorch_model.bin.index.json b/model_repository/postprocessing/1/tokenizer/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..7d95cf180df4c423e817c55f30f5ce93ac80e220
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/pytorch_model.bin.index.json
@@ -0,0 +1,554 @@
+{
+ "metadata": {
+ "total_size": 5251801088
+ },
+ "weight_map": {
+ "model.layers.0.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.2.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.20.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.3.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.30.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.4.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.tok_embeddings.weight": "pytorch_model-00001-of-00003.bin",
+ "output.weight": "pytorch_model-00003-of-00003.bin"
+ }
+}
diff --git a/model_repository/postprocessing/1/tokenizer/special_tokens_map.json b/model_repository/postprocessing/1/tokenizer/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..492d4b2966a1763442d426d880dbc29f94906e4c
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/model_repository/postprocessing/1/tokenizer/tokenization_internlm.py b/model_repository/postprocessing/1/tokenizer/tokenization_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9792349c7fed6fc64476eabdd9dad7a84640c3ee
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/tokenization_internlm.py
@@ -0,0 +1,240 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ """Tokenization classes for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging
+
logger = logging.get_logger(__name__)

# Name of the SentencePiece model file expected next to the tokenizer code.
VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}

# No hosted vocabulary files are registered for this tokenizer.
PRETRAINED_VOCAB_FILES_MAP = {}


class InternLMTokenizer(PreTrainedTokenizer):
    """Construct an InternLM tokenizer backed by a SentencePiece model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token, bos_token, eos_token, pad_token (`str`):
            Special token strings forwarded to `PreTrainedTokenizer`.
        sp_model_kwargs (`dict`, *optional*):
            Keyword arguments forwarded to `SentencePieceProcessor`.
        add_bos_token (`bool`):
            Whether `build_inputs_with_special_tokens` prepends the BOS id.
        add_eos_token (`bool`):
            Whether `build_inputs_with_special_tokens` appends the EOS id.
        decode_with_prefix_space (`bool`):
            Stored on the instance; not read anywhere in this class.
        clean_up_tokenization_spaces (`bool`):
            Forwarded to `PreTrainedTokenizer`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids", "attention_mask"]
    _auto_class = "AutoTokenizer"

    def __init__(
        self,
        vocab_file,
        # NOTE(review): these defaults were empty strings in this copy, which
        # looks like the "<...>" literals were stripped as markup; restored to
        # the conventional SentencePiece control tokens - confirm upstream.
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="</s>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        decode_with_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.decode_with_prefix_space = decode_with_prefix_space
        # The SentencePiece model is loaded before the base-class initializer
        # runs, so vocabulary lookups already work if the base class needs them.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        self._no_prefix_space_tokens = None
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def no_prefix_space_tokens(self):
        """Ids of vocabulary pieces that do not start with "▁" (built lazily)."""
        if self._no_prefix_space_tokens is None:
            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
        return self._no_prefix_space_tokens

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    @property
    def bos_token_id(self) -> Optional[int]:
        """Id of the begin-of-sentence piece as reported by the SentencePiece model."""
        return self.sp_model.bos_id()

    @property
    def eos_token_id(self) -> Optional[int]:
        """Id of the end-of-sentence piece as reported by the SentencePiece model."""
        return self.sp_model.eos_id()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def _maybe_add_prefix_space(self, tokens, decoded):
        """Prepend a space to `decoded` unless `tokens[0]` is a no-prefix-space entry.

        NOTE(review): `convert_tokens_to_string` passes token *strings* while
        `no_prefix_space_tokens` holds *ids*, so the membership test looks like
        it can never succeed for that caller. Behavior is left unchanged.
        """
        if tokens and tokens[0] not in self.no_prefix_space_tokens:
            return " " + decoded
        return decoded

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        out_string = self.clean_up_tokenization(out_string)
        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
        # Drop the first character (the space added just above, when one was added).
        return out_string[1:]

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Optional[Tuple[str]]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                Prefix joined with "-" in front of the vocabulary file name.

        Returns:
            `Tuple(str)`: Paths to the files saved, or `None` (after logging an
            error) when `save_directory` is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The original model file is gone: re-serialize the in-memory model.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Return `[bos] + token_ids_0 (+ token_ids_1) + [eos]`.

        BOS and EOS are only added when `add_bos_token` / `add_eos_token` are
        set; no separator is inserted between the two sequences.
        """
        bos_token_ids = [self.bos_token_id] if self.add_bos_token else []

        output = bos_token_ids + token_ids_0

        if token_ids_1 is not None:
            output = output + token_ids_1

        if self.add_eos_token:
            output = output + [self.eos_token_id]

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        The mask mirrors the layout produced by `build_inputs_with_special_tokens`, so it always has the same
        length as the built sequence.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Only flag the special tokens that build_inputs_with_special_tokens
        # really inserts; the previous version always flagged BOS and EOS (and
        # a pair separator), giving a mask longer than the built sequence.
        bos_mask = [1] if self.add_bos_token else []
        eos_mask = [1] if self.add_eos_token else []
        sequence_mask = [0] * len(token_ids_0)
        if token_ids_1 is not None:
            sequence_mask = sequence_mask + [0] * len(token_ids_1)
        return bos_mask + sequence_mask + eos_mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type ids for a sequence or a sequence pair. This tokenizer does not distinguish segments,
        therefore a list of zeros is returned, one per token of the sequence built by
        `build_inputs_with_special_tokens`.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        # Derive the length from the real builder so both always agree.
        return [0] * len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))
diff --git a/model_repository/postprocessing/1/tokenizer/tokenizer.model b/model_repository/postprocessing/1/tokenizer/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
+size 1477754
diff --git a/model_repository/postprocessing/1/tokenizer/tokenizer.py b/model_repository/postprocessing/1/tokenizer/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..db936a5501cb07d33d56083656dbd734ba7431bf
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/tokenizer.py
@@ -0,0 +1,400 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os
+import os.path as osp
+from collections import deque
+from typing import List, Optional, Sequence, Union
+
+import torch
+
+from lmdeploy.utils import get_logger
+
+ # This file is copied to the Triton server, so make sure every
+ # import starts from the package root `lmdeploy`.
+
+
class SentencePieceTokenizer:
    """Tokenizer backed directly by a SentencePiece model file.

    Args:
        model_file (str): the path of the tokenizer model
    """

    def __init__(self, model_file: str):
        from sentencepiece import SentencePieceProcessor
        self.model = SentencePieceProcessor(model_file=model_file)
        self._prefix_space_tokens = None
        # for stop words
        self._maybe_decode_bytes: Optional[bool] = None
        # TODO maybe lack a constant.py
        # Small cache of (stop word, token ids) pairs, see
        # `indexes_containing_token`.
        self._indexes_tokens_deque = deque(maxlen=10)
        self.max_indexes_num = 5
        self.logger = get_logger('lmdeploy')

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size()

    @property
    def bos_token_id(self):
        """beginning of the sentence token id."""
        return self.model.bos_id()

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_id()

    @property
    def prefix_space_tokens(self):
        """ids of the tokens whose piece starts with the prefix space '▁'."""
        if self._prefix_space_tokens is None:
            vocab = self.model.IdToPiece(list(range(self.vocab_size)))
            self._prefix_space_tokens = {
                i
                for i, tok in enumerate(vocab) if tok.startswith('▁')
            }
        return self._prefix_space_tokens

    def _maybe_add_prefix_space(self, tokens, decoded):
        """maybe add prefix space for incremental decoding."""
        if len(tokens) and not decoded.startswith(' ') and\
                tokens[0] in self.prefix_space_tokens:
            return ' ' + decoded
        return decoded

    def indexes_containing_token(self, token: str):
        """Return all the possible indexes, whose decoding output may contain
        the input token.

        Args:
            token (str): the stop word to look up
        Returns:
            list[int]: ids of the matching vocabulary pieces; when more than
                `max_indexes_num` pieces match, only the last id of the
                encoded token is returned
        """
        # traversing vocab is time consuming, can not be accelerated with
        # multi threads (computation) or multi process (can't pickle tokenizer)
        # so, we maintain latest 10 stop words and return directly if matched
        for cached_token, cached_indexes in self._indexes_tokens_deque:
            if token == cached_token:
                return cached_indexes
        # ' ' is special: it is searched as '▁'. The cache entry keeps the
        # caller's spelling; caching the remapped '▁' meant a later lookup for
        # ' ' never hit and the vocabulary was traversed again on every call.
        piece = '▁' if token == ' ' else token
        vocab = self.model.IdToPiece(list(range(self.vocab_size)))
        indexes = [i for i, voc in enumerate(vocab) if piece in voc]
        if len(indexes) > self.max_indexes_num:
            indexes = self.encode(piece, add_bos=False)[-1:]
            self.logger.warning(
                f'There are too many(>{self.max_indexes_num}) possible '
                f'indexes may decoding {piece}, we will use {indexes} only')
        self._indexes_tokens_deque.append((token, indexes))
        return indexes

    def encode(self, s: str, add_bos: bool = True, **kwargs):
        """Tokenize a prompt.

        Args:
            s (str): a prompt
            add_bos (bool): forwarded to SentencePiece `Encode` as `add_bos`
        Returns:
            list[int]: token ids
        """
        return self.model.Encode(s, add_bos=add_bos, **kwargs)

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): for incrementally decoding. Default to None, which
                means not applied.
        Returns:
            str: text of decoding tokens
        """
        if isinstance(t, torch.Tensor):
            t = t.tolist()
        t = t[offset:]
        out_string = self.model.Decode(t)
        if offset:
            # Restore the leading space of a piece decoded mid-sentence.
            out_string = self._maybe_add_prefix_space(t, out_string)
        return out_string

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts.

        Args:
            s (str): prompts
        Returns:
            addict.Addict: an object whose `input_ids` attribute holds the
                token ids, encoded without bos/eos
        """
        import addict
        add_bos = False
        add_eos = False

        input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
        return addict.Addict(input_ids=input_ids)
+
+
class HuggingFaceTokenizer:
    """Tokenizer backed by a Hugging Face `AutoTokenizer`.

    Args:
        model_dir (str): the directory of the tokenizer model
    """

    def __init__(self, model_dir: str):
        from transformers import AutoTokenizer
        model_file = osp.join(model_dir, 'tokenizer.model')
        backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
        model_file_exists = osp.exists(model_file)
        self.logger = get_logger('lmdeploy')
        if not osp.exists(backend_tokenizer_file) and model_file_exists:
            self.logger.warning(
                'Can not find tokenizer.json. '
                'It may take long time to initialize the tokenizer.')
        self.model = AutoTokenizer.from_pretrained(model_dir,
                                                   trust_remote_code=True)
        self._prefix_space_tokens = None
        # save tokenizer.json to reuse
        if not osp.exists(backend_tokenizer_file) and model_file_exists:
            if hasattr(self.model, 'backend_tokenizer'):
                if os.access(model_dir, os.W_OK):
                    self.model.backend_tokenizer.save(backend_tokenizer_file)

        if self.model.eos_token_id is None:
            # Fall back to generation_config.json, then to the `eod_id`
            # attribute, when the tokenizer itself defines no eos token.
            generation_config_file = osp.join(model_dir,
                                              'generation_config.json')
            if osp.exists(generation_config_file):
                # JSON is UTF-8 by specification; do not rely on the locale.
                with open(generation_config_file, 'r', encoding='utf-8') as f:
                    cfg = json.load(f)
                    self.model.eos_token_id = cfg['eos_token_id']
            elif hasattr(self.model, 'eod_id'):  # Qwen remote
                self.model.eos_token_id = self.model.eod_id

        # for stop words
        self._vocab_size_with_added: Optional[int] = None
        self._maybe_decode_bytes: Optional[bool] = None
        # TODO maybe lack a constant.py
        # Small cache of (stop word, token ids) pairs, see
        # `indexes_containing_token`.
        self._indexes_tokens_deque = deque(maxlen=10)
        self.max_indexes_num = 5
        self.token2id = {}

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size

    @property
    def vocab_size_with_added(self):
        """vocabulary size with added vocab."""
        if self._vocab_size_with_added is not None:
            return self._vocab_size_with_added
        self._vocab_size_with_added = len(self.model.get_vocab())
        return self._vocab_size_with_added

    @property
    def bos_token_id(self):
        """beginning of the sentence token id."""
        return self.model.bos_token_id

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_token_id

    @property
    def prefix_space_tokens(self):
        """ids of the tokens that start with the prefix space ('▁' for str
        tokens, b' ' for bytes tokens)."""
        if self._prefix_space_tokens is None:
            vocab = self.model.convert_ids_to_tokens(
                list(range(self.vocab_size)))
            self._prefix_space_tokens = {
                i
                for i, tok in enumerate(vocab)
                if tok.startswith('▁' if isinstance(tok, str) else b' ')
            }
        return self._prefix_space_tokens

    def _maybe_add_prefix_space(self, tokens: List[int], decoded: str):
        """maybe add prefix space for incremental decoding."""
        if len(tokens) and not decoded.startswith(' ') and\
                tokens[0] in self.prefix_space_tokens:
            return ' ' + decoded
        return decoded

    @property
    def maybe_decode_bytes(self):
        """Check if self.model.convert_ids_to_tokens return not a str value."""
        if self._maybe_decode_bytes is None:
            vocab = self.model.convert_ids_to_tokens(
                list(range(self.vocab_size)))
            self._maybe_decode_bytes = any(not isinstance(tok, str)
                                           for tok in vocab)
        return self._maybe_decode_bytes

    def indexes_containing_token(self, token: str):
        """Return all the possible indexes, whose decoding output may contain
        the input token.

        Args:
            token (str): the stop word to look up
        Returns:
            list[int]: ids of the matching tokens; an empty list when the
                token can not be used as a stop word
        """
        # traversing vocab is time consuming, can not be accelerated with
        # multi threads (computation) or multi process (can't pickle tokenizer)
        # so, we maintain latest 10 stop words and return directly if matched
        for cached_token, cached_indexes in self._indexes_tokens_deque:
            if token == cached_token:
                return cached_indexes

        if not self.token2id:
            # decode is slower than convert_ids_to_tokens
            if self.maybe_decode_bytes:
                self.token2id = {
                    self.model.decode(i): i
                    for i in range(self.vocab_size)
                }
            else:
                self.token2id = {
                    self.model.convert_ids_to_tokens(i): i
                    for i in range(self.vocab_size)
                }
        # ' ' is special: it is searched as '▁'. The cache entry keeps the
        # caller's spelling; caching the remapped '▁' meant a later lookup for
        # ' ' never hit and the vocabulary was scanned again on every call.
        piece = '▁' if token == ' ' else token
        indexes = [i for text, i in self.token2id.items() if piece in text]
        if len(indexes) > self.max_indexes_num:
            indexes = self.encode(piece, add_bos=False)[-1:]
            self.logger.warning(
                f'There are too many(>{self.max_indexes_num}) possible '
                f'indexes may decoding {piece}, we will use {indexes} only')
        # there might be token id that exceeds self.vocab_size
        if len(indexes) == 0:
            indexes = self.encode(piece, False)
            if len(indexes) != 1:
                self.logger.warning(
                    f'The token {piece}, its length of indexes {indexes} is '
                    'not 1. Currently, it can not be used as stop words')
                indexes = []
        self._indexes_tokens_deque.append((token, indexes))
        return indexes

    def encode(self, s: str, add_bos: bool = True, **kwargs):
        """Tokenize a prompt.

        Args:
            s (str): a prompt
            add_bos (bool): when False, a leading bos token id is removed
                from the result
        Returns:
            list[int]: token ids
        """
        encoded = self.model.encode(s, **kwargs)
        if not add_bos:
            # in the middle of a session
            if len(encoded) and encoded[0] == self.bos_token_id:
                encoded = encoded[1:]
        return encoded

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): for incrementally decoding. Default to None, which
                means not applied.
        Returns:
            str: text of decoding tokens
        """
        # Same conversion as SentencePieceTokenizer.decode: a tensor element
        # is never found in the set of prefix-space ids, so the prefix space
        # would silently be lost for tensor input.
        if isinstance(t, torch.Tensor):
            t = t.tolist()
        skip_special_tokens = True
        t = t[offset:]
        out_string = self.model.decode(t,
                                       skip_special_tokens=skip_special_tokens)
        if offset:
            out_string = self._maybe_add_prefix_space(t, out_string)
        return out_string

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts.

        Args:
            s (str): prompts
        Returns:
            the result of calling the underlying tokenizer with
            `add_special_tokens=False`
        """
        add_special_tokens = False
        return self.model(s, add_special_tokens=add_special_tokens)
+
+
class Tokenizer:
    """Facade that tokenizes prompts and de-tokenizes ids into text.

    It delegates every call to either a `SentencePieceTokenizer` or a
    `HuggingFaceTokenizer`, chosen once at construction time.

    Args:
        model_file (str): the path of the tokenizer model; a path that does
            not end with '.model' is treated as the folder holding it
    """

    def __init__(self, model_file: str):
        if not model_file.endswith('.model'):
            # A folder was given: look for the model file inside it.
            folder = model_file
            model_file = osp.join(folder, 'tokenizer.model')
        else:
            folder = osp.dirname(model_file)

        has_config = osp.exists(osp.join(folder, 'tokenizer_config.json'))
        has_model_file = osp.exists(model_file)
        self.logger = get_logger('lmdeploy')
        # The raw SentencePiece backend is used only when the model file is
        # present and no tokenizer_config.json sits next to it.
        if has_model_file and not has_config:
            self.model = SentencePieceTokenizer(model_file)
        else:
            self.model = HuggingFaceTokenizer(folder)

    @property
    def vocab_size(self):
        """Vocabulary size reported by the backend."""
        backend = self.model
        return backend.vocab_size

    @property
    def bos_token_id(self):
        """Begin-of-sentence token id reported by the backend."""
        backend = self.model
        return backend.bos_token_id

    @property
    def eos_token_id(self):
        """End-of-sentence token id reported by the backend."""
        backend = self.model
        return backend.eos_token_id

    def encode(self, s: str, add_bos: bool = True, **kwargs):
        """Tokenize a prompt with the backend tokenizer.

        Args:
            s (str): a prompt
            add_bos (bool): passed through to the backend `encode`
        Returns:
            list[int]: token ids
        """
        token_ids = self.model.encode(s, add_bos, **kwargs)
        return token_ids

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize with the backend tokenizer.

        Args:
            t (List[int]): a list of token ids
            offset (int): for incrementally decoding. Default to None, which
                means not applied.
        Returns:
            str: text of decoding tokens
        """
        text = self.model.decode(t, offset)
        return text

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts by calling the backend tokenizer.

        Args:
            s (str): prompts
        Returns:
            whatever the backend tokenizer returns for `s`
        """
        return self.model(s)

    def indexes_containing_token(self, token):
        """Return the indexes whose decoding output may contain `token`.

        A token that encodes to more than one id is rejected: a warning is
        logged and an empty list is returned.
        """
        token_ids = self.encode(token, add_bos=False)
        if len(token_ids) <= 1:
            return self.model.indexes_containing_token(token)
        self.logger.warning(
            f'The token {token}, its length of indexes {token_ids} is over '
            'than 1. Currently, it can not be used as stop words')
        return []
diff --git a/model_repository/postprocessing/1/tokenizer/tokenizer_config.json b/model_repository/postprocessing/1/tokenizer/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f133449013be570f08fdf7c70f1a2c8ccb4724da
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/tokenizer_config.json
@@ -0,0 +1,90 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92538": {
+ "content": "<|plugin|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92539": {
+ "content": "<|interpreter|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92540": {
+ "content": "<|action_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92541": {
+ "content": "<|action_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92542": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92543": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_internlm.InternLMTokenizer",
+ null
+ ]
+ },
+ "bos_token": "",
+ "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "tokenizer_class": "InternLMTokenizer",
+ "unk_token": ""
+}
diff --git a/model_repository/postprocessing/config.pbtxt b/model_repository/postprocessing/config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4c3fd1041dcd03dc5c18b3fc28533cb82ac5653
--- /dev/null
+++ b/model_repository/postprocessing/config.pbtxt
@@ -0,0 +1,36 @@
+name: "postprocessing"
+backend: "python"
+max_batch_size: 1
+input [
+ {
+ name: "TOKENS_BATCH"
+ data_type: TYPE_UINT32
+ dims: [ -1, -1 ]
+ },
+ {
+ name: "sequence_length"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ }
+]
+output [
+ {
+ name: "OUTPUT"
+ data_type: TYPE_STRING
+ dims: [ -1, -1 ]
+ }
+]
+
+instance_group [
+ {
+ count: 16
+ kind: KIND_CPU
+ }
+]
+
+parameters {
+ key: "tokenizer_path"
+ value: {
+ string_value: "tokenizer/tokenizer.model"
+ }
+}
diff --git a/model_repository/preprocessing/1/__pycache__/model.cpython-310.pyc b/model_repository/preprocessing/1/__pycache__/model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..447bea773ddcc3daff21ef636ce8437c6632fed8
Binary files /dev/null and b/model_repository/preprocessing/1/__pycache__/model.cpython-310.pyc differ
diff --git a/model_repository/preprocessing/1/model.py b/model_repository/preprocessing/1/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e659fbae01737bd0a83980faf0e1eff9e607c3f
--- /dev/null
+++ b/model_repository/preprocessing/1/model.py
@@ -0,0 +1,151 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+from pathlib import Path
+
+import numpy as np
+import torch
+import triton_python_backend_utils as pb_utils
+from torch.nn.utils.rnn import pad_sequence
+
+# This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served
+# by triton inference server, it has to be converted first by running
+# `python lmdeploy/serve/turbomind/deploy.py`. Then
+# `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py`
+from .tokenizer.tokenizer import Tokenizer
+
+
+class TritonPythonModel:
+    """Triton Python-backend model that tokenizes text prompts.
+
+    Every Python model that is created must have "TritonPythonModel" as the
+    class name.
+    """
+
+    def initialize(self, args):
+        """`initialize` is called only once when the model is being loaded.
+        Implementing `initialize` function is optional. This function allows
+        the model to initialize any state associated with this model.
+
+        Here it parses the model config, records the numpy dtype of each
+        output tensor and builds the tokenizer.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance device
+            ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+        # Parse model configs
+        self.model_config = model_config = json.loads(args['model_config'])
+
+        # Convert the Triton data type of every output tensor to a numpy
+        # dtype and keep it as `self.input_id_dtype` /
+        # `self.request_input_len_dtype` for use in `execute`.
+        output_names = ['INPUT_ID', 'REQUEST_INPUT_LEN']
+        for output_name in output_names:
+            setattr(
+                self,
+                output_name.lower() + '_dtype',
+                pb_utils.triton_string_to_numpy(
+                    pb_utils.get_output_config_by_name(
+                        model_config, output_name)['data_type']))
+
+        # The tokenizer path in the config is relative to this file's folder.
+        cur_folder = Path(__file__).parent
+        self.tokenizer = Tokenizer(
+            osp.join(
+                cur_folder, self.model_config['parameters']['tokenizer_path']
+                ['string_value']))
+        self.start_id = self.tokenizer.bos_token_id
+        self.end_id = self.tokenizer.eos_token_id
+
+    def execute(self, requests):
+        """`execute` must be implemented in every Python model. `execute`
+        function receives a list of pb_utils.InferenceRequest as the only
+        argument. This function is called when an inference is requested
+        for this model. Depending on the batching configuration (e.g. Dynamic
+        Batching) used, `requests` may contain multiple requests. Every
+        Python model, must create one pb_utils.InferenceResponse for every
+        pb_utils.InferenceRequest in `requests`. If there is an error, you can
+        set the error argument when creating a pb_utils.InferenceResponse.
+
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list must
+          be the same as `requests`
+        """
+
+        responses = []
+
+        # Every Python backend must iterate over every one of the requests
+        # and create a pb_utils.InferenceResponse for each of them.
+        for request in requests:
+            # Get input tensors
+            query = pb_utils.get_input_tensor_by_name(request,
+                                                      'QUERY').as_numpy()
+
+            # Preprocessing input data.
+            input_id, request_input_len = self._create_request(query)
+
+            # Create output tensors. You need pb_utils.Tensor
+            # objects to create pb_utils.InferenceResponse.
+            input_id_tensor = pb_utils.Tensor(
+                'INPUT_ID',
+                np.array(input_id).astype(self.input_id_dtype))
+            request_input_len_tensor = pb_utils.Tensor(
+                'REQUEST_INPUT_LEN',
+                np.array(request_input_len).astype(
+                    self.request_input_len_dtype))
+
+            # Create InferenceResponse. You can set an error here in case
+            # there was a problem with handling this inference request.
+            # Below is an example of how you can set errors in inference
+            # response:
+            #
+            # pb_utils.InferenceResponse(
+            #    output_tensors=..., TritonError("An error occurred"))
+            inference_response = pb_utils.InferenceResponse(
+                output_tensors=[input_id_tensor, request_input_len_tensor])
+            responses.append(inference_response)
+
+        # You should return a list of pb_utils.InferenceResponse. Length
+        # of this list must match the length of `requests` list.
+        return responses
+
+    def finalize(self):
+        """`finalize` is called only once when the model is being unloaded.
+
+        Implementing `finalize` function is optional. This function allows the
+        model to perform any necessary clean ups before exit.
+        """
+        print('Cleaning up...')
+
+    def _create_request(self, query):
+        """Tokenize prompts and return the padded token ids and their lengths.
+
+        Args:
+            query: an iterable of rows; the first element of each row is an
+                encoded prompt on which `.decode()` is called
+        Returns:
+            tuple: an int tensor of token ids, right-padded to a common
+                length with `self.end_id`, and an int tensor holding one
+                `[length]` row per prompt
+        """
+        start_ids = []
+        for s in query:
+            _s = s[0].decode()
+            # '<BOS>' and '<EOS>' are sentinel prompts: they map straight to
+            # the BOS / EOS id (or -1 when the tokenizer has none) instead
+            # of being tokenized as text.
+            if _s == '<BOS>':
+                start_id = [self.start_id
+                            ] if self.start_id is not None else [-1]
+            elif _s == '<EOS>':
+                start_id = [self.end_id] if self.end_id is not None else [-1]
+            else:
+                start_id = self.tokenizer.encode(_s)
+            start_ids.append(torch.IntTensor(start_id))
+
+        # Lengths are taken before padding so they describe the real prompts.
+        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
+        start_ids = pad_sequence(start_ids,
+                                 batch_first=True,
+                                 padding_value=self.end_id)
+        return start_ids, start_lengths
diff --git a/model_repository/preprocessing/1/tokenizer/config.json b/model_repository/preprocessing/1/tokenizer/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..84235b8a1a9618cc0ac265caf61ea4088780e3b1
--- /dev/null
+++ b/model_repository/preprocessing/1/tokenizer/config.json
@@ -0,0 +1,37 @@
+{
+ "_name_or_path": "/root/psy/internlm2-7b/work_dirs/internlm2_chat_7b_qlora_oasst1_512_e3_copy/hf_2/merge",
+ "architectures": [
+ "InternLM2ForCausalLM"
+ ],
+ "attn_implementation": "eager",
+ "auto_map": {
+ "AutoConfig": "configuration_internlm.InternLMConfig",
+ "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
+ "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
+ },
+ "bias": false,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "fp16": true,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 32768,
+ "model_type": "internlm",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 8,
+ "pad_token_id": 2,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 2.0,
+ "type": "dynamic"
+ },
+ "rope_theta": 1000000,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.37.2",
+ "use_cache": false,
+ "vocab_size": 92544
+}
diff --git a/model_repository/preprocessing/1/tokenizer/configuration_internlm.py b/model_repository/preprocessing/1/tokenizer/configuration_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d013582feaa1f9970a4256c4a0f77000fa645de
--- /dev/null
+++ b/model_repository/preprocessing/1/tokenizer/configuration_internlm.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" InternLM model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class InternLMConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
+    an InternLM model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 103168):
+            Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`InternLMModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id, forwarded to [`PretrainedConfig`].
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning-of-sequence token id, forwarded to [`PretrainedConfig`].
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End-of-sequence token id, forwarded to [`PretrainedConfig`].
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        bias (`bool`, *optional*, defaults to `True`):
+            Stored as `self.bias`; presumably toggles bias terms in the model's linear projections (confirm against
+            the modeling code that consumes this config).
+        rope_theta (`int`, *optional*, defaults to 10000):
+            Stored as `self.rope_theta`; the base value used by the rotary position embedding.
+        rope_scaling (`dict`, *optional*):
+            Either `None` or a dictionary with exactly two fields: `type` (one of `"linear"` or `"dynamic"`) and
+            `factor` (a `float` >= 1). Any other value raises `ValueError`.
+        attn_implementation (`str`, *optional*, defaults to `"eager"`):
+            Stored as `self.attn_implementation`; `None` is replaced by `"eager"`.
+        Example:
+
+    ```python
+    >>> from transformers import InternLMModel, InternLMConfig
+
+    >>> # Initializing a InternLM internlm-7b style configuration
+    >>> configuration = InternLMConfig()
+
+    >>> # Initializing a model from the internlm-7b style configuration
+    >>> model = InternLMModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "internlm"
+    _auto_class = "AutoConfig"
+
+    def __init__(  # pylint: disable=W0102
+        self,
+        vocab_size=103168,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        bias=True,
+        rope_theta=10000,
+        rope_scaling=None,
+        attn_implementation="eager",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.bias = bias
+
+        # Without an explicit KV-head count, fall back to one KV head per
+        # attention head.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Fail fast on a malformed `rope_scaling` before the base class runs.
+        self._rope_scaling_validation()
+
+        self.attn_implementation = attn_implementation
+        if self.attn_implementation is None:
+            self.attn_implementation = "eager"
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+
+        `None` is accepted as-is. Otherwise `rope_scaling` must be a dict with exactly two entries, a `type` of
+        `"linear"` or `"dynamic"` and a `float` `factor` >= 1.0; anything else raises `ValueError`.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
diff --git a/model_repository/preprocessing/1/tokenizer/generation_config.json b/model_repository/preprocessing/1/tokenizer/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc5efeadd3bf2caa4462a3be79d580690f410668
--- /dev/null
+++ b/model_repository/preprocessing/1/tokenizer/generation_config.json
@@ -0,0 +1,7 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 2,
+ "transformers_version": "4.37.2"
+}
diff --git a/model_repository/preprocessing/1/tokenizer/modeling_internlm2.py b/model_repository/preprocessing/1/tokenizer/modeling_internlm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..39d6f71d2933385988ec05f845d3f6386c97f74b
--- /dev/null
+++ b/model_repository/preprocessing/1/tokenizer/modeling_internlm2.py
@@ -0,0 +1,1385 @@
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/modeling_llama.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch InternLM2 model."""
+import math
+import queue
+import threading
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from einops import rearrange
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+ SequenceClassifierOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+
+try:
+ from transformers.generation.streamers import BaseStreamer
+except: # noqa # pylint: disable=bare-except
+ BaseStreamer = None
+
+from .configuration_internlm import InternLMConfig as InternLM2Config
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "InternLM2Config"
+
+# Module-level placeholders; they stay `None` until `_import_flash_attn()`
+# rebinds them to the real `flash_attn` callables.
+flash_attn_func, flash_attn_varlen_func = None, None
+pad_input, index_first_axis, unpad_input = None, None, None
+def _import_flash_attn():
+ """Import `flash_attn` on demand and publish its functions as module globals.
+
+ On success `flash_attn_func`, `flash_attn_varlen_func`, `pad_input`,
+ `index_first_axis` and `unpad_input` are rebound at module level.
+
+ Raises:
+ ImportError: if `flash_attn` (or `flash_attn.bert_padding`) cannot be imported.
+ """
+ global flash_attn_func, flash_attn_varlen_func
+ global pad_input, index_first_axis, unpad_input
+ try:
+ from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func
+ from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input
+ flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
+ pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
+ except ImportError:
+ raise ImportError("flash_attn is not installed.")
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    """Summarize which positions of a padded batch are real tokens.
+
+    Args:
+        attention_mask: tensor whose last dimension is the sequence axis and
+            whose non-zero entries mark real tokens (assumed to be 2-D,
+            `(batch, seq_len)` -- confirm against callers).
+    Returns:
+        tuple: `(indices, cu_seqlens, max_seqlen_in_batch)` where `indices`
+            are the flat positions of the non-zero mask entries,
+            `cu_seqlens` is the int32 cumulative sum of per-row token counts
+            with a leading 0, and `max_seqlen_in_batch` is the largest
+            per-row count as a Python int.
+    """
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    # Prepend a 0 so that cu_seqlens[i]:cu_seqlens[i + 1] spans sequence i.
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make the additive causal mask used for uni-directional (causal) self-attention.
+
+ Entries are 0 where a query position may attend (key index <= query index, plus every
+ past-key column) and `torch.finfo(dtype).min` elsewhere. The result is expanded to
+ `(bsz, 1, tgt_len, tgt_len + past_key_values_length)`.
+ """
+ bsz, tgt_len = input_ids_shape
+ # Start fully masked, then zero out the lower triangle including the diagonal.
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ # Cached (past) positions are always visible: prepend zero columns.
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+
+ The result is additive: positions where `mask` is 1 become 0 and positions where `mask` is 0
+ become `torch.finfo(dtype).min`. `tgt_len` defaults to the source length.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ # Invert so that non-zero entries mark the positions to be masked out.
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2
+class InternLM2RMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ InternLM2RMSNorm is equivalent to T5LayerNorm
+
+ `hidden_size` sets the length of the learnable scale `weight` (initialized to ones);
+ `eps` is added to the variance before the reciprocal square root.
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ # Normalize in float32, then cast back to the input dtype before scaling.
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ # Mean of squares over the last dimension (no mean subtraction).
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2
+class InternLM2RotaryEmbedding(nn.Module):
+ # Rotary position embedding that keeps cos/sin tables in the non-persistent
+ # buffers `cos_cached` / `sin_cached` and rebuilds them when a longer
+ # sequence is requested.
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ # One inverse frequency per pair of dimensions: base ** (-2i / dim).
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ # (Re)build the cos/sin tables for positions 0..seq_len-1.
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ # Outer product of positions and inverse frequencies.
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ # Only `x.device` and `x.dtype` are read from `x`.
+ # NOTE(review): `seq_len` defaults to None, but the comparison below
+ # raises TypeError for None, so callers must pass it explicitly.
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32)
+
+ return (
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
+ )
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2
+class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
+ """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev
+
+ Position indices are divided by `scaling_factor` before the cos/sin tables are built.
+ """
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ # Set before super().__init__, which calls `_set_cos_sin_cache` and reads it.
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ # Linear scaling: compress the position indices by the scaling factor.
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2
+class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
+ """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
+ Credits to the Reddit users /u/bloc97 and /u/emozilla.
+
+ When the requested length exceeds `max_position_embeddings`, the base is enlarged and
+ `inv_freq` is recomputed before the cos/sin tables are rebuilt.
+ """
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ # Set before super().__init__, which calls `_set_cos_sin_cache` and reads it.
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
+ if seq_len > self.max_position_embeddings:
+ # Rescale the base as a function of how far seq_len exceeds the
+ # configured maximum, then replace the `inv_freq` buffer.
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+# Copied from transformers.model.llama.modeling_llama.rotate_half
+def rotate_half(x):
+ """Rotates half the hidden dims of the input.
+
+ Splits the last dimension into halves `(x1, x2)` and returns `(-x2, x1)` concatenated.
+ """
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ `cos` / `sin` are indexed by `position_ids` and get a new axis at `unsqueeze_dim` so
+ they broadcast against `q` and `k`. Returns the rotated `(q_embed, k_embed)`.
+ """
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class InternLM2MLP(nn.Module):
+ # Gated feed-forward block: w1 and w3 project hidden_size -> intermediate_size,
+ # w2 projects back; all three linear layers are bias-free.
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ # Activation is looked up by name from `config.hidden_act`.
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ # act(w1(x)) gates w3(x) element-wise before the down projection w2.
+ down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x))
+
+ return down_proj
+
+
+# Copied from transformers.model.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ # Nothing to repeat: return the input tensor itself.
+ if n_rep == 1:
+ return hidden_states
+ # Insert a repeat axis after the head axis, expand it, then fold it into the head axis.
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Modified from transformers.model.llama.modeling_llama.LlamaAttention
+class InternLM2Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper
+
+ Queries, keys and values come from one fused projection (`wqkv`); keys and values use
+ `num_key_value_heads` heads and are repeated to match the query heads; rotary position
+ embedding is applied to queries and keys.
+ """
+
+ def __init__(self, config: InternLM2Config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ # Number of query heads that share one key/value head.
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.is_causal = True
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+
+ # Fused projection: all query heads plus one key and one value per KV head.
+ self.wqkv = nn.Linear(
+ self.hidden_size,
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
+ bias=config.bias,
+ )
+
+ self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
+ self._init_rope()
+
+ def _init_rope(self):
+ # Choose the rotary embedding variant from `config.rope_scaling`
+ # (None, or a dict with "type" and "factor") and store it on `self.rotary_emb`.
+ if self.config.rope_scaling is None:
+ self.rotary_emb = InternLM2RotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "dynamic":
+ self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ scaling_factor=scaling_factor,
+ )
+ elif scaling_type == "linear":
+ self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ scaling_factor=scaling_factor,
+ )
+ else:
+ raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.")
+ return self.rotary_emb
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ # Reshape to (bsz, num_heads, seq_len, head_dim); not called within this class.
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # Returns (attn_output, attn_weights or None, past_key_value or None).
+ # `attention_mask`, when given, must be additive with shape
+ # (bsz, 1, q_len, kv_seq_len); `past_key_value` is a (key, value) pair.
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.wqkv(hidden_states)
+
+ # Split the fused projection into groups: each group holds
+ # `num_key_value_groups` query slots followed by one key and one value slot.
+ qkv_states = rearrange(
+ qkv_states,
+ "b q (h gs d) -> b q h gs d",
+ gs=2 + self.num_key_value_groups,
+ d=self.head_dim,
+ )
+
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
+ query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
+ key_states = qkv_states[..., -2, :]
+ value_states = qkv_states[..., -1, :]
+
+ # Move the head axis ahead of the sequence axis.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ # Total key length includes any cached positions.
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ # The cache stores keys/values before they are repeated across query heads.
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ # Scaled dot-product scores.
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ # Merge the heads back into the hidden dimension before the output projection.
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ attn_output = self.wo(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaFlashAttention2
+class InternLM2FlashAttention2(InternLM2Attention):
+ """
+ InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any of them.
+ """
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Flash-attention forward pass; same projection / rotary / cache steps as the eager path.
+
+ `output_attentions` is overridden to `False`, so the second element of the returned tuple
+ is always `None`. `attention_mask` is handed to `_flash_attention_forward` unchanged.
+
+ Returns:
+ `(attn_output, None, past_key_value or None)`.
+ """
+ # InternLM2FlashAttention2 attention does not support output_attentions
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ # overwrite attention_mask with padding_mask
+ attention_mask = kwargs.pop("padding_mask")
+
+ output_attentions = False
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.wqkv(hidden_states)
+
+ # Split the fused projection's last axis into (h, gs, d), with gs = num_key_value_groups + 2
+ # and d = head_dim.
+ qkv_states = rearrange(
+ qkv_states,
+ "b q (h gs d) -> b q h gs d",
+ gs=2 + self.num_key_value_groups,
+ d=self.head_dim,
+ )
+
+ # Along the gs axis: queries first, then the key (second-to-last) and the value (last).
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
+ query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
+ key_states = qkv_states[..., -2, :]
+ value_states = qkv_states[..., -1, :]
+
+ # Swap dims 1 and 2 so the head axis precedes the sequence axis for rotary / cache handling.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ # Transpose back so the sequence axis precedes the head axis again; this is the layout
+ # `_unpad_input` unpacks (batch, seq, heads, head_dim).
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ attn_output = self._flash_attention_forward(
+ query_states, key_states, value_states, attention_mask, q_len
+ )
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+ attn_output = self.wo(attn_output)
+
+ # output_attentions was forced to False above, so attn_weights is always None here.
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+ def _flash_attention_forward(
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+ ):
+ """
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+ first unpad the input, then computes the attention scores and pad the final attention scores.
+
+ Args:
+ query_states (`torch.Tensor`):
+ Input query states to be passed to Flash Attention API
+ key_states (`torch.Tensor`):
+ Input key states to be passed to Flash Attention API
+ value_states (`torch.Tensor`):
+ Input value states to be passed to Flash Attention API
+ attention_mask (`torch.Tensor`):
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+ position of padding tokens and 1 for the position of non-padding tokens.
+ query_length (`int`):
+ Length of the query sequence; causal masking is disabled when it equals 1, and it is the
+ sequence length the unpadded output is padded back to.
+ dropout (`float`, *optional*):
+ Attention dropout
+ softmax_scale (`float`, *optional*):
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+
+ Returns:
+ The attention output returned by `pad_input` (mask given) or by `flash_attn_func` (no mask).
+ """
+ # Causal masking is requested only when the module is causal and more than one query token is present.
+ causal = self.is_causal and query_length != 1
+ # Contains at least one padding token in the sequence
+ if attention_mask is not None:
+ batch_size = query_states.shape[0]
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
+ query_states, key_states, value_states, attention_mask, query_length
+ )
+
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states,
+ key_states,
+ value_states,
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_k=cu_seqlens_k,
+ max_seqlen_q=max_seqlen_in_batch_q,
+ max_seqlen_k=max_seqlen_in_batch_k,
+ dropout_p=dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ )
+
+ # Scatter the unpadded rows back into a (batch_size, query_length, ...) tensor.
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+ else:
+ attn_output = flash_attn_func(
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+ )
+
+ return attn_output
+
+ def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+ """Drop padded positions from the query/key/value tensors ahead of the variable-length kernel.
+
+ Key and value rows are selected with the indices `_get_unpad_data(attention_mask)` returns.
+ The query is handled in one of three ways: with the same indices when
+ `query_length == kv_seq_len`, as one token per batch entry when `query_length == 1`, and
+ otherwise via `unpad_input` on the last `query_length` mask columns.
+
+ Returns:
+ `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`, with `indices_q` cast to int64.
+ """
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+ # Flatten (batch, seq) into one axis, then keep only the rows listed in indices_k.
+ key_layer = index_first_axis(
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+ value_layer = index_first_axis(
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+
+ if query_length == kv_seq_len:
+ # Query and key cover the same positions, so the key indices and lengths are reused.
+ query_layer = index_first_axis(
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+ )
+ cu_seqlens_q = cu_seqlens_k
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
+ indices_q = indices_k
+ elif query_length == 1:
+ # One query token per batch entry: cumulative lengths are simply 0, 1, ..., batch_size.
+ max_seqlen_in_batch_q = 1
+ cu_seqlens_q = torch.arange(
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
+ ) # There is a memcpy here, that is very bad.
+ indices_q = cu_seqlens_q[:-1]
+ query_layer = query_layer.squeeze(1)
+ else:
+ # The -q_len: slice assumes left padding.
+ attention_mask = attention_mask[:, -query_length:]
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+ return (
+ query_layer,
+ key_layer,
+ value_layer,
+ indices_q.to(torch.int64),
+ (cu_seqlens_q, cu_seqlens_k),
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+ )
+
+# Maps a `config.attn_implementation` value to the attention module class to instantiate;
+# `InternLM2DecoderLayer.__init__` indexes this dict, so an unknown value raises KeyError there.
+INTERNLM2_ATTENTION_CLASSES = {
+ "eager": InternLM2Attention,
+ "flash_attention_2": InternLM2FlashAttention2,
+}
+
+# Modified from transformers.models.llama.modeling_llama.LlamaDecoderLayer
+class InternLM2DecoderLayer(nn.Module):
+ """One decoder layer: normalise -> attention -> residual add, then normalise -> feed-forward -> residual add."""
+
+ def __init__(self, config: InternLM2Config):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+
+ # The attention class is picked by `config.attn_implementation` ("eager" or "flash_attention_2").
+ self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config)
+
+ self.feed_forward = InternLM2MLP(config)
+ self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ **kwargs,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*):
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+ query_sequence_length, key_sequence_length)` if default attention is used.
+ position_ids (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ kwargs: forwarded unchanged to the attention module; a `padding_mask` entry triggers a
+ deprecation warning here.
+
+ Returns:
+ A tuple starting with the new hidden states, followed by the attention weights when
+ `output_attentions` is truthy and by the present key/value when `use_cache` is truthy.
+ """
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ residual = hidden_states
+
+ hidden_states = self.attention_norm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.attention(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ **kwargs,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.ffn_norm(hidden_states)
+ hidden_states = self.feed_forward(hidden_states)
+ hidden_states = residual + hidden_states
+
+ # The tuple grows in a fixed order, so the cache sits at index 2 when attentions are also
+ # returned and at index 1 otherwise.
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+# Shared class-level documentation preamble; it is passed to `add_start_docstrings` on the
+# model classes defined below.
+InternLM2_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`InternLM2Config`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2
+@add_start_docstrings(
+ "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2PreTrainedModel(PreTrainedModel):
+ # Class-level settings read by the `PreTrainedModel` base class; their exact semantics are
+ # defined by transformers, not by this file.
+ config_class = InternLM2Config
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["InternLM2DecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ """Initialise `module` in place.
+
+ `nn.Linear` and `nn.Embedding` weights are drawn from a normal distribution with mean 0 and
+ standard deviation `config.initializer_range`; a linear bias is zeroed and the embedding row
+ at `padding_idx` is zeroed. Any other module type is left untouched.
+ """
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+
+# Shared argument documentation; it is attached to the models' `forward` methods through
+# `add_start_docstrings_to_model_forward`.
+# NOTE(review): the two "head is masked" bullets at the end of the `attention_mask` entry look
+# like leftovers from a `head_mask` entry - confirm before relying on them.
+InternLM2_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
+ when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaModel
+@add_start_docstrings(
+ "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2Model(InternLM2PreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLM2DecoderLayer`]
+
+ Args:
+ config: InternLM2Config
+ """
+
+ _auto_class = "AutoModel"
+
+ def __init__(self, config: InternLM2Config):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.config = config
+
+ # Token embedding table; the third positional argument of nn.Embedding is padding_idx.
+ self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+
+ self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ """Return the token embedding module."""
+ return self.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ """Replace the token embedding module with `value`."""
+ self.tok_embeddings = value
+
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ """Build the mask handed to the eager attention layers.
+
+ A causal mask is created (via `_make_causal_mask`) only when more than one input position
+ is present; a given `attention_mask` is expanded (via `_expand_mask`) and added to it.
+ Returns the combined mask, or `None` when neither part was produced.
+ """
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ # Flags left as None fall back to the values stored on the config.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # `_import_flash_attn` is defined elsewhere in this file and is invoked on every forward
+ # call when the flash implementation is selected.
+ if self.config.attn_implementation == "flash_attention_2":
+ _import_flash_attn()
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape[:2]
+ elif inputs_embeds is not None:
+ batch_size, seq_length = inputs_embeds.shape[:2]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ # The cached length is read from dim 2 of the first layer's cached key.
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ # Default positions continue from the end of the cache and carry a leading axis of size 1.
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.tok_embeddings(input_ids)
+
+ if self.config.attn_implementation == "flash_attention_2":
+ # 2d mask is passed through the layers
+ # The mask is dropped (None) unless it actually contains a 0, i.e. a padded position.
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+ else:
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ # embed positions
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ # Hidden states are recorded before each layer runs; the final, normalised state is
+ # appended after the loop.
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # None for past_key_value
+ # NOTE(review): with InternLM2DecoderLayer.forward's parameter order the trailing
+ # None below binds to `use_cache`; `past_key_value` receives the None that is
+ # passed to checkpoint() as the fourth input.
+ return module(*inputs, output_attentions, None)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ None,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ # The layer returns (hidden, [attn], [cache]); the cache index shifts by one when
+ # attention weights are included.
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ # Tuple output drops every entry that is None, so positions depend on the requested flags.
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaForCausalLM
+class InternLM2ForCausalLM(InternLM2PreTrainedModel):
+ _auto_class = "AutoModelForCausalLM"
+
+ _tied_weights_keys = ["output.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = InternLM2Model(config)
+ self.vocab_size = config.vocab_size
+ self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ self.model.tok_embeddings = value
+
+ def get_output_embeddings(self):
+ return self.output
+
+ def set_output_embeddings(self, new_embeddings):
+ self.output = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, InternLM2ForCausalLM
+
+ >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ logits = self.output(hidden_states)
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values is not None:
+ past_length = past_key_values[0][0].shape[2]
+
+ # Some generation methods already pass only the last input ID
+ if input_ids.shape[1] > past_length:
+ remove_prefix_length = past_length
+ else:
+ # Default to old behavior: keep only final ID
+ remove_prefix_length = input_ids.shape[1] - 1
+
+ input_ids = input_ids[:, remove_prefix_length:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -input_ids.shape[1] :]
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+ def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
+ prompt = ""
+ if meta_instruction:
+ prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
+ else:
+ prompt += ""
+ for record in history:
+ prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
+ prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
+ return tokenizer([prompt], return_tensors="pt")
+
+ @torch.no_grad()
+ def chat(
+ self,
+ tokenizer,
+ query: str,
+ history: List[Tuple[str, str]] = [],
+ streamer: Optional[BaseStreamer] = None,
+ max_new_tokens: int = 1024,
+ do_sample: bool = True,
+ temperature: float = 0.8,
+ top_p: float = 0.8,
+ meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+ "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+ "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.",
+ **kwargs,
+ ):
+ inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
+ inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
+ # also add end-of-assistant token in eos token id to avoid unnecessary generation
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
+ outputs = self.generate(
+ **inputs,
+ streamer=streamer,
+ max_new_tokens=max_new_tokens,
+ do_sample=do_sample,
+ temperature=temperature,
+ top_p=top_p,
+ eos_token_id=eos_token_id,
+ **kwargs,
+ )
+ outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
+ response = tokenizer.decode(outputs, skip_special_tokens=True)
+ response = response.split("<|im_end|>")[0]
+ history = history + [(query, response)]
+ return response, history
+
+ @torch.no_grad()
+ def stream_chat(
+ self,
+ tokenizer,
+ query: str,
+ history: List[Tuple[str, str]] = [],
+ max_new_tokens: int = 1024,
+ do_sample: bool = True,
+ temperature: float = 0.8,
+ top_p: float = 0.8,
+ **kwargs,
+ ):
+ """
+ Return a generator in format: (response, history)
+ Eg.
+ ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
+ ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
+ """
+ if BaseStreamer is None:
+ raise ModuleNotFoundError(
+ "The version of `transformers` is too low. Please make sure "
+ "that you have installed `transformers>=4.28.0`."
+ )
+
+ response_queue = queue.Queue(maxsize=20)
+
+ class ChatStreamer(BaseStreamer):
+ def __init__(self, tokenizer) -> None:
+ super().__init__()
+ self.tokenizer = tokenizer
+ self.queue = response_queue
+ self.query = query
+ self.history = history
+ self.response = ""
+ self.received_inputs = False
+ self.queue.put((self.response, history + [(self.query, self.response)]))
+
+ def put(self, value):
+ if len(value.shape) > 1 and value.shape[0] > 1:
+ raise ValueError("ChatStreamer only supports batch size 1")
+ elif len(value.shape) > 1:
+ value = value[0]
+
+ if not self.received_inputs:
+ # The first received value is input_ids, ignore here
+ self.received_inputs = True
+ return
+
+ token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
+ if token.strip() != "<|im_end|>":
+ self.response = self.response + token
+ history = self.history + [(self.query, self.response)]
+ self.queue.put((self.response, history))
+
+ def end(self):
+ self.queue.put(None)
+
+ def stream_producer():
+ return self.chat(
+ tokenizer=tokenizer,
+ query=query,
+ streamer=ChatStreamer(tokenizer=tokenizer),
+ history=history,
+ max_new_tokens=max_new_tokens,
+ do_sample=do_sample,
+ temperature=temperature,
+ top_p=top_p,
+ **kwargs,
+ )
+
+ def consumer():
+ producer = threading.Thread(target=stream_producer)
+ producer.start()
+ while True:
+ res = response_queue.get()
+ if res is None:
+ return
+ yield res
+
+ return consumer()
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
+@add_start_docstrings(
+ """
+ The InternLM2 Model transformer with a sequence classification head on top (linear layer).
+
+ [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification,
+ as other causal models (e.g. GPT-2) do.
+
+ Since it does classification on the last token, it requires to know the position of the last token. If a
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+ each row of the batch).
+ """,
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2ForSequenceClassification(InternLM2PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = InternLM2Model(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ self.model.tok_embeddings = value
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to(
+ logits.device
+ )
+ else:
+ sequence_lengths = -1
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
diff --git a/model_repository/preprocessing/1/tokenizer/placeholder b/model_repository/preprocessing/1/tokenizer/placeholder
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/model_repository/preprocessing/1/tokenizer/pytorch_model.bin.index.json b/model_repository/preprocessing/1/tokenizer/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..7d95cf180df4c423e817c55f30f5ce93ac80e220
--- /dev/null
+++ b/model_repository/preprocessing/1/tokenizer/pytorch_model.bin.index.json
@@ -0,0 +1,554 @@
+{
+ "metadata": {
+ "total_size": 5251801088
+ },
+ "weight_map": {
+ "model.layers.0.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.2.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.20.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.3.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.30.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.4.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.tok_embeddings.weight": "pytorch_model-00001-of-00003.bin",
+ "output.weight": "pytorch_model-00003-of-00003.bin"
+ }
+}
diff --git a/model_repository/preprocessing/1/tokenizer/special_tokens_map.json b/model_repository/preprocessing/1/tokenizer/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..492d4b2966a1763442d426d880dbc29f94906e4c
--- /dev/null
+++ b/model_repository/preprocessing/1/tokenizer/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/model_repository/preprocessing/1/tokenizer/tokenization_internlm.py b/model_repository/preprocessing/1/tokenizer/tokenization_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9792349c7fed6fc64476eabdd9dad7a84640c3ee
--- /dev/null
+++ b/model_repository/preprocessing/1/tokenizer/tokenization_internlm.py
@@ -0,0 +1,240 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization classes for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+
+PRETRAINED_VOCAB_FILES_MAP = {}
+
+
+class InternLMTokenizer(PreTrainedTokenizer):
+ """
+ Construct an InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ model_input_names = ["input_ids", "attention_mask"]
+ _auto_class = "AutoTokenizer"
+
+ def __init__(
+ self,
+ vocab_file,
+ unk_token="<unk>",
+ bos_token="<s>",
+ eos_token="</s>",
+ pad_token="</s>",
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ add_bos_token=True,
+ add_eos_token=False,
+ decode_with_prefix_space=False,
+ clean_up_tokenization_spaces=False,
+ **kwargs,
+ ):
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.decode_with_prefix_space = decode_with_prefix_space
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+ self._no_prefix_space_tokens = None
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ **kwargs,
+ )
+
+ """ Initialization"""
+
+ @property
+ def no_prefix_space_tokens(self):
+ if self._no_prefix_space_tokens is None:
+ vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+ self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
+ return self._no_prefix_space_tokens
+
+ @property
+ def vocab_size(self):
+ """Returns vocab size"""
+ return self.sp_model.get_piece_size()
+
+ @property
+ def bos_token_id(self) -> Optional[int]:
+ return self.sp_model.bos_id()
+
+ @property
+ def eos_token_id(self) -> Optional[int]:
+ return self.sp_model.eos_id()
+
+ def get_vocab(self):
+ """Returns vocab as a dict"""
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def _tokenize(self, text):
+ """Returns a tokenized string."""
+ return self.sp_model.encode(text, out_type=str)
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) into an id using the vocab."""
+ return self.sp_model.piece_to_id(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) into a token (str) using the vocab."""
+ token = self.sp_model.IdToPiece(index)
+ return token
+
+ def _maybe_add_prefix_space(self, tokens, decoded):
+ if tokens and tokens[0] not in self.no_prefix_space_tokens:
+ return " " + decoded
+ else:
+ return decoded
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) into a single string."""
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for token in tokens:
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special:
+ out_string += " "
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.sp_model.decode(current_sub_tokens)
+ out_string = self.clean_up_tokenization(out_string)
+ out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+ return out_string[1:]
+
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ """
+ Save the vocabulary and special tokens file to a directory.
+
+ Args:
+ save_directory (`str`):
+ The directory in which to save the vocabulary.
+
+ Returns:
+ `Tuple(str)`: Paths to the files saved.
+ """
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ if self.add_bos_token:
+ bos_token_ids = [self.bos_token_id]
+ else:
+ bos_token_ids = []
+
+ output = bos_token_ids + token_ids_0
+
+ if token_ids_1 is not None:
+ output = output + token_ids_1
+
+ if self.add_eos_token:
+ output = output + [self.eos_token_id]
+
+ return output
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is None:
+ return [1] + ([0] * len(token_ids_0)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. This tokenizer does
+ not make use of token type ids, therefore a list of zeros is returned.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of zeros.
+ """
+ eos = [self.eos_token_id]
+
+ if token_ids_1 is None:
+ return len(token_ids_0 + eos) * [0]
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
diff --git a/model_repository/preprocessing/1/tokenizer/tokenizer.model b/model_repository/preprocessing/1/tokenizer/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408
--- /dev/null
+++ b/model_repository/preprocessing/1/tokenizer/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
+size 1477754
diff --git a/model_repository/preprocessing/1/tokenizer/tokenizer.py b/model_repository/preprocessing/1/tokenizer/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..db936a5501cb07d33d56083656dbd734ba7431bf
--- /dev/null
+++ b/model_repository/preprocessing/1/tokenizer/tokenizer.py
@@ -0,0 +1,400 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os
+import os.path as osp
+from collections import deque
+from typing import List, Optional, Sequence, Union
+
+import torch
+
+from lmdeploy.utils import get_logger
+
+# this file will be copied to the triton server, so make sure all
+# imports start from the package root lmdeploy
+
+
+class SentencePieceTokenizer:
+ """Tokenizer of sentencepiece.
+
+ Args:
+ model_file (str): the path of the tokenizer model
+ """
+
+ def __init__(self, model_file: str):
+ from sentencepiece import SentencePieceProcessor
+ self.model = SentencePieceProcessor(model_file=model_file)
+ self._prefix_space_tokens = None
+ # for stop words
+ self._maybe_decode_bytes: bool = None
+ # TODO maybe lack a constant.py
+ self._indexes_tokens_deque = deque(maxlen=10)
+ self.max_indexes_num = 5
+ self.logger = get_logger('lmdeploy')
+
+ @property
+ def vocab_size(self):
+ """vocabulary size."""
+ return self.model.vocab_size()
+
+ @property
+ def bos_token_id(self):
+ """beginning of the sentence token id."""
+ return self.model.bos_id()
+
+ @property
+ def eos_token_id(self):
+ """end of the sentence token id."""
+ return self.model.eos_id()
+
+ @property
+ def prefix_space_tokens(self):
+ """ids of tokens that start with the prefix-space marker '▁'."""
+ if self._prefix_space_tokens is None:
+ vocab = self.model.IdToPiece(list(range(self.vocab_size)))
+ self._prefix_space_tokens = {
+ i
+ for i, tok in enumerate(vocab) if tok.startswith('▁')
+ }
+ return self._prefix_space_tokens
+
+ def _maybe_add_prefix_space(self, tokens, decoded):
+ """maybe add prefix space for incremental decoding."""
+ if len(tokens) and not decoded.startswith(' ') and\
+ tokens[0] in self.prefix_space_tokens:
+ return ' ' + decoded
+ else:
+ return decoded
+
+ def indexes_containing_token(self, token: str):
+ """Return all the possible indexes, whose decoding output may contain
+ the input token."""
+ # traversing vocab is time consuming, can not be accelerated with
+ # multi threads (computation) or multi process (can't pickle tokenizer)
+ # so, we maintain latest 10 stop words and return directly if matched
+ for _token, _indexes in self._indexes_tokens_deque:
+ if token == _token:
+ return _indexes
+ if token == ' ': # ' ' is special
+ token = '▁'
+ vocab = self.model.IdToPiece(list(range(self.vocab_size)))
+ indexes = [i for i, voc in enumerate(vocab) if token in voc]
+ if len(indexes) > self.max_indexes_num:
+ indexes = self.encode(token, add_bos=False)[-1:]
+ self.logger.warning(
+ f'There are too many(>{self.max_indexes_num}) possible '
+ f'indexes may decoding {token}, we will use {indexes} only')
+ self._indexes_tokens_deque.append((token, indexes))
+ return indexes
+
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
+ """Tokenize a prompt.
+
+ Args:
+ s (str): a prompt
+ Returns:
+ list[int]: token ids
+ """
+ return self.model.Encode(s, add_bos=add_bos, **kwargs)
+
+ def decode(self, t: Sequence[int], offset: Optional[int] = None):
+ """De-tokenize.
+
+ Args:
+ t (List[int]): a list of token ids
+ offset (int): start index for incremental decoding. Defaults to None,
+ which means no offset is applied.
+ Returns:
+ str: text of decoding tokens
+ """
+ if isinstance(t, torch.Tensor):
+ t = t.tolist()
+ t = t[offset:]
+ out_string = self.model.Decode(t)
+ if offset:
+ out_string = self._maybe_add_prefix_space(t, out_string)
+ return out_string
+
+ def __call__(self, s: Union[str, Sequence[str]]):
+ """Tokenize prompts.
+
+ Args:
+ s (str): prompts
+ Returns:
+ addict.Addict: an object whose `input_ids` field holds the token ids
+ """
+ import addict
+ add_bos = False
+ add_eos = False
+
+ input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
+ return addict.Addict(input_ids=input_ids)
+
+
+class HuggingFaceTokenizer:
+ """Tokenizer backed by HuggingFace `AutoTokenizer`.
+
+ Args:
+ model_dir (str): the directory of the tokenizer model
+ """
+
+ def __init__(self, model_dir: str):
+ from transformers import AutoTokenizer
+ model_file = osp.join(model_dir, 'tokenizer.model')
+ backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
+ model_file_exists = osp.exists(model_file)
+ self.logger = get_logger('lmdeploy')
+ if not osp.exists(backend_tokenizer_file) and model_file_exists:
+ self.logger.warning(
+ 'Can not find tokenizer.json. '
+ 'It may take long time to initialize the tokenizer.')
+ self.model = AutoTokenizer.from_pretrained(model_dir,
+ trust_remote_code=True)
+ self._prefix_space_tokens = None
+ # save tokenizer.json to reuse
+ if not osp.exists(backend_tokenizer_file) and model_file_exists:
+ if hasattr(self.model, 'backend_tokenizer'):
+ if os.access(model_dir, os.W_OK):
+ self.model.backend_tokenizer.save(backend_tokenizer_file)
+
+ if self.model.eos_token_id is None:
+ generation_config_file = osp.join(model_dir,
+ 'generation_config.json')
+ if osp.exists(generation_config_file):
+ with open(generation_config_file, 'r') as f:
+ cfg = json.load(f)
+ self.model.eos_token_id = cfg['eos_token_id']
+ elif hasattr(self.model, 'eod_id'): # Qwen remote
+ self.model.eos_token_id = self.model.eod_id
+
+ # for stop words
+ self._vocab_size_with_added: int = None
+ self._maybe_decode_bytes: bool = None
+ # TODO maybe lack a constant.py
+ self._indexes_tokens_deque = deque(maxlen=10)
+ self.max_indexes_num = 5
+ self.token2id = {}
+
+ @property
+ def vocab_size(self):
+ """vocabulary size."""
+ return self.model.vocab_size
+
+ @property
+ def vocab_size_with_added(self):
+ """vocabulary size with added vocab."""
+ if self._vocab_size_with_added is not None:
+ return self._vocab_size_with_added
+ self._vocab_size_with_added = len(self.model.get_vocab())
+ return self._vocab_size_with_added
+
+ @property
+ def bos_token_id(self):
+ """begine of the sentence token id."""
+ return self.model.bos_token_id
+
+ @property
+ def eos_token_id(self):
+ """end of the sentence token id."""
+ return self.model.eos_token_id
+
+ @property
+ def prefix_space_tokens(self):
+ """tokens without prefix space."""
+ if self._prefix_space_tokens is None:
+ vocab = self.model.convert_ids_to_tokens(
+ list(range(self.vocab_size)))
+ self._prefix_space_tokens = {
+ i
+ for i, tok in enumerate(vocab)
+ if tok.startswith('▁' if isinstance(tok, str) else b' ')
+ }
+ return self._prefix_space_tokens
+
+ def _maybe_add_prefix_space(self, tokens: List[int], decoded: str):
+ """maybe add prefix space for incremental decoding."""
+ if len(tokens) and not decoded.startswith(' ') and\
+ tokens[0] in self.prefix_space_tokens:
+ return ' ' + decoded
+ else:
+ return decoded
+
+ @property
+ def maybe_decode_bytes(self):
+ """Check if self.model.convert_ids_to_tokens return not a str value."""
+ if self._maybe_decode_bytes is None:
+ self._maybe_decode_bytes = False
+ vocab = self.model.convert_ids_to_tokens(
+ list(range(self.vocab_size)))
+ for tok in vocab:
+ if not isinstance(tok, str):
+ self._maybe_decode_bytes = True
+ break
+ return self._maybe_decode_bytes
+
+ def indexes_containing_token(self, token: str):
+ """Return all the possible indexes, whose decoding output may contain
+ the input token."""
+ # traversing vocab is time consuming, can not be accelerated with
+ # multi threads (computation) or multi process (can't pickle tokenizer)
+ # so, we maintain latest 10 stop words and return directly if matched
+ for _token, _indexes in self._indexes_tokens_deque:
+ if token == _token:
+ return _indexes
+
+ if self.token2id == {}:
+ # decode is slower than convert_ids_to_tokens
+ if self.maybe_decode_bytes:
+ self.token2id = {
+ self.model.decode(i): i
+ for i in range(self.vocab_size)
+ }
+ else:
+ self.token2id = {
+ self.model.convert_ids_to_tokens(i): i
+ for i in range(self.vocab_size)
+ }
+ if token == ' ': # ' ' is special
+ token = '▁'
+ indexes = [i for _token, i in self.token2id.items() if token in _token]
+ if len(indexes) > self.max_indexes_num:
+ indexes = self.encode(token, add_bos=False)[-1:]
+ self.logger.warning(
+ f'There are too many(>{self.max_indexes_num}) possible '
+ f'indexes may decoding {token}, we will use {indexes} only')
+ # there might be token id that exceeds self.vocab_size
+ if len(indexes) == 0:
+ indexes = self.encode(token, False)
+ if len(indexes) != 1:
+ self.logger.warning(
+ f'The token {token}, its length of indexes {indexes} is '
+ 'not 1. Currently, it can not be used as stop words')
+ indexes = []
+ self._indexes_tokens_deque.append((token, indexes))
+ return indexes
+
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
+ """Tokenize a prompt.
+
+ Args:
+ s (str): a prompt
+ Returns:
+ list[int]: token ids
+ """
+ encoded = self.model.encode(s, **kwargs)
+ if not add_bos:
+ # in the middle of a session
+ if len(encoded) and encoded[0] == self.bos_token_id:
+ encoded = encoded[1:]
+ return encoded
+
+ def decode(self, t: Sequence[int], offset: Optional[int] = None):
+ """De-tokenize.
+
+ Args:
+ t (List[int]): a list of token ids
+ offset (int): for incrementally decoding. Default to None, which
+ means not applied.
+ Returns:
+ str: text of decoding tokens
+ """
+ skip_special_tokens = True
+ t = t[offset:]
+ out_string = self.model.decode(t,
+ skip_special_tokens=skip_special_tokens)
+ if offset:
+ out_string = self._maybe_add_prefix_space(t, out_string)
+ return out_string
+
+ def __call__(self, s: Union[str, Sequence[str]]):
+ """Tokenize prompts.
+
+ Args:
+ s (str): prompts
+ Returns:
+ list[int]: token ids
+ """
+ add_special_tokens = False
+ return self.model(s, add_special_tokens=add_special_tokens)
+
+
+class Tokenizer:
+ """Tokenize prompts or de-tokenize tokens into texts.
+
+ Args:
+ model_file (str): the path of the tokenizer model
+ """
+
+ def __init__(self, model_file: str):
+ if model_file.endswith('.model'):
+ model_folder = osp.split(model_file)[0]
+ else:
+ model_folder = model_file
+ model_file = osp.join(model_folder, 'tokenizer.model')
+ tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')
+
+ model_file_exists = osp.exists(model_file)
+ config_exists = osp.exists(tokenizer_config_file)
+ use_hf_model = config_exists or not model_file_exists
+ self.logger = get_logger('lmdeploy')
+ if not use_hf_model:
+ self.model = SentencePieceTokenizer(model_file)
+ else:
+ self.model = HuggingFaceTokenizer(model_folder)
+
+ @property
+ def vocab_size(self):
+ """vocabulary size."""
+ return self.model.vocab_size
+
+ @property
+ def bos_token_id(self):
+ """begine of the sentence token id."""
+ return self.model.bos_token_id
+
+ @property
+ def eos_token_id(self):
+ """end of the sentence token id."""
+ return self.model.eos_token_id
+
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
+ """Tokenize a prompt.
+
+ Args:
+ s (str): a prompt
+ Returns:
+ list[int]: token ids
+ """
+ return self.model.encode(s, add_bos, **kwargs)
+
+ def decode(self, t: Sequence[int], offset: Optional[int] = None):
+ """De-tokenize.
+
+ Args:
+ t (List[int]): a list of token ids
+ offset (int): for incrementally decoding. Default to None, which
+ means not applied.
+ Returns:
+ str: text of decoding tokens
+ """
+ return self.model.decode(t, offset)
+
+ def __call__(self, s: Union[str, Sequence[str]]):
+ """Tokenize prompts.
+
+ Args:
+ s (str): prompts
+ Returns:
+ list[int]: token ids
+ """
+ return self.model(s)
+
+ def indexes_containing_token(self, token):
+ """Return all the possible indexes, whose decoding output may contain
+ the input token."""
+ encoded = self.encode(token, add_bos=False)
+ if len(encoded) > 1:
+ self.logger.warning(
+ f'The token {token}, its length of indexes {encoded} is over '
+ 'than 1. Currently, it can not be used as stop words')
+ return []
+ return self.model.indexes_containing_token(token)
diff --git a/model_repository/preprocessing/1/tokenizer/tokenizer_config.json b/model_repository/preprocessing/1/tokenizer/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f133449013be570f08fdf7c70f1a2c8ccb4724da
--- /dev/null
+++ b/model_repository/preprocessing/1/tokenizer/tokenizer_config.json
@@ -0,0 +1,90 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92538": {
+ "content": "<|plugin|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92539": {
+ "content": "<|interpreter|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92540": {
+ "content": "<|action_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92541": {
+ "content": "<|action_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92542": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92543": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_internlm.InternLMTokenizer",
+ null
+ ]
+ },
+ "bos_token": "",
+ "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "tokenizer_class": "InternLMTokenizer",
+ "unk_token": ""
+}
diff --git a/model_repository/preprocessing/config.pbtxt b/model_repository/preprocessing/config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..997ba399ba04f1f521bdbf088815d1dd3c26f696
--- /dev/null
+++ b/model_repository/preprocessing/config.pbtxt
@@ -0,0 +1,37 @@
+name: "preprocessing"
+backend: "python"
+max_batch_size: 1
+
+input [
+ {
+ name: "QUERY"
+ data_type: TYPE_STRING
+ dims: [ -1 ]
+ }
+]
+output [
+ {
+ name: "INPUT_ID"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ },
+ {
+ name: "REQUEST_INPUT_LEN"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ }
+]
+
+instance_group [
+ {
+ count: 4
+ kind: KIND_CPU
+ }
+]
+
+parameters {
+ key: "tokenizer_path"
+ value: {
+ string_value: "tokenizer/tokenizer.model"
+ }
+}
diff --git a/model_repository/turbomind/1/placeholder b/model_repository/turbomind/1/placeholder
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/model_repository/turbomind/1/weights/config.ini b/model_repository/turbomind/1/weights/config.ini
new file mode 100644
index 0000000000000000000000000000000000000000..88f3d40970a1e663689736be546f8d3d64bb8734
--- /dev/null
+++ b/model_repository/turbomind/1/weights/config.ini
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c8358cd3fffcb86829f6b600bdd0ba77b6147eed572f88700ec4d914db070d6
+size 645
diff --git a/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..4f5435a75963ce7ce17b0536f500c8ebf8ca4220
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1763929a6e7bbdafdb81d39ebfa08263351ccea12347aa68b292b1b7c458e45
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..52107ec494683ad0e0403e4189bcceed1ceabdcb
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ed40e83191f5304fd2df93ff5b90ae9a165bbe489af8020e06948fbbb289d7d
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.0.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.0.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6e21231bbe43b92e43a0d2600ed6969f6c00e767
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6710235be94402052aaaae809e488f433d75d6d33acf546e2d0bf7aae4d8f0f
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.0.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.0.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4961bf6cfbf6ae7592675c56d719924794d8da68
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c069c91ef3a796ac2e9e0230319fabb6bc8433c68284c6e5ca71baa477a3438
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.0.attention_norm.weight b/model_repository/turbomind/1/weights/layers.0.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..51dd734ab95204a4ce7fd026707a375f1a85219f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dde3cfe82d02d87660f40c667186249cd17a5ee5924ab2a3ea0385919a2d0f3b
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f3167a75e6defd59aa396437f58c797bb5cf1b2c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26bc912102aa2b487baf312f3bfd8f97dc46ba6761c2328bfd3e45581bfbcfd4
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..68343cbdcbc17ec725af43c1a1d53b62bc5c32c0
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:309c93937a8778e4e4dce879efd1e0673f4bb7701644628abbaa8420e5b24cf0
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3e82c77a6ba7b16d19d55f544f872223d33fba6d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d096d08769d4b05f7483b4ed024224e0d4d35772231e757157e69c9c0dc1c6ef
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..fee7031bc4703588c99d993aaf4e1c0f1d080e5b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdb73c0a0f614f1033850266d6ff4311374557a2653e0fa7857f8507ca87058e
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.0.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.0.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e8f321d4e16161bcdf7f2b6979e9f90b8aa04ef3
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5b414270e0d50fbec62cdab6ecd217c2f688872d5ed7d9f91bb75dfff46651b
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.0.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.0.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e376c6acc6ad65b07267f834beda69a889c5f0b1
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.0.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25f7250671024d0129c45c3f3d8f57887921d219c280350697d41e9170925c77
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bb3ebc7beaa1d925c4a14fbad6d2df2ec6bad94f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a125e82d7ee989858902abca2bec9dc3f4ad74008f5307a1e7a635d148c53f3a
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..bc0ed1f6f8ef00629e07ce4989e2ddde96723c08
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f96d91127194d8a8404809f81602727e59903c86473ee27012bb303f83cdf77
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.1.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.1.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2eaa43207863db980e17ed160bc4613b175baf27
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4905342d79812e6bd9d6d993443ee6b30df2f80cef44176d1398dc884c458bad
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.1.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.1.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c136a82b25947dc950216cf643734a4a5ee81a36
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c7971bdedd76bbe5630fd97b2badbdd26d22055ffe6fe0374fff051af9feb80
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.1.attention_norm.weight b/model_repository/turbomind/1/weights/layers.1.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..abe49b3b4fe282cbcf269cc92e4a1b03f8304d1b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d589a6b27b707580d37c4b198dc952071bb1a34967ebd9175f9055ac012bc781
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..7d2bbd8d926a99dd1ba3adf0859660ace736b884
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1dd761cf75a1f95c5a55a245fbe1a8bca8967be0d7a03dd12108d0be835d7682
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9fb67e07dca86f3c043855b520b84ed83c9b4930
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d4fdfeee03517f7896aadab5adec50c8449a2e1bda2f0cf5b8725b26057d1f6
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..83348571bf69b92747b68f25d3755c7b2146e4c5
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0c42be27fe2e9f48473b5cc4ec63cd06575ade857ea8699b4bd05eb4f801dc6
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7f8d31081aee57241eed23ae114dd5e39f9e6bbf
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe915a8697f98fe80270d235325b469219fac1c8a4529052fd15f6b1ee8f13e6
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.1.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.1.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6db47869baaf62ea10c904bb39ca2fd8dcb35aa5
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90fa27f32ad04b368d7110fb689b24ea02904efb2f2b7a9f9be876c331fc7212
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.1.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.1.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..87ba80c2080cfc64bd645133d99c4fb0f602b920
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.1.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08456e5241a0fbd14699cb889680261c9e0ca7d30051066d899e99be24e15d52
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..35f6c98510eb157f0971d9d241b2ec765cd3c834
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4d8d7ae69eea66730a10e906758105f2c99b16d082b9ea84d7e7cd8afcdbd4c
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..77eb52490f504dbd5b089674f267142c27e7acc0
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2885240377b91bd85bbe4ee6f67b8ca23233584c35ce71b752f9f3bbb66e266c
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.10.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.10.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..440d3e309d85cdfb81736fd024a2834f4d0ce308
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae5115820467dcb2720eeb7abbdaf3ecd5edb56d9d7453fb0bf4f6b65323445a
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.10.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.10.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..18b5ecc65f6f8133a1821de0925d37622a67af48
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4438217ed5de15cb91f4e30f0644b08952e981d25015dd4b75c4a0cae83517c2
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.10.attention_norm.weight b/model_repository/turbomind/1/weights/layers.10.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..4f0f39a02bb84010dd644e2fc96ef3b46d4c2820
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cd2c0d884542c0a881ef8fcfc9fbcc1feb67afbff0a8befc9bb741e2d8ea2af
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bf50b623e7b1f4520d761286edd1db51a109c4c6
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a1258ea1e97e4c41db26a363eddedd3bd47c6d49f7bf738703c5746c54f4e37
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ee36f684587a649d68d9579441ca3e90af8d7d6e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48e7492a7d4447980961b5891a0997f2568bdbe10ed15ba0998f8ca1bdaf0a4c
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b0cce8413321f6074dc61c7a28bc92377f4c7ab2
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fb81b3c6a3f7b674506b003621b7e92925754e97d23ecb1209003f2232e33cb
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ce1603f2d10d9ae9ef7251cb66a02c3e0cba6b67
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:773b9c8eb4a3818b2667162b3169bd4fe813f2fcba5c708a49b79fa5c5053c61
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.10.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.10.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..bbe9a16316f0db34745e41ef00224f94b9237fee
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b576f4d059d0f37a4fd3e626e640dad540ff4758aa449bafe55a78046a01dc9b
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.10.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.10.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..da0421db9e924c29c37c13c09376487aaa383c8d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.10.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:430d675f2f2e4512591d558ea6f29e42dd38c55ffcd8d21873a12e9ff90e15b2
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d5058e0b21a7342d2379f3a9315e85ef9bbe7682
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2871ddd112a88bb89a549de3bf1c53af525e962e118eb7ad0feac6a56599a26e
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..92844164ec6f5b42e8222c577ce94bae5314a9c9
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de7017bdedc110df3a9f9fab19466968a5488b9ab3ad533f0908f2d368371adb
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.11.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.11.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c67e6d4b3e11faa456791b77155fef70589e246f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:530e3110fadceb664c29ff9da577cf401128e93ae21601affd1c62137b04db35
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.11.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.11.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4e0d310e48ae8ebd9b629872134eb3687a55e341
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1725da8fac86700a95c4ee9d40cf9ebf0d1ebabb4b145c2d57c4a31c42299cb8
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.11.attention_norm.weight b/model_repository/turbomind/1/weights/layers.11.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..f57dfc1e256d2fca8f1c8d59982ea28fb2f209c8
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4cb24612b49347f84741d6daab9a90b828aab924fc9b21fd2d2ca6b67abf8ea8
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..27905dc8bb55b6305cefdf0135d72eda3e7e17d9
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0af7f58d1e58e6610b5b56291bf697d79471c1eeaefdff9466fdc87996c3c86
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..474796975c206470856a63e5627806fdd1a9d0e4
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46e2d6846839f995e9434c35519a1152c52285d29672febe66e9f07b0e7523e5
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b8e4a4f967601a2151a7eb5da1c126599eea4743
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ae182cb83af72cac11a76113fc5492ae4ccda1cd45df36facac10e65369d22c
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..aac9a3ac0afb93d279461dacd82e1fd80dfb6161
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54af6ef8d3b0aaa32183d5fb176a4d2097bd043e44ebea37ba43ac4021e18253
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.11.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.11.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6f958acb3e97bbc263ba99adb14ceb897dc7e573
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ae646b4e03481a9e0eccf0a151deeae360012b79d455f413d6b4c8c05ead016
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.11.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.11.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3bf7aed58e43958ad08d6b6e8beffe072f7e15e6
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.11.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:114046d9b18a39823a18019529563163f191e5a74c65e959db74c96b77c9b4b9
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b026bcfd8643c18461670a5a2980cf9a8539bb2b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d30b7fa1db362abf3186072da75c305cd7e79f90f4b1eea6095014d9f7989da7
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..011903f321dd322447298b693e1eedb17f35c3ac
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:654fe994288ed138b388cb0e14a9c4e7124b601ac4efa404788e3267ed137307
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.12.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.12.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fd89f748d1ea906c6617d240a4e123d243105b64
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:069d9e054d6cd0171b229e37a70b6a2fca364783cc8e80de9f81060931964e0b
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.12.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.12.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b46cd92e96aa0e40ba260aea37674bdb9fbf1fd6
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:394968e46096fa0f50701fe0d09193561276359f023ea5dbc3a16bb3f1aff8b8
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.12.attention_norm.weight b/model_repository/turbomind/1/weights/layers.12.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0020f8c429974d047571347728c95d5259c0da58
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:020a5a9ed0a5065303d1079d24ce7252b639f6f76bf49c7b8fb5fac3bc93fc1b
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f5cd9ca940d4417db1082cb6b445b56fc3ed304e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9293f916e4009deb3dd715ac0fea08afe5be75548d2fe2e70a67fd5826664cea
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..be6c9b7b29a56d2d3afaec63b36099fc29d1ba80
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89899a4751211dda4328e2380ceec5d62d0d0b13fd164ccb7c9f5e189409a08f
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..53e4822e263ce179450dcfacefe7dd882447324d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f0f0481d3c7eeecc2717614f38dcd54163c287431e82da95a1e8d5fd182cc27
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..2f8d90a6c38370788887ee529f4ad8c7b4fd6593
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:690b11e4c0f825ec39db6b53fc1ccdd51d051c752199195f2cff8079ef3b980d
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.12.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.12.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..35e00aeee302ec1726ef04c71f2a2f429fe0d23e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce6abd982c6b4b398f13a6113cfaefff0fe65190ff1b232c8b9a68acb30fbfdb
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.12.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.12.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8fb69a827363200f7cd82be1b4f35bab6e143bb7
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.12.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3cee21f879722a16a454f6455c8d8c3aec77cbfdba6cbebac9c4762d1d03bb2
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..63d098e6067e1aac3d4f6083c34f967abcfb40f4
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:983fa35043fba20d8f39610fc859862486472388df708d85176e198b9493f194
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f78fb596aaf17a70c0fc17098a02d2fbd9f8b12e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bcfbdb8a6f2d86500e49d21e3d0cf88dda2e18b505be8459e46962f1a5403902
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.13.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.13.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d0443fc30519b3ca74b5e3d4e0317af1dbe8b32d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e76d5b55510b3111a4c8068f8bf2abe8372c9868a5346fd03831633817f49a3
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.13.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.13.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..6cbcd17aed1ae804e9e87a936274b99c9ad81296
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da85282928c5b1723c48e93cdadc416b400deb61bb90f28c4675989ab7d2f4f8
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.13.attention_norm.weight b/model_repository/turbomind/1/weights/layers.13.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..58edee2f8e729e06965c92f434900ae4f75e1a49
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:592d7039e973372cadcf8b3f717c19ecbcb911e2f40140d617855643bf2bfa3f
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..0f2f191246be551220b2b9df11e88d070f4b63c7
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1cbe619508e858a2637045e1e07f9cb0ec4c6020d6041e40bc9558aaa9fd290
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8114a135ab96b7c28393bb44bad7050a71bd712c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c555740ee91741c87411db09bc23b419caa191a4ac0ccf7e34b00fe64e614493
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..efc53988aa0826924baa6153c20d1fb1abae3183
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5434cecf17636b9bbdf1df6ae4b6d1eb6c06a611c93fe0291ad0d3892d850a81
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c927886fb77c90e7e2afb11bb38945c179e779cd
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c89194f222aef9d0488e0677d654d9f4cc783cebad2ba76e9013ef99684a1c2c
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.13.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.13.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0044a510f007c3e66e363ee02bbc25f4c26cb6a6
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75cc6d0e292ec019791db0f7ef63b0508d8a5d19404fadb09c1b06a8dcae7cdb
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.13.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.13.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..313f047a7db61ca9b3fed45b948aad24958ec896
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.13.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e86a948027461837c94daa03c444ddaa2a484bdadcab47a89f78d0d332ba0370
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d34a88071016d52838a914b177b787d6b7f5e989
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd65317b8701a195eabe835058a9366309ad055eebd4354fe994187573dcfcb4
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..dbf55a9dd11b2bb29fb5f7a2ec180b89f6372195
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a8b7af909bb0ee02940f92c80cde0a7a869e60bd4778c7eb5934ed7134b1e56
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.14.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.14.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f2e7385fd3b0a6c38260980964dfd035abe25f95
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f17aa0c464ae8e87100f9946574744e554c50847775d5e3cc888584c920b51bf
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.14.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.14.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cca81645ed7af2fd8f2039c751f0856ab6332929
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac63fb5629b386babfc0cf09324e8388735c894def38688f57e5fa413a76a6b6
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.14.attention_norm.weight b/model_repository/turbomind/1/weights/layers.14.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a2e5c82b9d622524d9390c76957ed9e8994aa2b8
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d54e43cc40808a7a12fb34802e7e3fa239938943e4f247ea54556f65191e0e
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..efb7ccb2234e6b179d310051c53ba547a39f7b6b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f974af156ac932cd0619e0e86095071dccc8cd0608319df5c1042492b2002e9d
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d916976c94c174148b04db334b907ec77c7d638
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5be3c8f04a42c5e0c9de9d00508fbb981849cf188dba80cf6127d8f4b4b712d
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c926dcac71d930076be55189beacbb36cfb1a777
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c33e3534172410d4656b1a244becc400d680dc19664a6fe5d2531f0733b24b1
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..78c574771e660fcfc3a237c9d56afe57b62f1ea0
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3be2e077ef369c828ac8f31826249f327d120baaaf9d0141f67b9a814f95a57b
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.14.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.14.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3094bf1d424cd5ba8300cb6dddb32e4bc9d78073
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdb3dd1a12abaf094e03a1d933aa4ab506d5c4c0cd21cf0802c04f4a0d5a85c7
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.14.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.14.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a1ff0007bbe4e1f0abfdccce67158196a9b3ba13
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.14.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39dfb751ce93881ea2c4e2f68155583024cfcf9e85b5705781348b079cc29b0d
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..8d981e2ef18ba6fa67894151d2e5d33aec76e769
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f2d6afe6100ef0eb47d5b379ce3faa38ec1063ba36d47d9526647ea7fa4bda2
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..92d62c8db383b4e459224b1370a1d87eaa416096
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8abb8c1bad2acba915885821b231c1884cd63fd978d62d23a25775671c97f9b
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.15.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.15.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..43781b59b7834c4758226fadd3757cd458eb9001
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fca2dec7e83b35a6b582edfc05ddf49890b234aeba53a3d88384a436cc96c4c1
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.15.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.15.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..51a58827bb1c84c5a11deab1134c99e4cd37f472
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83bb55b56df6d0d2c1f6f04d894e5d6e63d476b8fffe1dd0441a892eed850502
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.15.attention_norm.weight b/model_repository/turbomind/1/weights/layers.15.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..7e895dc7fffaa82cf585391595f009adf667e4cd
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06c4e4b6e08466593216c5fffe5bb16fbe296be7d83b8d67084a728b4f0d26d0
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..8dfc85e4b6b9e369447163acf76550539913fb5a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b271e071ebc5f1e37284433f76d394ee2ba20920d64e64355f6c37672bd68f3
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c0f10138fba546a8c454600fd6a73289e0a7f8fd
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b42f1cdd3b5b76e04cd4154950ade000eff8bfc44853c827ff351d00526201bc
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e0d0b67b1d9d4d9530690ac220e426dedaddb1fc
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c44d9731ffc2bbd8a368f60064a8e8e85f50b04677d059c25fce70aae38dc81
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a99be30bc9c12257d3764ef09722a06f15ef0437
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:287e909a7bd9bcc0b456c57c361a614c1898383785bccf9f57eee7f91599e3b3
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.15.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.15.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..630c4372de835971e521542c84649a00c3b2e403
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8dafc8ea6132b5caec667dde3f6dda741e7ff23e40b8ff5f5ccc59232ca434b
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.15.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.15.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a47b7192fa2a190ceb02a526a527aed679e93740
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.15.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c945e5779fcddbf5dff47a4c3502bce9ba0bace5158abc583e852d1418f9513a
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b17d911138bd69b5faa2b303479e7cca9c12b659
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf8c2d841b0c3dfd0a4349bb4aa84c0d85141c14277e879c033484e225096715
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..bd4333af13bff4ad87c753e24461be8ab19102ab
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a54b05a6ce8083736ca7db382672bb83d215649338920308cf0edd2e4f1ae07
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.16.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.16.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e09e8104c2418067fc961e4fa84dc074da5eaa81
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b8f9b5eb6ea1827048eb48661af27f66fbf5f510055f7dfc813f28f79967c83
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.16.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.16.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a056f4943ce26b8bb7e3c8d3d052feb2f324a4d8
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3031c7a07ae7554fdc02af0112aaf4f343c164f1da7e65ac0926e0b33ec1daf
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.16.attention_norm.weight b/model_repository/turbomind/1/weights/layers.16.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..710904f88b607829b98f69d31a704b5ccb2180d3
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0996c709a45131cb25cd72865a06e38920f31941b25f83f2d78ed5751645c284
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..ea56d48779234f87b2b0a859e2cb110d0718e2b9
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50fe105dfc87e7a2f06e12b9d1d92899b4b20106d29198eb7f8156c888b57620
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..5773631e90c5be54da0f5ca15e355b6bf855b4e3
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8081c981a8cc02210f42ffa6b41e8f8a018cc273f18dd184e7a76ea6a14af908
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..5a19b7dd919248c1d8f24d12508ffb36be409a0b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b58ad7e7bd4aaf5109590b6f4b500643cea2e5ee7ecf3de2f2bafd931fecbba
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..17e81af1aaa097a81bf4407a23e87dfb0810ba73
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05659661021dfb93c23ca810756fba0afa33f7dc7103bb74e79a5b5cee0630c2
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.16.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.16.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..f45d501c72951cd1746375922f7e113162bef097
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:990398b91f28bd4d0ea10d21a8f911746291d93d353659c273a0d263f3f8b26f
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.16.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.16.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..cc7a02ca2638e540d970eba9c8c2ca40c599f58e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.16.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a46e5538c6531808ab35a4aa3f8acc92997393bf5778110738282e7d0b5a6253
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b7d289a0a181f768648b3388209609a158c0d194
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a874ceb40f2cd87b1fbadffe4f336e766e4632d1486bae80a524aca3884a760
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..02676e7729a5ae2a782c7397622f5661a55ae306
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3e383f96fe0c11172a8eb7c833e16437243ddf5083fe742f2f5267c606bf46f
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.17.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.17.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f5d248ed5bb53bc83690b851c4850179affe3a1e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ba47e294f57c2391d17559990d81c10b3febf1ac79cdaf9646ea4b5b1efe9ae
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.17.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.17.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cec2b0826f0458f462a1f155b2420afe3cade230
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19655fc3273537cb5a737021f0914fcaba9f520ae85a241b6943a1e375859c5a
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.17.attention_norm.weight b/model_repository/turbomind/1/weights/layers.17.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..493203ace8591c626f3ddd92a1d30a132fb91f7c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f60382d336b8fe223742bf477d6e1d6b03a426c1397370821017d77560828a40
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fada103f386b9576504b44aad9effb7227b81161
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6347e704f461d7d6ee0ae21b790cdd6180debf826b736f1862a27bc9ced0045
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e34de3f6584cca7245e62f91730286274c18de9f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13d6a83305e5bb3038ce5829693b70573fbcbfd18ef9251f42334a92a864f2f2
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..62706b91c086f1c95651471ed13767ce01618e08
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62bbff754141a2d1cf72617d73f2522333bb2694a88e8a5b37c1aca6b22b17a0
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7d16b3f60264de0aab7805c342d890386aa3c7ec
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2aced42506d0f633676edf55b7de564b795eb6de86d8c0f6c0f1d1301233312
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.17.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.17.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..2115ea8bcc2774631a370c71a768d54242473864
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7866c4443b210b814e1bcca660a34c2b78f21172253d2c53300be2c3e3d44fc
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.17.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.17.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..945eb96703d8de2eef6085a642b1a27de7fb8cba
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.17.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8029ca34c285ba5e30b011338457cb6e1aa2bde375aa5bddeb10d5f735b827aa
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c8f8e2fdabca3f7c34468465c2a769b83df35ce8
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:802bfc3126429a1c8f50bb8bc82a62b62b5e4fac66b2e5201d5ca3dadc76b2b0
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..15b491c33507c9aa77edc43db2d844a6f497fca7
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5b1e35a7c3f4353a260afd771398ed0e6f3fb0cfe2c9e57c9c6aa837187477b
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.18.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.18.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fda05fdf95a8e38dbba3ae8e857729fde60e6d1b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d5e9b4b8ac11947e865c95a0ee01bea2b98bb4d8e186bc655980c0819220337
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.18.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.18.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..56d79eb2481c7040c86fa26964ede1eeae1395e4
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb7cefb270cbf64d8347c25b5d776be71d432c570ac277fc6dcb8160f358040
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.18.attention_norm.weight b/model_repository/turbomind/1/weights/layers.18.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3c20c25a40ad141d017b4cce8700f88ca3d8efca
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dac1fd7000d40fa00eb19ec7e140c8fd08a7e2fba5ac80c0f15abf00fd9048e
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3c1d6af45afa49731996db41ef7d18503411125c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23dee44b6cb77a166863b69487459d9de5dfd4c3989306919d4c35dc20c884be
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..54489f50388ea9154fce92dbadd4bf6a1a861f86
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10a6c1e2ca46dac304c89690e837221b7cd15133dc1e7ccfb18f69187af51208
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e90ed3787e1ac9da6ffed10588e004c09bf3b9b1
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a35d9d5c12d752b160f51f53a49e9a763662605165cb85272e539b60a9f92055
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..17951129ba756efbad134062196862ef2b290c05
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:845ca7749cf6829cc274de80528f41dbd289d125720a4f68417677871dd528c9
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.18.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.18.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3fdc07d36718c6a4fb843c7a0e547971f25bbe50
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:885808cbeec44e76e545008343da6029dce51d48908c85d61f4e3e5734a316a7
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.18.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.18.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..4b8d6bdb257005f9da0843e14b064394e5e12366
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.18.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da3eda4da09ebaeb73ef447011ce0b9ef2ee982ab26d8d0408ad482f9b2b389e
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f58ac78fbf8480c4a875a904f3eca7296b9d1dc7
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a697cc9e5c643856df75e5d40a4ddc810ad41c0ab9362ad6c7745862c000ccf
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ff2f26342ca1663ff6c89e5015b02b41e976f9a9
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5deb01a923b8c70c8adaa62c3b6128231899cb7c185908822279725696d1c819
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.19.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.19.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f444fcc2661a285f914957b05cedde19a4954ace
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:682754ebee51648ef7b0249fee7289fdf825e61916f97ec62087c8e39e9c14bb
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.19.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.19.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..41cb9a3fa2554343948079acebcb10fa2a940517
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6d4a938a39924f222f02b460355a83ffb98a00ff19d05048c3bcb82c9e57edc
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.19.attention_norm.weight b/model_repository/turbomind/1/weights/layers.19.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..5acd5f2587a22bc1a1e2870e9b4af8ea1eaeb505
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63d26f2643a9aceebf2af38dbc611dc36da45a176257e478e62f85ddbc559f55
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..cc8dd8ef920737fc2e432adac1ce42303e7d7111
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a54bcfb108f050cf4a7c7cb37114ceb35476b3f8bb6cf6c541e8df014fbf6133
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c378e9b9bed297468e52701cb4eea8586e317e8f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11cb4b7bd0b53f894236952f72793d3d4e647e6d07fc37e1112b0c5ba392176c
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..131386a17e034a3ba0ce59be9c0351b35dfc20e1
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f32b6e7bb6005ba215aa938a0b52300230f7008150b45a11916829314ef3494
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..af5383b2c8c39d1c54f5dea9298ea08f5cbe267b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84f83448a65d6bf12e5484bdf2805b2648a5ee6c0f71f592f1399a71f787a365
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.19.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.19.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6f5513a9af9eec5fbc82dd527339fb220156deb0
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7e2f003c72088419d2608b060a98ab42356eeffed53510f1d468f4ccd3f1141
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.19.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.19.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..fd5be00138be7b2df59bf0b592a9bef86dc82eb8
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.19.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c71b33b311eb0e23a8b2494a543ba1181fd72314b49cf78a9749b9cf4a00df4
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2d9c45e71e2c0ab82208f4202b06c9b97f6ba148
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fa15c6683fb8dd4f6a17b49bb0a989e462a984b2b1a62741c0261b0205e4d3a
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cf230e2e4ec022b7dadc04504edd265c2736423a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d46a56b3063ca3e890569f20f0f9554bd4b8b3dce4dd28c6de2a2c8b018de692
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.2.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.2.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2ec2d68e756cc1afd558415a1c748d3366f51240
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:745bd18832a4be0427eecf06fbd16e5b4d9045d9bae02a538648bf061f1bcd31
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.2.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.2.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..d1e959a3fa4ef4072ae44bb537bc108a99c3799e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f165998aa89a2e93b82203e08444995edcdc00ed2dd2b3dc3171ed8c4aef68f
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.2.attention_norm.weight b/model_repository/turbomind/1/weights/layers.2.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..775cfb53b3214e57d496df775c7f2e98df37a237
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35db76352c3fef9616c14aefa7c0b05850df54a54e3e6c922df8876639c7048e
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..1b19b3f633c84fa1134ae29f0bf9f119d9b25d42
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5d14e61c9cc1a1874bbf7c1db7fb04e8b97f8d49e011bf0b5c2003a072083cf
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e293bf94f00d2acb588e4a05e8b36c07adfd4cfe
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a79b8fb1590037f3bcbe91f25dbcb82b2b91fe0a109dca31de0493a089fcdd
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c43fcc94e533822deff81b234c66897d23c2a5aa
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cbde66d92d3be35621cdb2171a2b9e5ab5448d229f07d7da65d25553adcce029
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c5beb7d2b7d8320386a5105a4a2618ceec4e4943
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41bfc952713a7fd5409f909e9ab107d9ef734e730f7b00d97fc34ef24395e62e
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.2.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.2.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..45e884fea486483f4689411e2b0f5841bb3e6317
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f16599930e314f9a8ef2b760cc6773e75961152d32432b5fc3e411955dbdc227
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.2.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.2.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..70e74bf48eaad9dd65823e3d66a8d46c4452b13d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.2.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7808c14f00dcb7b2b77edadc8852138f46802e013a3025e161a669adde20339
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6053a83955560e1c2a84e72515c7672d70304835
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45521551eeea8b702589fe7c6b19749333abf647f53f56713807dc38f58041ec
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..0e188dc213c48bf55e4b2001a68e495c895187a7
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7d9740714493408c67acb934d26406c11421ab7efdabd743bd990103a90f701
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.20.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.20.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..730a6aa484d4286f408baf8abf88ea73e0b5aa02
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55586decc011d181feef941588d73d75de2ec8040bce7db734699a33a7bd6f42
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.20.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.20.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..affb6ab65788c985dc6ccf43d5cb3fcc8f4e91f6
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3dff92bdb0d4bd34ecf08c0c024d9aabfeb9dc6407b55b55d25835922bddb9c
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.20.attention_norm.weight b/model_repository/turbomind/1/weights/layers.20.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a4b06c9551477c77ebc9de6151cd219a9c13f63c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dfd453a8ca7eaa0368df85c67b0c4520d044c50e21e3e9c642016e56425fe2c
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e0aa342e545feda824e44af8745b7bf6714e3672
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a12408ddaac163c3473e187a838044bf3c05b1a72758d6b77338da700a74f845
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..595f2605064e623b1acbbbb39aad1abe47d2b5fe
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a20c9c4a6621e851abb268c647e4f9459277dc53bc5f64a0504562c9e7736b61
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3881b21e76f4c55a6f5a94d56794ece1d12912e8
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e13a13177f50e58cd454dfef4083e8b8da065d25bd277aeabcbbd65d9c7ee2db
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f0c038b596c5143988722e1d044fdba36b9f4c53
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2bb55062eaf5f412bae85c9ac428ddc2e0e59d0e53ebd21abb1228cf4d1ea3c
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.20.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.20.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3cfe4cc50ce587ea9b564a20130b4fe2225d7d52
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37c809eef52d6f683a42650531b04e14b95934556c2f3607466882fff2c7a049
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.20.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.20.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3fe9d60389494bd97b6721514bbf76a4a2f4aeea
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.20.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97801b00a17ab91f1019edf80b667e915c772df1461e322cb8602d8bd831a8b1
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..905d5eb82f1967282905cf3974e526f1e48e2b90
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2792bae2516c6d5167b1efdd66141ddc18439be883865eee923aa0d64f3501f7
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9a1f6b2beb40845a92a60a5b1ea44afefad5446c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:953b7c49b7ba4bab3b5ab552b697d5be9184144ec4f8f6ea9815a0e12420a4c6
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.21.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.21.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fbd8d63b76ae1f3a0394dfd4c09e724627ce656a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f719914491c7941474c1b6efa5a79541ade54eff71a6d65a28dcff17baeacd89
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.21.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.21.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3199f31825d84cf98169a9ac8361fd01195c513a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21e70d0275306b0d766b533780955602dc9d5163028c509745120b4e9dd070d1
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.21.attention_norm.weight b/model_repository/turbomind/1/weights/layers.21.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..ace9b471c09970005b6d8dcb34406ac8671f3340
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f5b37279d734e53f01e524b941104c4a2a0794819cb443255e46130190eb060
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..93ad736f2b44139c784864069aece4a59db96543
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7298a7ea1a9a2f16bfcca14510dce8da6342ceaccf48354e63945a00c86a8887
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a7e502a74af20d234730806f84f0ee0fbec81a3d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90e896e7361f2fde100ee9cbf4591ba2509c11ad2e06ff9150614c28f39f6cc7
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e129776d2c3518130aa1688eefa5ce1d57e1f1cb
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0415c4da6fb2feb289a75e84a73c525272f0098ee5c14faf5544454178576f62
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..60435a424658f628b48358ed84954acb2782b727
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ff5c969303a6b351d8bb80064aad2c92e8c5c32d85bff840317ca0739ced463
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.21.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.21.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6655336998857a70516ff902b71f61175fd1a6c3
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8042770bf17c4b7520332fdeeef3decf2eb77871e6d80a2fcfe79e850827faae
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.21.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.21.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..68bb063c7fe76ee11dc858fe2552eff20f89fc06
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.21.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:babef4e3b7889042e89f865f3c8bb53f6191e2c9329e3eb418e0627256b4bbf7
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..26e5e328af67eb6995b4eccd4f3f47e2a5572bbb
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3845fa57cee6ae1adc7c640c17820f11d196a86138e3ab1b26d1fcdb5a12d480
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..25e896649de6e4eebef3fb52b4695e66834ea627
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60a8fb6d26d3741fbf2dbd24d9e96a689ce0d8311349bc7b7d487a94ffae7309
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.22.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.22.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..30d513ba9872686a172b2e5bb54d7dc19c89b18b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e8c0a44652ccfbbb876d6c56c552653b788b14188b48f41b957d17036111f93
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.22.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.22.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..63489132ff37547f3c5a7082e39f7d6e60d99e2f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cf24c066812a6a36df8eec192b40520df7d10573d5a2bfd2327ddaecf6e938a
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.22.attention_norm.weight b/model_repository/turbomind/1/weights/layers.22.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..67e9beee3472ac10efd53bef75c3678f86f0287a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87603494aa61475dfc747464841436f303bcf654dc27b1a07564f53558ebc0e8
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a6f81f752873c957d60d333f567fcf45dc101888
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37604a1d32f8001155e15ab4e13282b050da543ad0d0a25b759081246fdbdb15
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7fc132bdca2ee4128bec7e863686fdca2f7aebf4
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06d1aced0b15076b9f26d4ea4f4f6b732368d7b373e7a588635da39cb9db5f39
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2679586d03d73f48a045c13e8c8b19ad6eaa9b50
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15b2a9ac0ae91a96deefa360ba92e79339705410d925b2356b9815692ea31061
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7216f3454da54e1117fd4e92befe84b4c8b46a1a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a99b63ab8c94e4d8f81bc8cab1561f47e3c2bac9f6e13f0b23d9438e02d7d1e
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.22.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.22.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..4d71b5ceacf9dcc9afaaf1adf8978c2911ea951f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:309c8793e4e6d01a426ded64878ab5bb81fc897a4369e2e12e180067d9e2f97f
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.22.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.22.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..265569647dc54011c0c7aa312cda60679eddf224
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.22.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a792b8d14741661477851bbe77b6f5dc4fecf7ce07009fb7d6bd25090b2ad2b
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3c4b6c3a2d7fa4c456839afe2c5df63b4801cf29
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2a664f7c9133d9a3d3f013ae68b7c826124f0ce8ee3e2a8b7a3d412fc4ce18c
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..6980321a22d78892613c341246abfd4fa6a6ec1b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d1caf7d6d040d5052d79ec08aa4282d486d3fd63e54ce73293b62776d97cc01
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.23.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.23.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a959f9c51c2010dee1865544214aa31aca8e384b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:019ccc843a3257c4a7b36900f96de821382e2847851af142ae89a9238b434b20
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.23.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.23.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..63ad5cf1b74567dc10825bf3797cef1aeaf45b20
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80a82f597426b697fe58ed646f41dd9a6f4514d8d93e7f2791fac932dac100ca
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.23.attention_norm.weight b/model_repository/turbomind/1/weights/layers.23.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..95ac563b56807e330af49708f5e09a5b5d763971
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d621b52a30d8a04c1866972255522c844eebd9f0b57ee2b90fd4f8e5e7ba07a
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..070dac5924104453edc840b81f83c3af7c79534c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e95a18e90a00cd47b6fce45cb8c1eeedb6ec2b8fed6f0cd8de85f36cfd5dedee
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..53c5e980f8815c039d907e5466820c61f9d1076c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae6d90f0468717c0bf1b22ab4914319697011c4ee53f13241c0ca1970acc3331
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3dbd1908961ec50661072cfe35a0e65123ee0522
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1036d81bd9d055c59bed34241ec3328c1035676dbcd78a0186946147c58af98b
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..377898876f13249c94c85b69c632e4edbf89ca0d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f354eef95b3a2007598e99428488351bc81e825cc08c8a22beea2a74432f0e91
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.23.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.23.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6034309e63a873c266790385d8a50379dff8c851
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36a712b30e1f4b920e2bf0e553bf62898650a968b94cb544d4c0cb45dd9724ba
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.23.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.23.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..2054dd9b5bac4cc5f3947a6a29b0a00ee9c8f9c6
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.23.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:362bc48a1da392c1d9c1404743b87e700f048e91e2236c0f23136126cbd17a42
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..90ca332aa05b52f6a6c1174451a057235aeec1f3
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5cb069457b3e48f9401929077bc5a44b988b7741941ed8157cf23fc0af8fa2
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c424c3a6af59cdb2e6cd3d2acdd6fa6b8585e46b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b47c34802342bd2a02dc98d311924169d7abdc703e43279cffdcf1422243038d
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.24.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.24.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..183cbc95eb079e344c88e1fa4774f568a66dbbd9
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6468f6b524dabe33d4487522c605b92a5c91eaaa9d6b39433dd31588bfd09215
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.24.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.24.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c435ad2044cc72cc87bf58ea590aea7b6e463349
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59fa63a2023ffc20a936686267ae08fe6c793889ca330e0fb0a44ab2b5fe8041
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.24.attention_norm.weight b/model_repository/turbomind/1/weights/layers.24.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..dccff49fb462091aab55a0c4eb163652123ff7d5
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d38dd18c9fe84631f30cb2b7cb92efc25473d4ba1c438a7817690ed3bbaabd8
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f0bea0526b3fe332953eeee191fd4d279f3a8286
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db478db4b91a673763d0252f233423fa31c7a562f80cbc6c106931886d56e253
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d233c239c539161b7c5f0b5f890f196d9c544c2
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5329cd85fc6390d7fc596abdb5907e3c2576c2fb6fc87d7c0dc2dbae326a826
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d4c99dfed4f5fd009c04c0693ddd1253dadfb80e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78e4b556d2c58615b1f3bcbfe8780a1217bc0420383b55afbf6767315ca09e66
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d61abbf087e7f17d99482529ceb6649e5f98e4b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9861b1f0dcf30259bc7a9d1c02969f271b805981c696d49b1dcdd939a7ff504b
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.24.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.24.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a5247850bcab46ee044a136c8ca64f1223e6f1a7
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f64ff3faab2a3c58cde1f351d57bef281660b552a9dbb9c0aa49bff00dcd6719
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.24.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.24.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3a9a25a5c3ba55692571909bb40b460b6ed82ade
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.24.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d2ab419befc2e7b0391b3b7e7bfa13bf728db0d6cba53136aedc0802a4fcc8c
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..41c3344f95ab3594af8a3648d644979c8b8a3e84
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0971d51d3ac5fa3cb80bf7adb2616878c3921d6810a7b8c312f2c5edfc20ba2b
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..380f67b6fde572f2eecd73076b154bb56c631ceb
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd9d2322fc1ac860eeeb0ae4f57b15011ca5728cab0c2de14ad0734c813b1070
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.25.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.25.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..58a080a5403fbc6975a8c92d3d8890d106c41f32
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42757d1b84d12da08d617496b557df5dc43260ad03444559342e57effdeff897
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.25.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.25.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a623dfbef7759c22ba42888f23b6af5e7c88703c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc49597aa705026d30a172bcee0421ded59135ee57d2d1a38d511274fd00db51
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.25.attention_norm.weight b/model_repository/turbomind/1/weights/layers.25.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e330398be316b3c7d2b4e8091847c876352631d0
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f978aa26bb24bbd527a1e949719d548e1c7bf7d30f04b02f0f28d1343053132
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..941b657818aee3d6c553e08ef74566cd98e55321
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:063a4b6c0bb854f67986762bafa9651778da009fd725fe723fa47306a99a845f
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4df2b6e64935f05f8ec6ea3db6b9723c6ca0a7bd
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4a77dbd2274b6de3cfb89254d1cb2c0af54b304bb9134a280cbe9b620a361a9
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a2a36f211eb8cebc2e1ce26bbd4bcd9a806cee31
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1626e0d17ba4f05b0f1e65537f46ada22bef2d00deb136c30dd6bb481b617d58
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..09e7a5b567087d78bfcd3614b11b21106f5f8f59
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d9b0e50a31c6c29d57500a64edf731ea04db50967219bfdcb0853730c574333
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.25.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.25.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..026c4beed926345148e983d57a1eb89a25c4fd1c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0eea4a26418b7a503c71abf443da9d784c2adca6551e4f1b998f94d6145d696
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.25.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.25.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..67871afaf8d1df47fbde1f4a65674ded07d4a864
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.25.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cad249894548c60911d6d65a7d5846938c1e479698b4466d4cc6e03d2444922
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..8e3258b77728a5579d15c2a374b61be41a2afa09
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3b88ded4b32bf8ff5ab7fa3616ab98f1bfea6fd86f37b729ad69ffe89d33e97
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cb16882090f73a8651b55899be0c7b66b7d89aef
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1303373a67371e1e2f3ed25bc8cd8e559b9503bc5b4fdc37bfaf758cd26acfb3
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.26.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.26.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f65b33bea38f966cd6cd26980998df21898fad28
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da07e11c5ce840df7eaa7de1ddff66356a2995b93b6d1cdefe1d96f6d4eb62a6
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.26.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.26.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e34f9fbc1e33e117eb223353e64a0d03c3a1ce09
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec446a339a8b88e9d35b0feb0dc82c82f64420cc45aa67b0730bc6fdfeb33b24
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.26.attention_norm.weight b/model_repository/turbomind/1/weights/layers.26.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..bd89d7d2bb2a10e4537def6bc6550ddf681db645
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:452e37de79706d39a7fddbbd901e8353363bb41bb1178eebb42b0a9aad1998fc
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..ef1f200bdb37b79404804e211dddd09441a90cfb
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fac2317afed02f28c9f68eae5e04821f1fea2d7553bd4ce30b68b9a7e896be65
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3613b7754b7de11bd7146b2f99bbb2aabad43346
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e892079f260d62e05e5169a508c1b50c3beffc1e568e189b358850a9596863ac
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..42508b0d05c03cfe54875df80e5848f92e3a2148
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b2ab3bee38aee899c1454a69dc424ae61b6d14d67438c307369be02f6460085
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..6078af07ebbfebda87b1016fd58cdcffbb0b4c73
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:552933cb4c5ad88c47fcfc8c8982e8a9d6c2bcf4975d0a1ff17f85a0de9a72a0
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.26.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.26.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..659727ca29164c591b4db04c441375c79e981fce
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a474d6dce328dea51c94d84fde68d4472d68dbbf19ce347181b5956b98d41847
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.26.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.26.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..db316b10f011519fdc39c70e40706bb6499001f4
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.26.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d995b27407d7307c6a5b4a4fa7f6247eac5d8c1cc62c066c9bd4395d0455a939
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2b398a0b63fe43f5bd6467e9001673b60b3d8b76
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb11cc9d2229d99f45200d53d2430007eca65a120d988a8ace070a0e3754128
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..36269d2bb210deac5bfb20fc68c3a3c0ba2430d9
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b885790c722268908e56129344337198b0c0e4b3bf5e21a7f091d0846a5d30
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.27.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.27.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..75c54cf768728053f1051c6d1260296c943bc2cd
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d46493db19a5dc9a8d01151f769f22f10733969cad257ff2372fe9ef169efdc7
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.27.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.27.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..91523912e1e6240ee472d551a8422724c7f9396f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f80605e605d11e0f5a9e470c80c72859f9651f99f3db043b9eab3989fffd647
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.27.attention_norm.weight b/model_repository/turbomind/1/weights/layers.27.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..92e464dfb802dd2cde189e137b6e908acaec5c38
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b48e7db8fe774bd46f4eecc92ef7f6bde3cb8e3ba66836e6cae00572ea0e14e
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e7392da13e07a3f00396eb1965e2c22daece98a8
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a634ce6c3f2743a5e0fa245a0adf32df70a41dc7c969d40b1a3197f0436cdf5
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4fadfc7e45425848c37d17c3f39ffbbb822a8c78
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc138f3c7e31e1be2b6e2a57d7d5a2ffab4fa52343122dd272e41ac4bfd9096e
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..aae88c0abda360c16b47ef75abda1c4077edf25e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9052da467e48c0c4138fd3769e456cb753464bb30a03a4942846a5b3877131f
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3b2fa2b516a8c83d6eed1702e517e005ac19f281
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e1f67441bf5d4f5ca51f1f289e07a3c59907d324265741f76ad966bf1755749
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.27.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.27.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..c15c40329868b970cca611aff6e2bbe13d48abf0
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fda3309eb353c9341280ab8f2a516011494cba8b769560e91cd0c9d27fc6561
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.27.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.27.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..d8710f2aebc08c7c65db4a66ef9daeba362df5ce
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.27.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2367dba495b15a673a5e8f907f19e98254caa8845195d88897b3ecc36d7c794
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..11c1eafa7f15149287cd144977ef8e5a42645397
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1f9e7857882c7a56236572f8a03d72222b257c8d9ed6e2efa1d66c6b5e21fb1
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f725cdf5914a0af48485baa5a948fb90c3030913
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da00a72b006477cacf5f86157b6206faefb0b9a1945fed4e5f2a2f9fc9846f55
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.28.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.28.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..294eeaef86a93508f7f8b171fb8a303bcfb5602c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:626eff3b0dc5215c6954f774fc8116aa989824ab9c971a3782d8bce5ad31d0a8
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.28.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.28.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..90a1002de820fee0fabb5d5081cde6d434fa08dc
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5faf82a3313ab0b53237e677fa72b3b44137a47ab5f26d401a3bf43f5beb1bd8
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.28.attention_norm.weight b/model_repository/turbomind/1/weights/layers.28.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..1ec94894ca9c51e452e351065e83a91a22a1d264
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac4a8732ba2c28970db1dc7e821bd6c8b0e4de12f8de1b6bc6692840154562a4
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2ad5905fe8ebd68dafedb5c0bbe70d34f3f8c71d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f375cdf0cd1a60d7c9d00319853242606c44be5322598f91dbff37284f0ab67
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f8676ba3b145e257dc1c75c1f9d9dd86413bc37d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f57f5b0745ad5281aa67d83c0da6f1ebc7539dff487ae1345761bf995aedb1c
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e3532b664b06cd727ceb44f27462084bddb160c3
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:393b972c36770d253df01db59d0c889a018a26ec7a18cf1e69617828344e2ed4
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9cba65bef1506cf3787aac95439d21334e5424fa
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f4650f45c05fbd9d52eade717d47d32b1127ad57db10133ba490f5af3843551
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.28.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.28.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0a50537a8d1863c6ea2bf1177d91c15f67d42dec
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26ab58696d625c79d618dd907bbeefb29dcb441a358411ed99c0f88e8649e74b
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.28.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.28.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..335aa2710f889028753142ad7c1c770b5aaece8c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.28.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be67c63310802e47b331969149928657a52d9caadc4dcd0599f0ed63fa8fe4c3
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f7fb2a0c283d5309b0acac81e3f78bf535e119e0
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:964846927bb91f85e501fe1626e8958dba12656845d1c2963d6f0d31ba0e6fe9
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e4616ace3831b1353261ce821a222788574a6a7e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59389b1002ea4286ef68d6a28a48de0070a8fe63bb33881a4ea5b4d4824b586a
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.29.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.29.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c339b504ad1ca7893a586fe0fbab27e0414733d4
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a9f306da7ef17418be8aa9f47f97e653aeab2c155aaf1f32ea93c6e3e424c19
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.29.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.29.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..880d7d9c3c95158609d1215b2f6bba14a3a6c655
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1370f068209c9ab1f42b6657508b06a3511d1d2d8d2c5b5988f4d58591d40279
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.29.attention_norm.weight b/model_repository/turbomind/1/weights/layers.29.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..dc3408e864d2f349f03d2ea9f976241c0dd4ae19
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0136d8df649cc27c395128240a43f899929866414704347f851202cc638b9ec0
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..12bd5dfc4141909486de6f81eb5de2cd0541f243
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90f34915975f77f41c0057ec1ddc7e83098a74c6efe44d5cfcbd6252f7483773
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..92ba76313e8ccbbbbf563a230bc24e60c122fbbb
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56814e27f2fc6ea900d3623c77d1df558ea69fe154c99fe57fd45b6567a62186
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..850b76dcf051ec7876aa7626f2aee3c02df70a73
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95e520a4a76d63d5f4cfad6bb9577ab1343c24d563ee6491b0120e8b8f605a24
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d8434eea29d62735d93ec7d3ed91e73a56773a5
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a45ecef0ec7bb53ccdd1499338dfc1590c5b4d4e64ca01119d8e2eac40c5249
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.29.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.29.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..cf3ccd85ec2a836282f95d8ffa96f001a6c78bfb
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80000d50b78aad7b0076bc159838fbc0e679d1b07aa00f374142e40c5fcbba01
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.29.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.29.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..135fea41df0db406183c0c705ee1bf4e15b3d938
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.29.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2334dc6b4e2acee8b2c60625419023d8b5cb9692341970a8cb0cb0950658940d
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..741f2dbe9906898116ac1c0bcf6b6f1305ac0c7d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b023e843f1b897e2768f8aa9d1f18e1a2fcb8a17ee904981117c3822cafda263
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..58882890a176f4e5d124ddfbdce381fc920d5b9d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02c5a27de7ab84dc800a722021cefc12233818ba708f7ef20abed96d1efa3b29
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.3.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.3.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..28835af03e975d2a253d1b43e9094dcef5665859
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:961c0e6293f13ca0eb880f274fcf96b1394f554b645856d99f898ae03ba05ab1
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.3.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.3.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4941d02a83a0dab878ad6795511df8e08e216ce0
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6a94458f402b8342d3936d5c436bcc1125e642d5216c1cf70ad7850d134dbdf
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.3.attention_norm.weight b/model_repository/turbomind/1/weights/layers.3.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..fee571b50c58b11c6d17e7daaf1a1796af101e8a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e702523cc2696abf9ea5f86ca0c3b8110cbc92f9074f3573cd0935519da7f326
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6576fcc897f882a63b4376d2366b8a16b75529b2
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec59414d327ec0ca8adf200f8593102b1cbef09d5a97e88f7e6f3d1d941e32d7
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..39bfc8b9158d17ace10985a0aefa5ed9b27c830f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:592014759039919238673a2d601e2d397b3eb60f2b684d06201310dc35e6f870
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a2dc182c2e093651d77ac65087453506558cc6df
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c67555a8eae4e6cc55420ec37ea21933418f802190fc809bb33855011f8ec82a
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b12f9eae6cb382f2ef562f1e7dad7d8f2c7f4f48
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b8d6409835e70b1c0fdf81979b61995fb90f43381277f9e457070df5a91229c
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.3.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.3.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..1ac16014018db6a631b37da0836ea438c9d2fdaa
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b485c2892ea53a76f21e84c2ed42436b05a41f5dab146fab77f25d2b506ae53
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.3.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.3.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..86f8adc521ad298ee51185ebf02afa53325facc9
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.3.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76bf77db19b1d0234ee2da545c98ee3d5921030e6deaa8b2742d4e9d400d7207
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..087b322573894903eb8e5cf81dc0e4962ccbb4bb
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b085323586c5f61228e43ec3cf935799c983d169abd417a55a6c3f82cd255a1
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..de17498ac115e410694314f9e590322ecc3140ef
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:602a6e94ab5a7bda70167414ea1e71c46be0e7b46a69689d093f991dc6930079
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.30.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.30.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e9eddf6db391e55430e3ca4f04fc6966cdb3bc10
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5864869bc2f57778cafb236ed45dbcacce36836e1c8b3dd94fd1375829174baa
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.30.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.30.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f810acf8fcee1cdadd5b34adde32f9c37b177343
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c899fc162f4dbec0809e3059f9ed0ba9d3004a75d31841ade9aaf16df93493e
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.30.attention_norm.weight b/model_repository/turbomind/1/weights/layers.30.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..ad23a4893d3cffe2d398058b89dc78f528c91053
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:683f799d6ecb59ef5b47ee78d4d1653b6a49da4dc6c6865734f2832457ad888e
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b61119e589e6b7759f74e927ba8c5a5286eb965f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb97c170f0415eeb563dfaab343a6b7c736fb302b605cf65ac29e190d485f03a
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3f892216a36905289e63b4b93c0eaf050e7acc02
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:debf89602b57cf687b1f434d484beefd647c3ea0e8305484658248c8238a347f
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d0743b7b13a262d47d3c95ff5f00bcf70dca3937
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00143d530f528cfdded636568772b1ac564990d10d52c943463e8198b0f45b22
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..649ffe4f3c74051e77a62d2bd111b1c8956635a4
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6881934dda1754f8b7bdb5619bed9e9ec7cd819080a5080d36c545274e7563bd
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.30.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.30.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..19611f78c82d05c2fa778fc4099462db96768018
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c07830c7b5e53981d0d97e28af650885ba42b1395e88e2a8b553c080258be805
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.30.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.30.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..ebf0f2ce5ad46a9897b292cf74ea4074253d9e00
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.30.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7a7079eaefe501289467f67ff3ec35deb358c17022eff2a2d77c011d87a7485
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..33f1f7e919ab93f0f093697cc6564c8041cf7c9a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42e8c9373e34e9f38c5aa5b7f9e7282f283dd138fa488699361a998289d4f0b8
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..51b423248b2e8762a232cb9f6524cc2d2882e6a1
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e74870d817de1f15c0b372de19d9049754192d574290aa47cc2da4114e02fbe3
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.31.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.31.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..7976fa7add831d946d9634761ff8db4d07f69a6b
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:882c11872607c376a08d0e7ab4025ebae8050ca0a958b4678fa7c5f5fe34af8c
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.31.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.31.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..87b74517a018f5d65e974fc575140a80f0cf2f63
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:780d8a3fc0d41d7e42ab7524e0e8eb3a5044627584cb749954a08d74e8889cc2
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.31.attention_norm.weight b/model_repository/turbomind/1/weights/layers.31.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..9e1759f5a7b8ce3bcbdf54ac4a167aa2a3836eeb
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13b79fca3496315c35d45be930b96ac34c0616ae9bb69018d41d4fe7d77fa1c3
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fa724a72baf441d9817165d242ae54e77b819e7d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d6490623b97868d9d81417ecbbc40bbcf24f872882ca23b74a76f6f384082cd
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4e046750532412be4588ab28e7285c8f68bccf2f
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b03dd848d3c92adda40904bb369f812d1a2de1d72e53600bdf89cf3002aa5e4
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..7954c17e1c4aac980fc31bc92786998b66007879
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f491d3ff06bae3646c8cabbf8c8b6e14963e909e5a3f2cadd84931bb1acc076
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..1f95fe4038958211cbda9224b4161cae99e0c2e5
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7895c436da989422f207c0631685485aada8b0cf45d0db3bbf0cb18b8573d8f4
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.31.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.31.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..491eadebff5c76dbdda444c927fd0bb153d54dbd
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b010068e8df791fcfd32ddefe46198f72adc5cb104f59512820541ed232ed52
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.31.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.31.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..9ed6ce58e195ff81f658649f8fbf99311dad0183
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.31.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcd30ad8a1a6ae548b3b6cdbe2b3693c1d260fcf73e63e4cb201f4ff3a9216e8
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9efa7ae8526ee807be03ca3903436c1c4e096b2a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd04897e691fff067678bfb5826f8c0dae0914c4a822266312a9fd08f9c8dfb9
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b717a0bccf881f43c4dd4849aa9abac991f829b7
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a4e0a9b4313f6f28361952f5e1c00250e0bc8d8e348238f634679cc9983d4b0
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.4.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.4.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bbc885705f67c282413e4e10b430177fa24c64d1
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83ef42f037338f04aa63a71554b631e20e2cc1f4c44d0498061891de5d46dfec
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.4.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.4.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..0dea56a4d1087a93efcf6c1d4c45d4eddcffd41d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92669ba1e130035258630c4bb58a6ae23088baa4c818edb89d18126368fdd2b1
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.4.attention_norm.weight b/model_repository/turbomind/1/weights/layers.4.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..85901d7d4381bcdd1d25c69d8652668e9e82e4d7
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4392ba124c790351e1e804e3f6954b04df59cabe55918fb2ab208b9fcb1a25d4
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2eecef389220ebcbbb1b399d81d28d5c7123895d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efab7d32785919b64059b2e20f610eae03ee8a2ba95bcd5c2d786e3074f66875
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..203aad693c83911b91ea533a372c2414914f0c33
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:624fd673a1cb8d5eed0814f7d0ebcfa6de1f0933f2c808a43fe9915863d06992
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..22624a1646b9f3bc812053a3e4eccd3aa066e8cc
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2a9bc1f9a857eb51f12e913af082a9d065232ad278a46bf3312fee70b57c929
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ba1d032b1632c72d516bf607d69ef9d858ec3f69
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f5a160ff8d293e97b6037541c207caf6ea4b15e625bd94dba7be81f1aa3052f
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.4.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.4.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..10fdc6cff9055cfb29be992fd58fec67e3a1e156
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7584bdc2460f81e60ad3db90f314b1c3c0bb458b724ad5a8ef2f6b87991871f
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.4.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.4.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8ab0548585972c0f9a19539e4f0246ed192f0042
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.4.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:734c894776290dd532cb25f542e38b56c9151c45fb751e1d58f5aba3c1cf86ce
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..04ab0a16f4f6b5b500d30b4b27152a073d6efffb
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76f7240f7f94715ffc2e22da1e1986a7738b3a81d2803a89fa8d467ab37d52f3
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..35b017f6b8442ef2ed28b4f1d7f2aab7e6c8f3d4
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f30a98755d5e88115a8343930c20bbfd34ef8095694f4c0709b299e0ee587b25
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.5.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.5.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..4b270cc9d0768c5834bf5dee3db2ae53b9d1a2db
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c2c8b87162bc3f8d4c6044cbbba5bff1a0b4d484418966d683cd8edd5ffe289
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.5.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.5.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..2170f6316f894a43c57df7c6f3b6435d6d290e59
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8a0bc293e079e00c8fb29ea166613fb81fc7a51dfae01bda404298bd3541858
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.5.attention_norm.weight b/model_repository/turbomind/1/weights/layers.5.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e56c76ec2f895f4ab09e315bcb026a0cd110898e
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e322bf9e96c707a007b6cf18e95291034a7b4acc28cc9c868ba72a2067f42a4a
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c0603e429404aebb532d112009658a498d6a25d2
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b111a37c3e4700a7ac8bcc755e22baf0cdd205a4f64cce28587b12e6bf542fa5
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..54720e241e1c6574c937ac39760a84933da14ee8
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ccbdd88d473982cb63c5daa191f2956e0826feff876c6303ad46054ce474a9f3
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f69f281b519e24e86576e49e914a3f29b9833837
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d055b75469902bb480fb2470766fc359100caf6f512e030d846c895cb23501e
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..39d27ba627be29fdb76869d39b5a02b38030a6a9
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf2b8068885689ca049003d3dff4bc8e68b47ddb9be7d7fdd56b39582b7fd61e
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.5.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.5.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8f90bb2bd06c0ff2405bb8ca61c65441dc384653
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c886bfe39172273f70831164b7b87f48054c0da65cd1724be839673c817009b9
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.5.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.5.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0032439aec9359a437391315477b7201d232b7ba
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.5.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b479855806803e6c485764401a2ed76b362ac09f2606a6d58fbba9b134ee186
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..08c09cae235117db0cf2be801f075c4236bd6ba2
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebf9ddd2465c02a1a37bafe82e009127d6cbbcf0bec3b323eece36934bb6eeff
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..78b67e25716cf86de09b47dc537db6ec420fd21a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b39acb9cc4de067c3ef5b0128c253ad0b646756445766d91f2421ca30ab6e272
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.6.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.6.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2d2cd5ddae6f67b08f6610fd6bfd8fe17ff43ad7
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81ad5a0787961305a05ec9b7c0fb89cc2aa70589a36efea39557a8ff33be93c9
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.6.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.6.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..380b6dedbd40afe6240e0271cfd0000ef9f17b01
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edadc4493b3568ab5ebe758a1aedc2ef5fefcd688f5a78eb1866379967ca1cd6
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.6.attention_norm.weight b/model_repository/turbomind/1/weights/layers.6.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..68cf1e82a5f3d60ef2c37bde39437efe411c0263
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5dcd4367593812ecec39d8b1ff7cd21912c1283686db24be488384fd2453162c
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f66c0c431c68905f3cc431d2b266b628bcc1f9b1
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3cc20446684f9b809fd52c40bda9d32c115789c650575c0e54f5ab030b7ceed
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..d158d234d215899f80ded95207cff364e20e0c1d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f01f13b1cd0cd8080d7c4906d71e44200b8053aa605a37069f1a9e1034a81f93
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..0bee7d213091341bc193cd21b808a3776987b7dd
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95865a00e74b9d37ba9c21241922979b4f26eb06b78b84b25be12bcfba617657
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..80f3f7257450ba5de9d4dabaa61b516c7c807046
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0dcaefa2acb86a25aedc25d60558af179bbf8968f1fd023b20343dad73b0184
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.6.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.6.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..b56799656e38d049d14d02b2d7e4ab1e470bac6d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e458ef7058c9d7734737447072dc2908dea9ebf64a2ebcef932e4d6832057f5b
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.6.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.6.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..70c460d32701c69c43ce43977e55d4c5e407b1c8
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.6.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa3e886e06b35057d676139206ed116fafd8c8dd29244eff07cf1221837e8807
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..4bd1b6da8292c5b10b20dbee8e2ee7e95a46637d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0c4ca025a4e163c0dc2da98d463549125001a9cc93654f37907cce2a9882d52
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8846088f9a04128c3626ebdde6d6747d1d663587
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c086c5de28164657905ed6eaed423d6244ae0368c6180aa26fc0a6eb89724a83
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.7.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.7.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c4891059c086711d0200456b57dc31f93418ba81
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efcb8926a09d3f78acbff4e19e2e5bafad04172d17321a6af2b4fe7974c40fe1
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.7.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.7.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a08abb8652ecda43c661807290bbefa793fb0160
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c0cdf8402670c6998b317082c140f0eb51c4bb0b41ca4e6386c6f1648f56a76
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.7.attention_norm.weight b/model_repository/turbomind/1/weights/layers.7.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..37c18cd18f7054a248d6352d4d5a25ac9a4175e5
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28cf5e25d536f7d9180c2eb1d7dcfd7d4bb749816849f75c5e09f0210cdbc417
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9b50669a9dc81bf91e567a299ee57d333907a007
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0581fd7f812265f9b47b8eab7621664a046c4c6f98279676df767aaf339eee7
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..93d6f40d2e5bcd8b2a2da3d12418121279963070
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f86e5d5f8bd7d8eded5bf5a5cbefc9b1b3242cdb2b486f6b1b0289d75f4df828
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9d07164c18362f5b0879cc88dbb43ef395f284f2
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b02b881d979d0fb77a4d705ed4bc68ca58e7cfa84a504d90b9e816ddd99a6b0
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b95f34d475e6c10781aca4639fbcadc9e706fc5a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0c7e60168198f2ac9347ac8eb4fc59ea42fe0380e24550cd4fa2e989a2d90b4
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.7.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.7.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..7669f396fbea22312892ecc7e69f5847e3e3d0f7
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bce0233aef9e8401ea7eaddce5b44f2a28b6fd1018023ec3f2cae495f4d205b6
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.7.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.7.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..d2b299db6620c0abf87b67b228dd03b696854499
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.7.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae08ed15fa296e998f7e93b866fb5536103b357ca8fd0e8ee44423c4fe3ea4d3
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9a071d9e1c24a362c04a0f4335000d1eeeadbfea
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:312a5231076c36e023c30c18761d4793c7aaf2d1658f740a4ed6fe3ab9fb9532
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b756258fc2694a8580c1d6d55d73c1aae4f88737
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:045eb164e9d18487951013b4a69dab786f034139e232a0c079e6c6de0b84d445
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.8.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.8.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..79dcacb0bc5ed37629a105bb0afdc20c383e1736
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:917ac6b4102a88cb5fe47a13834f30fb45329e8234e6bf4a6d5def09acfca138
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.8.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.8.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3f21f5d05d73002cb0251350fce183ec3b6f82cc
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:075ca25071e36779993618787bcad51f47a6210b5c7efb13836b9f0c39113c7b
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.8.attention_norm.weight b/model_repository/turbomind/1/weights/layers.8.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6441edc914d86ab07b46c530e63df5e212499fbf
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7645c5cc08248a97031708e37a8869793e72e86be7d529ee2d38214aa125f326
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6b623d7f4ebef4670369d48905c1f66aa9b3fd94
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0a76bb17ba96c365a1bf660f901c21c3fc1d15165b0532e97c7ad86158513f0
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f7b56f5fefdb81227823903289604a2f9e33cbf6
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f6cc9bf35da7c08e89248a2d1151ca84f97e0d44fda2f474fbe090fa2b71bc6
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c837700cdf510ee1df94f861174695bb0e1ccfc8
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67d6a461146ce6fca245beab647f837c7718f50c1ae6d48f852becd4b88ecd68
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..63ba13362b7c68d37224b01f241452a27cf8717a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22f763f7c06275a5821c55ab0428986c7982da93d02ec561c4c1cf0bc83cb82a
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.8.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.8.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..c4ec482ee099d1dd8d7b2633b38f9546642f8c04
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97f607d08fdcc7d4a7048194e994afa25c34242bddec4d56534a779484534dec
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.8.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.8.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..dae30d205782945d230c044159736e88b8c261e0
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.8.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55e7e6e9663622f872cb332c414eac32a102e97ffdf3f5a2b6afa6f8371e1a5f
+size 16
diff --git a/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..502cfce88cfb73bd839f1fb667fba672259c4294
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ad1c9bfda707333f5860de8512ec7db789721d5f17e96ec0c1f79f98533c42c
+size 12582912
diff --git a/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..47605d66d4acddffb2885150c9d68d184f94a9c6
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b5179dc3fba3abadb58abf409bfef33b382dc7373a002c3c43da9785c86f614
+size 786432
diff --git a/model_repository/turbomind/1/weights/layers.9.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.9.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..0c3613bd080dd0fe0abbe07c8a567bf85e48e33d
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:535eb0ed2a008590448c38ddcfcf990219dd0c1752e28d11fe3310cdf4039d57
+size 8388608
diff --git a/model_repository/turbomind/1/weights/layers.9.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.9.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..bc68d0462949d41fb22495d6fc4d8a2c6c21b6a6
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee2d02d009e36ca78d86a48ea408c2017c21903b64400397a77f437f495d936c
+size 524288
diff --git a/model_repository/turbomind/1/weights/layers.9.attention_norm.weight b/model_repository/turbomind/1/weights/layers.9.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8493ee9741dd897107d9fe3cea7c2d01fdd4dee5
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcacb811b4cf62144e1ac2d3eadbafab30083e3420c46a92df1ab21840b29fe5
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bcb62122ef3b2bf1d13099eb7e64cd4f6266f02c
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aca67258bcd3c39f17fb15a14b72cfe8ca597aeb30e0f4f298efa5eb093abcf3
+size 58720256
diff --git a/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3e0e6af0add56eeb2e1cf7bc0142e52be7a5ae29
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4b60ceaccc0af57c36de7cd69acf05d8c307f2d6d27a7e765e0f132ae95d17a
+size 3670016
diff --git a/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..399c1fc8d6cc43a27e802ca067c88fc4f9a3bc73
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e07e422f44ddda11dc7404b257cacd675b2b7f44491941e6754155df3a31d2e
+size 29360128
diff --git a/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9509fd872d04e11bf53f07f99129e785b2056187
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cc346804097116087236c77f2e2c018922efba4f2e32d8a71ddf8b026c9d34d
+size 1835008
diff --git a/model_repository/turbomind/1/weights/layers.9.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.9.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..185031880012c613c2cf8937d4aa159e1c93a4c0
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98119ccde8c54eacba56311e43a7c74e62e30e0d7302b011202dea6a6348ba66
+size 8192
diff --git a/model_repository/turbomind/1/weights/layers.9.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.9.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0ec9f90c9c5be11398b7b1bdba1df5b0975ab0d4
--- /dev/null
+++ b/model_repository/turbomind/1/weights/layers.9.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62cf0a7960b56038dd17b81e2a1c38a016c2b78bd7272299dee18ae8e53e5c92
+size 16
diff --git a/model_repository/turbomind/1/weights/norm.weight b/model_repository/turbomind/1/weights/norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..906361178f72cf7bd1f01447accc35bf0e1b633a
--- /dev/null
+++ b/model_repository/turbomind/1/weights/norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efcd3fb0c1c5225c17e0eeb5b46068bb7311f716a4908d5a39d79b37985b58e7
+size 8192
diff --git a/model_repository/turbomind/1/weights/output.weight b/model_repository/turbomind/1/weights/output.weight
new file mode 100644
index 0000000000000000000000000000000000000000..04e8f86f0b46051b3db62d5eefcbebda87641472
--- /dev/null
+++ b/model_repository/turbomind/1/weights/output.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b0ed41b4df8f91647fc8bdd2aa61f55c39e09b6e063c8bd509b591797293919
+size 758120448
diff --git a/model_repository/turbomind/1/weights/tok_embeddings.weight b/model_repository/turbomind/1/weights/tok_embeddings.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0b3edbd16fbb690f7c781043ea905fd4380e5f04
--- /dev/null
+++ b/model_repository/turbomind/1/weights/tok_embeddings.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8986115ad7e59813a41c88c0d601235fa36138d6c15e5657a050cf4ec40fb037
+size 758120448
diff --git a/model_repository/turbomind/config.pbtxt b/model_repository/turbomind/config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f139d5b2234c0dfa94e3792dda985f9e8034a5a8
--- /dev/null
+++ b/model_repository/turbomind/config.pbtxt
@@ -0,0 +1,293 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "turbomind"
+backend: "turbomind"
+default_model_filename: "weights"
+max_batch_size: 1
+
+model_transaction_policy {
+ decoupled: True
+}
+
+instance_group [
+ {
+ # max concurrent instances
+ count: 48
+ kind: KIND_CPU
+ }
+]
+
+input [
+ {
+ name: "input_ids"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ # allow_ragged_batch: true
+ },
+ {
+ name: "input_lengths"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ },
+ {
+ name: "request_output_len"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ },
+ {
+ name: "input_embeddings"
+ data_type: TYPE_INT8
+ dims: [ -1 ]
+ optional: true
+ },
+ {
+ name: "input_embedding_ranges"
+ data_type: TYPE_UINT32
+ dims: [ -1, 2 ]
+ optional: true
+ },
+ {
+ name: "step"
+ data_type: TYPE_INT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "session_len"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "runtime_top_k"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "runtime_top_p"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "beam_search_diversity_rate"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "temperature"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "len_penalty"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "repetition_penalty"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "random_seed"
+ data_type: TYPE_UINT64
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "is_return_log_probs"
+ data_type: TYPE_BOOL
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "beam_width"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "start_id"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "end_id"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "bad_words_list"
+ data_type: TYPE_INT32
+ dims: [ 2, -1 ]
+ optional: true
+ },
+ {
+ name: "stop_words_list"
+ data_type: TYPE_INT32
+ dims: [ 2, -1 ]
+ optional: true
+ },
+ {
+ name: "prompt_learning_task_name_ids"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "top_p_decay"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "top_p_min"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "top_p_reset_ids"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "START"
+ data_type: TYPE_INT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "END"
+ data_type: TYPE_INT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "STOP"
+ data_type: TYPE_INT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "CORRID"
+ data_type: TYPE_UINT64
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ }
+]
+output [
+ {
+ name: "output_ids"
+ data_type: TYPE_UINT32
+ dims: [ -1, -1 ]
+ },
+ {
+ name: "sequence_length"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ },
+ {
+ name: "cum_log_probs"
+ data_type: TYPE_FP32
+ dims: [ -1 ]
+ },
+ {
+ name: "output_log_probs"
+ data_type: TYPE_FP32
+ dims: [ -1, -1 ]
+ }
+]
+
+parameters {
+ key: "pipeline_para_size"
+ value: {
+ string_value: "1"
+ }
+}
+parameters {
+ key: "data_type"
+ value: {
+ string_value: "fp16"
+ }
+}
+parameters {
+ key: "model_type"
+ value: {
+ string_value: "Llama"
+ }
+}
+
+parameters {
+ key: "enable_custom_all_reduce"
+ value: {
+ string_value: "0"
+ }
+}
+parameters {
+ key: "tensor_para_size"
+ value: {
+ string_value: "1"
+ }
+}
+parameters {
+ key: "model_name"
+ value: {
+ string_value: "internlm2-chat-7b"
+ }
+}
diff --git a/triton_models/interactive/1/placeholder b/triton_models/interactive/1/placeholder
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/triton_models/interactive/1/weights/config.ini b/triton_models/interactive/1/weights/config.ini
new file mode 100644
index 0000000000000000000000000000000000000000..88f3d40970a1e663689736be546f8d3d64bb8734
--- /dev/null
+++ b/triton_models/interactive/1/weights/config.ini
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c8358cd3fffcb86829f6b600bdd0ba77b6147eed572f88700ec4d914db070d6
+size 645
diff --git a/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..4f5435a75963ce7ce17b0536f500c8ebf8ca4220
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1763929a6e7bbdafdb81d39ebfa08263351ccea12347aa68b292b1b7c458e45
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..52107ec494683ad0e0403e4189bcceed1ceabdcb
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ed40e83191f5304fd2df93ff5b90ae9a165bbe489af8020e06948fbbb289d7d
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.0.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.0.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6e21231bbe43b92e43a0d2600ed6969f6c00e767
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6710235be94402052aaaae809e488f433d75d6d33acf546e2d0bf7aae4d8f0f
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.0.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.0.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4961bf6cfbf6ae7592675c56d719924794d8da68
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c069c91ef3a796ac2e9e0230319fabb6bc8433c68284c6e5ca71baa477a3438
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.0.attention_norm.weight b/triton_models/interactive/1/weights/layers.0.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..51dd734ab95204a4ce7fd026707a375f1a85219f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dde3cfe82d02d87660f40c667186249cd17a5ee5924ab2a3ea0385919a2d0f3b
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f3167a75e6defd59aa396437f58c797bb5cf1b2c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26bc912102aa2b487baf312f3bfd8f97dc46ba6761c2328bfd3e45581bfbcfd4
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..68343cbdcbc17ec725af43c1a1d53b62bc5c32c0
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:309c93937a8778e4e4dce879efd1e0673f4bb7701644628abbaa8420e5b24cf0
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3e82c77a6ba7b16d19d55f544f872223d33fba6d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d096d08769d4b05f7483b4ed024224e0d4d35772231e757157e69c9c0dc1c6ef
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..fee7031bc4703588c99d993aaf4e1c0f1d080e5b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdb73c0a0f614f1033850266d6ff4311374557a2653e0fa7857f8507ca87058e
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.0.ffn_norm.weight b/triton_models/interactive/1/weights/layers.0.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e8f321d4e16161bcdf7f2b6979e9f90b8aa04ef3
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5b414270e0d50fbec62cdab6ecd217c2f688872d5ed7d9f91bb75dfff46651b
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.0.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.0.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e376c6acc6ad65b07267f834beda69a889c5f0b1
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.0.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25f7250671024d0129c45c3f3d8f57887921d219c280350697d41e9170925c77
+size 16
diff --git a/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bb3ebc7beaa1d925c4a14fbad6d2df2ec6bad94f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a125e82d7ee989858902abca2bec9dc3f4ad74008f5307a1e7a635d148c53f3a
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..bc0ed1f6f8ef00629e07ce4989e2ddde96723c08
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f96d91127194d8a8404809f81602727e59903c86473ee27012bb303f83cdf77
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.1.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.1.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2eaa43207863db980e17ed160bc4613b175baf27
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4905342d79812e6bd9d6d993443ee6b30df2f80cef44176d1398dc884c458bad
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.1.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.1.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c136a82b25947dc950216cf643734a4a5ee81a36
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c7971bdedd76bbe5630fd97b2badbdd26d22055ffe6fe0374fff051af9feb80
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.1.attention_norm.weight b/triton_models/interactive/1/weights/layers.1.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..abe49b3b4fe282cbcf269cc92e4a1b03f8304d1b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d589a6b27b707580d37c4b198dc952071bb1a34967ebd9175f9055ac012bc781
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..7d2bbd8d926a99dd1ba3adf0859660ace736b884
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1dd761cf75a1f95c5a55a245fbe1a8bca8967be0d7a03dd12108d0be835d7682
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9fb67e07dca86f3c043855b520b84ed83c9b4930
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d4fdfeee03517f7896aadab5adec50c8449a2e1bda2f0cf5b8725b26057d1f6
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..83348571bf69b92747b68f25d3755c7b2146e4c5
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0c42be27fe2e9f48473b5cc4ec63cd06575ade857ea8699b4bd05eb4f801dc6
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7f8d31081aee57241eed23ae114dd5e39f9e6bbf
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe915a8697f98fe80270d235325b469219fac1c8a4529052fd15f6b1ee8f13e6
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.1.ffn_norm.weight b/triton_models/interactive/1/weights/layers.1.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6db47869baaf62ea10c904bb39ca2fd8dcb35aa5
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90fa27f32ad04b368d7110fb689b24ea02904efb2f2b7a9f9be876c331fc7212
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.1.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.1.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..87ba80c2080cfc64bd645133d99c4fb0f602b920
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.1.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08456e5241a0fbd14699cb889680261c9e0ca7d30051066d899e99be24e15d52
+size 16
diff --git a/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..35f6c98510eb157f0971d9d241b2ec765cd3c834
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4d8d7ae69eea66730a10e906758105f2c99b16d082b9ea84d7e7cd8afcdbd4c
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..77eb52490f504dbd5b089674f267142c27e7acc0
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2885240377b91bd85bbe4ee6f67b8ca23233584c35ce71b752f9f3bbb66e266c
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.10.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.10.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..440d3e309d85cdfb81736fd024a2834f4d0ce308
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae5115820467dcb2720eeb7abbdaf3ecd5edb56d9d7453fb0bf4f6b65323445a
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.10.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.10.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..18b5ecc65f6f8133a1821de0925d37622a67af48
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4438217ed5de15cb91f4e30f0644b08952e981d25015dd4b75c4a0cae83517c2
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.10.attention_norm.weight b/triton_models/interactive/1/weights/layers.10.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..4f0f39a02bb84010dd644e2fc96ef3b46d4c2820
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cd2c0d884542c0a881ef8fcfc9fbcc1feb67afbff0a8befc9bb741e2d8ea2af
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bf50b623e7b1f4520d761286edd1db51a109c4c6
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a1258ea1e97e4c41db26a363eddedd3bd47c6d49f7bf738703c5746c54f4e37
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ee36f684587a649d68d9579441ca3e90af8d7d6e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48e7492a7d4447980961b5891a0997f2568bdbe10ed15ba0998f8ca1bdaf0a4c
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b0cce8413321f6074dc61c7a28bc92377f4c7ab2
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fb81b3c6a3f7b674506b003621b7e92925754e97d23ecb1209003f2232e33cb
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ce1603f2d10d9ae9ef7251cb66a02c3e0cba6b67
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:773b9c8eb4a3818b2667162b3169bd4fe813f2fcba5c708a49b79fa5c5053c61
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.10.ffn_norm.weight b/triton_models/interactive/1/weights/layers.10.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..bbe9a16316f0db34745e41ef00224f94b9237fee
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b576f4d059d0f37a4fd3e626e640dad540ff4758aa449bafe55a78046a01dc9b
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.10.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.10.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..da0421db9e924c29c37c13c09376487aaa383c8d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.10.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:430d675f2f2e4512591d558ea6f29e42dd38c55ffcd8d21873a12e9ff90e15b2
+size 16
diff --git a/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d5058e0b21a7342d2379f3a9315e85ef9bbe7682
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2871ddd112a88bb89a549de3bf1c53af525e962e118eb7ad0feac6a56599a26e
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..92844164ec6f5b42e8222c577ce94bae5314a9c9
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de7017bdedc110df3a9f9fab19466968a5488b9ab3ad533f0908f2d368371adb
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.11.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.11.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c67e6d4b3e11faa456791b77155fef70589e246f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:530e3110fadceb664c29ff9da577cf401128e93ae21601affd1c62137b04db35
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.11.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.11.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4e0d310e48ae8ebd9b629872134eb3687a55e341
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1725da8fac86700a95c4ee9d40cf9ebf0d1ebabb4b145c2d57c4a31c42299cb8
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.11.attention_norm.weight b/triton_models/interactive/1/weights/layers.11.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..f57dfc1e256d2fca8f1c8d59982ea28fb2f209c8
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4cb24612b49347f84741d6daab9a90b828aab924fc9b21fd2d2ca6b67abf8ea8
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..27905dc8bb55b6305cefdf0135d72eda3e7e17d9
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0af7f58d1e58e6610b5b56291bf697d79471c1eeaefdff9466fdc87996c3c86
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..474796975c206470856a63e5627806fdd1a9d0e4
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46e2d6846839f995e9434c35519a1152c52285d29672febe66e9f07b0e7523e5
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b8e4a4f967601a2151a7eb5da1c126599eea4743
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ae182cb83af72cac11a76113fc5492ae4ccda1cd45df36facac10e65369d22c
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..aac9a3ac0afb93d279461dacd82e1fd80dfb6161
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54af6ef8d3b0aaa32183d5fb176a4d2097bd043e44ebea37ba43ac4021e18253
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.11.ffn_norm.weight b/triton_models/interactive/1/weights/layers.11.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6f958acb3e97bbc263ba99adb14ceb897dc7e573
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ae646b4e03481a9e0eccf0a151deeae360012b79d455f413d6b4c8c05ead016
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.11.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.11.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3bf7aed58e43958ad08d6b6e8beffe072f7e15e6
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.11.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:114046d9b18a39823a18019529563163f191e5a74c65e959db74c96b77c9b4b9
+size 16
diff --git a/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b026bcfd8643c18461670a5a2980cf9a8539bb2b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d30b7fa1db362abf3186072da75c305cd7e79f90f4b1eea6095014d9f7989da7
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..011903f321dd322447298b693e1eedb17f35c3ac
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:654fe994288ed138b388cb0e14a9c4e7124b601ac4efa404788e3267ed137307
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.12.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.12.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fd89f748d1ea906c6617d240a4e123d243105b64
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:069d9e054d6cd0171b229e37a70b6a2fca364783cc8e80de9f81060931964e0b
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.12.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.12.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b46cd92e96aa0e40ba260aea37674bdb9fbf1fd6
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:394968e46096fa0f50701fe0d09193561276359f023ea5dbc3a16bb3f1aff8b8
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.12.attention_norm.weight b/triton_models/interactive/1/weights/layers.12.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0020f8c429974d047571347728c95d5259c0da58
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:020a5a9ed0a5065303d1079d24ce7252b639f6f76bf49c7b8fb5fac3bc93fc1b
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f5cd9ca940d4417db1082cb6b445b56fc3ed304e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9293f916e4009deb3dd715ac0fea08afe5be75548d2fe2e70a67fd5826664cea
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..be6c9b7b29a56d2d3afaec63b36099fc29d1ba80
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89899a4751211dda4328e2380ceec5d62d0d0b13fd164ccb7c9f5e189409a08f
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..53e4822e263ce179450dcfacefe7dd882447324d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f0f0481d3c7eeecc2717614f38dcd54163c287431e82da95a1e8d5fd182cc27
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..2f8d90a6c38370788887ee529f4ad8c7b4fd6593
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:690b11e4c0f825ec39db6b53fc1ccdd51d051c752199195f2cff8079ef3b980d
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.12.ffn_norm.weight b/triton_models/interactive/1/weights/layers.12.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..35e00aeee302ec1726ef04c71f2a2f429fe0d23e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce6abd982c6b4b398f13a6113cfaefff0fe65190ff1b232c8b9a68acb30fbfdb
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.12.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.12.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8fb69a827363200f7cd82be1b4f35bab6e143bb7
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.12.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3cee21f879722a16a454f6455c8d8c3aec77cbfdba6cbebac9c4762d1d03bb2
+size 16
diff --git a/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..63d098e6067e1aac3d4f6083c34f967abcfb40f4
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:983fa35043fba20d8f39610fc859862486472388df708d85176e198b9493f194
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f78fb596aaf17a70c0fc17098a02d2fbd9f8b12e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bcfbdb8a6f2d86500e49d21e3d0cf88dda2e18b505be8459e46962f1a5403902
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.13.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.13.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d0443fc30519b3ca74b5e3d4e0317af1dbe8b32d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e76d5b55510b3111a4c8068f8bf2abe8372c9868a5346fd03831633817f49a3
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.13.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.13.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..6cbcd17aed1ae804e9e87a936274b99c9ad81296
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da85282928c5b1723c48e93cdadc416b400deb61bb90f28c4675989ab7d2f4f8
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.13.attention_norm.weight b/triton_models/interactive/1/weights/layers.13.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..58edee2f8e729e06965c92f434900ae4f75e1a49
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:592d7039e973372cadcf8b3f717c19ecbcb911e2f40140d617855643bf2bfa3f
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..0f2f191246be551220b2b9df11e88d070f4b63c7
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1cbe619508e858a2637045e1e07f9cb0ec4c6020d6041e40bc9558aaa9fd290
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8114a135ab96b7c28393bb44bad7050a71bd712c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c555740ee91741c87411db09bc23b419caa191a4ac0ccf7e34b00fe64e614493
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..efc53988aa0826924baa6153c20d1fb1abae3183
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5434cecf17636b9bbdf1df6ae4b6d1eb6c06a611c93fe0291ad0d3892d850a81
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c927886fb77c90e7e2afb11bb38945c179e779cd
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c89194f222aef9d0488e0677d654d9f4cc783cebad2ba76e9013ef99684a1c2c
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.13.ffn_norm.weight b/triton_models/interactive/1/weights/layers.13.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0044a510f007c3e66e363ee02bbc25f4c26cb6a6
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75cc6d0e292ec019791db0f7ef63b0508d8a5d19404fadb09c1b06a8dcae7cdb
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.13.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.13.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..313f047a7db61ca9b3fed45b948aad24958ec896
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.13.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e86a948027461837c94daa03c444ddaa2a484bdadcab47a89f78d0d332ba0370
+size 16
diff --git a/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d34a88071016d52838a914b177b787d6b7f5e989
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd65317b8701a195eabe835058a9366309ad055eebd4354fe994187573dcfcb4
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..dbf55a9dd11b2bb29fb5f7a2ec180b89f6372195
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a8b7af909bb0ee02940f92c80cde0a7a869e60bd4778c7eb5934ed7134b1e56
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.14.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.14.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f2e7385fd3b0a6c38260980964dfd035abe25f95
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f17aa0c464ae8e87100f9946574744e554c50847775d5e3cc888584c920b51bf
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.14.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.14.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cca81645ed7af2fd8f2039c751f0856ab6332929
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac63fb5629b386babfc0cf09324e8388735c894def38688f57e5fa413a76a6b6
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.14.attention_norm.weight b/triton_models/interactive/1/weights/layers.14.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a2e5c82b9d622524d9390c76957ed9e8994aa2b8
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d54e43cc40808a7a12fb34802e7e3fa239938943e4f247ea54556f65191e0e
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..efb7ccb2234e6b179d310051c53ba547a39f7b6b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f974af156ac932cd0619e0e86095071dccc8cd0608319df5c1042492b2002e9d
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d916976c94c174148b04db334b907ec77c7d638
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5be3c8f04a42c5e0c9de9d00508fbb981849cf188dba80cf6127d8f4b4b712d
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c926dcac71d930076be55189beacbb36cfb1a777
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c33e3534172410d4656b1a244becc400d680dc19664a6fe5d2531f0733b24b1
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..78c574771e660fcfc3a237c9d56afe57b62f1ea0
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3be2e077ef369c828ac8f31826249f327d120baaaf9d0141f67b9a814f95a57b
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.14.ffn_norm.weight b/triton_models/interactive/1/weights/layers.14.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3094bf1d424cd5ba8300cb6dddb32e4bc9d78073
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdb3dd1a12abaf094e03a1d933aa4ab506d5c4c0cd21cf0802c04f4a0d5a85c7
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.14.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.14.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a1ff0007bbe4e1f0abfdccce67158196a9b3ba13
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.14.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39dfb751ce93881ea2c4e2f68155583024cfcf9e85b5705781348b079cc29b0d
+size 16
diff --git a/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..8d981e2ef18ba6fa67894151d2e5d33aec76e769
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f2d6afe6100ef0eb47d5b379ce3faa38ec1063ba36d47d9526647ea7fa4bda2
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..92d62c8db383b4e459224b1370a1d87eaa416096
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8abb8c1bad2acba915885821b231c1884cd63fd978d62d23a25775671c97f9b
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.15.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.15.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..43781b59b7834c4758226fadd3757cd458eb9001
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fca2dec7e83b35a6b582edfc05ddf49890b234aeba53a3d88384a436cc96c4c1
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.15.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.15.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..51a58827bb1c84c5a11deab1134c99e4cd37f472
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83bb55b56df6d0d2c1f6f04d894e5d6e63d476b8fffe1dd0441a892eed850502
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.15.attention_norm.weight b/triton_models/interactive/1/weights/layers.15.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..7e895dc7fffaa82cf585391595f009adf667e4cd
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06c4e4b6e08466593216c5fffe5bb16fbe296be7d83b8d67084a728b4f0d26d0
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..8dfc85e4b6b9e369447163acf76550539913fb5a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b271e071ebc5f1e37284433f76d394ee2ba20920d64e64355f6c37672bd68f3
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c0f10138fba546a8c454600fd6a73289e0a7f8fd
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b42f1cdd3b5b76e04cd4154950ade000eff8bfc44853c827ff351d00526201bc
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e0d0b67b1d9d4d9530690ac220e426dedaddb1fc
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c44d9731ffc2bbd8a368f60064a8e8e85f50b04677d059c25fce70aae38dc81
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a99be30bc9c12257d3764ef09722a06f15ef0437
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:287e909a7bd9bcc0b456c57c361a614c1898383785bccf9f57eee7f91599e3b3
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.15.ffn_norm.weight b/triton_models/interactive/1/weights/layers.15.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..630c4372de835971e521542c84649a00c3b2e403
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8dafc8ea6132b5caec667dde3f6dda741e7ff23e40b8ff5f5ccc59232ca434b
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.15.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.15.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a47b7192fa2a190ceb02a526a527aed679e93740
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.15.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c945e5779fcddbf5dff47a4c3502bce9ba0bace5158abc583e852d1418f9513a
+size 16
diff --git a/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b17d911138bd69b5faa2b303479e7cca9c12b659
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf8c2d841b0c3dfd0a4349bb4aa84c0d85141c14277e879c033484e225096715
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..bd4333af13bff4ad87c753e24461be8ab19102ab
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a54b05a6ce8083736ca7db382672bb83d215649338920308cf0edd2e4f1ae07
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.16.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.16.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e09e8104c2418067fc961e4fa84dc074da5eaa81
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b8f9b5eb6ea1827048eb48661af27f66fbf5f510055f7dfc813f28f79967c83
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.16.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.16.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a056f4943ce26b8bb7e3c8d3d052feb2f324a4d8
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3031c7a07ae7554fdc02af0112aaf4f343c164f1da7e65ac0926e0b33ec1daf
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.16.attention_norm.weight b/triton_models/interactive/1/weights/layers.16.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..710904f88b607829b98f69d31a704b5ccb2180d3
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0996c709a45131cb25cd72865a06e38920f31941b25f83f2d78ed5751645c284
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..ea56d48779234f87b2b0a859e2cb110d0718e2b9
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50fe105dfc87e7a2f06e12b9d1d92899b4b20106d29198eb7f8156c888b57620
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..5773631e90c5be54da0f5ca15e355b6bf855b4e3
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8081c981a8cc02210f42ffa6b41e8f8a018cc273f18dd184e7a76ea6a14af908
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..5a19b7dd919248c1d8f24d12508ffb36be409a0b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b58ad7e7bd4aaf5109590b6f4b500643cea2e5ee7ecf3de2f2bafd931fecbba
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..17e81af1aaa097a81bf4407a23e87dfb0810ba73
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05659661021dfb93c23ca810756fba0afa33f7dc7103bb74e79a5b5cee0630c2
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.16.ffn_norm.weight b/triton_models/interactive/1/weights/layers.16.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..f45d501c72951cd1746375922f7e113162bef097
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:990398b91f28bd4d0ea10d21a8f911746291d93d353659c273a0d263f3f8b26f
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.16.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.16.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..cc7a02ca2638e540d970eba9c8c2ca40c599f58e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.16.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a46e5538c6531808ab35a4aa3f8acc92997393bf5778110738282e7d0b5a6253
+size 16
diff --git a/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b7d289a0a181f768648b3388209609a158c0d194
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a874ceb40f2cd87b1fbadffe4f336e766e4632d1486bae80a524aca3884a760
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..02676e7729a5ae2a782c7397622f5661a55ae306
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3e383f96fe0c11172a8eb7c833e16437243ddf5083fe742f2f5267c606bf46f
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.17.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.17.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f5d248ed5bb53bc83690b851c4850179affe3a1e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ba47e294f57c2391d17559990d81c10b3febf1ac79cdaf9646ea4b5b1efe9ae
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.17.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.17.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cec2b0826f0458f462a1f155b2420afe3cade230
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19655fc3273537cb5a737021f0914fcaba9f520ae85a241b6943a1e375859c5a
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.17.attention_norm.weight b/triton_models/interactive/1/weights/layers.17.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..493203ace8591c626f3ddd92a1d30a132fb91f7c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f60382d336b8fe223742bf477d6e1d6b03a426c1397370821017d77560828a40
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fada103f386b9576504b44aad9effb7227b81161
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6347e704f461d7d6ee0ae21b790cdd6180debf826b736f1862a27bc9ced0045
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e34de3f6584cca7245e62f91730286274c18de9f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13d6a83305e5bb3038ce5829693b70573fbcbfd18ef9251f42334a92a864f2f2
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..62706b91c086f1c95651471ed13767ce01618e08
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62bbff754141a2d1cf72617d73f2522333bb2694a88e8a5b37c1aca6b22b17a0
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7d16b3f60264de0aab7805c342d890386aa3c7ec
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2aced42506d0f633676edf55b7de564b795eb6de86d8c0f6c0f1d1301233312
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.17.ffn_norm.weight b/triton_models/interactive/1/weights/layers.17.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..2115ea8bcc2774631a370c71a768d54242473864
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7866c4443b210b814e1bcca660a34c2b78f21172253d2c53300be2c3e3d44fc
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.17.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.17.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..945eb96703d8de2eef6085a642b1a27de7fb8cba
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.17.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8029ca34c285ba5e30b011338457cb6e1aa2bde375aa5bddeb10d5f735b827aa
+size 16
diff --git a/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c8f8e2fdabca3f7c34468465c2a769b83df35ce8
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:802bfc3126429a1c8f50bb8bc82a62b62b5e4fac66b2e5201d5ca3dadc76b2b0
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..15b491c33507c9aa77edc43db2d844a6f497fca7
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5b1e35a7c3f4353a260afd771398ed0e6f3fb0cfe2c9e57c9c6aa837187477b
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.18.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.18.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fda05fdf95a8e38dbba3ae8e857729fde60e6d1b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d5e9b4b8ac11947e865c95a0ee01bea2b98bb4d8e186bc655980c0819220337
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.18.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.18.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..56d79eb2481c7040c86fa26964ede1eeae1395e4
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb7cefb270cbf64d8347c25b5d776be71d432c570ac277fc6dcb8160f358040
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.18.attention_norm.weight b/triton_models/interactive/1/weights/layers.18.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3c20c25a40ad141d017b4cce8700f88ca3d8efca
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dac1fd7000d40fa00eb19ec7e140c8fd08a7e2fba5ac80c0f15abf00fd9048e
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3c1d6af45afa49731996db41ef7d18503411125c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23dee44b6cb77a166863b69487459d9de5dfd4c3989306919d4c35dc20c884be
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..54489f50388ea9154fce92dbadd4bf6a1a861f86
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10a6c1e2ca46dac304c89690e837221b7cd15133dc1e7ccfb18f69187af51208
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e90ed3787e1ac9da6ffed10588e004c09bf3b9b1
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a35d9d5c12d752b160f51f53a49e9a763662605165cb85272e539b60a9f92055
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..17951129ba756efbad134062196862ef2b290c05
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:845ca7749cf6829cc274de80528f41dbd289d125720a4f68417677871dd528c9
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.18.ffn_norm.weight b/triton_models/interactive/1/weights/layers.18.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3fdc07d36718c6a4fb843c7a0e547971f25bbe50
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:885808cbeec44e76e545008343da6029dce51d48908c85d61f4e3e5734a316a7
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.18.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.18.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..4b8d6bdb257005f9da0843e14b064394e5e12366
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.18.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da3eda4da09ebaeb73ef447011ce0b9ef2ee982ab26d8d0408ad482f9b2b389e
+size 16
diff --git a/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f58ac78fbf8480c4a875a904f3eca7296b9d1dc7
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a697cc9e5c643856df75e5d40a4ddc810ad41c0ab9362ad6c7745862c000ccf
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ff2f26342ca1663ff6c89e5015b02b41e976f9a9
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5deb01a923b8c70c8adaa62c3b6128231899cb7c185908822279725696d1c819
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.19.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.19.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f444fcc2661a285f914957b05cedde19a4954ace
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:682754ebee51648ef7b0249fee7289fdf825e61916f97ec62087c8e39e9c14bb
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.19.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.19.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..41cb9a3fa2554343948079acebcb10fa2a940517
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6d4a938a39924f222f02b460355a83ffb98a00ff19d05048c3bcb82c9e57edc
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.19.attention_norm.weight b/triton_models/interactive/1/weights/layers.19.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..5acd5f2587a22bc1a1e2870e9b4af8ea1eaeb505
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63d26f2643a9aceebf2af38dbc611dc36da45a176257e478e62f85ddbc559f55
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..cc8dd8ef920737fc2e432adac1ce42303e7d7111
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a54bcfb108f050cf4a7c7cb37114ceb35476b3f8bb6cf6c541e8df014fbf6133
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c378e9b9bed297468e52701cb4eea8586e317e8f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11cb4b7bd0b53f894236952f72793d3d4e647e6d07fc37e1112b0c5ba392176c
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..131386a17e034a3ba0ce59be9c0351b35dfc20e1
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f32b6e7bb6005ba215aa938a0b52300230f7008150b45a11916829314ef3494
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..af5383b2c8c39d1c54f5dea9298ea08f5cbe267b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84f83448a65d6bf12e5484bdf2805b2648a5ee6c0f71f592f1399a71f787a365
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.19.ffn_norm.weight b/triton_models/interactive/1/weights/layers.19.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6f5513a9af9eec5fbc82dd527339fb220156deb0
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7e2f003c72088419d2608b060a98ab42356eeffed53510f1d468f4ccd3f1141
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.19.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.19.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..fd5be00138be7b2df59bf0b592a9bef86dc82eb8
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.19.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c71b33b311eb0e23a8b2494a543ba1181fd72314b49cf78a9749b9cf4a00df4
+size 16
diff --git a/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2d9c45e71e2c0ab82208f4202b06c9b97f6ba148
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fa15c6683fb8dd4f6a17b49bb0a989e462a984b2b1a62741c0261b0205e4d3a
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cf230e2e4ec022b7dadc04504edd265c2736423a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d46a56b3063ca3e890569f20f0f9554bd4b8b3dce4dd28c6de2a2c8b018de692
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.2.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.2.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2ec2d68e756cc1afd558415a1c748d3366f51240
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:745bd18832a4be0427eecf06fbd16e5b4d9045d9bae02a538648bf061f1bcd31
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.2.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.2.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..d1e959a3fa4ef4072ae44bb537bc108a99c3799e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f165998aa89a2e93b82203e08444995edcdc00ed2dd2b3dc3171ed8c4aef68f
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.2.attention_norm.weight b/triton_models/interactive/1/weights/layers.2.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..775cfb53b3214e57d496df775c7f2e98df37a237
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35db76352c3fef9616c14aefa7c0b05850df54a54e3e6c922df8876639c7048e
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..1b19b3f633c84fa1134ae29f0bf9f119d9b25d42
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5d14e61c9cc1a1874bbf7c1db7fb04e8b97f8d49e011bf0b5c2003a072083cf
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e293bf94f00d2acb588e4a05e8b36c07adfd4cfe
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a79b8fb1590037f3bcbe91f25dbcb82b2b91fe0a109dca31de0493a089fcdd
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c43fcc94e533822deff81b234c66897d23c2a5aa
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cbde66d92d3be35621cdb2171a2b9e5ab5448d229f07d7da65d25553adcce029
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c5beb7d2b7d8320386a5105a4a2618ceec4e4943
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41bfc952713a7fd5409f909e9ab107d9ef734e730f7b00d97fc34ef24395e62e
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.2.ffn_norm.weight b/triton_models/interactive/1/weights/layers.2.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..45e884fea486483f4689411e2b0f5841bb3e6317
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f16599930e314f9a8ef2b760cc6773e75961152d32432b5fc3e411955dbdc227
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.2.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.2.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..70e74bf48eaad9dd65823e3d66a8d46c4452b13d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.2.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7808c14f00dcb7b2b77edadc8852138f46802e013a3025e161a669adde20339
+size 16
diff --git a/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6053a83955560e1c2a84e72515c7672d70304835
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45521551eeea8b702589fe7c6b19749333abf647f53f56713807dc38f58041ec
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..0e188dc213c48bf55e4b2001a68e495c895187a7
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7d9740714493408c67acb934d26406c11421ab7efdabd743bd990103a90f701
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.20.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.20.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..730a6aa484d4286f408baf8abf88ea73e0b5aa02
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55586decc011d181feef941588d73d75de2ec8040bce7db734699a33a7bd6f42
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.20.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.20.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..affb6ab65788c985dc6ccf43d5cb3fcc8f4e91f6
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3dff92bdb0d4bd34ecf08c0c024d9aabfeb9dc6407b55b55d25835922bddb9c
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.20.attention_norm.weight b/triton_models/interactive/1/weights/layers.20.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a4b06c9551477c77ebc9de6151cd219a9c13f63c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dfd453a8ca7eaa0368df85c67b0c4520d044c50e21e3e9c642016e56425fe2c
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e0aa342e545feda824e44af8745b7bf6714e3672
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a12408ddaac163c3473e187a838044bf3c05b1a72758d6b77338da700a74f845
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..595f2605064e623b1acbbbb39aad1abe47d2b5fe
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a20c9c4a6621e851abb268c647e4f9459277dc53bc5f64a0504562c9e7736b61
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3881b21e76f4c55a6f5a94d56794ece1d12912e8
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e13a13177f50e58cd454dfef4083e8b8da065d25bd277aeabcbbd65d9c7ee2db
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f0c038b596c5143988722e1d044fdba36b9f4c53
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2bb55062eaf5f412bae85c9ac428ddc2e0e59d0e53ebd21abb1228cf4d1ea3c
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.20.ffn_norm.weight b/triton_models/interactive/1/weights/layers.20.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3cfe4cc50ce587ea9b564a20130b4fe2225d7d52
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37c809eef52d6f683a42650531b04e14b95934556c2f3607466882fff2c7a049
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.20.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.20.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3fe9d60389494bd97b6721514bbf76a4a2f4aeea
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.20.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97801b00a17ab91f1019edf80b667e915c772df1461e322cb8602d8bd831a8b1
+size 16
diff --git a/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..905d5eb82f1967282905cf3974e526f1e48e2b90
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2792bae2516c6d5167b1efdd66141ddc18439be883865eee923aa0d64f3501f7
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9a1f6b2beb40845a92a60a5b1ea44afefad5446c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:953b7c49b7ba4bab3b5ab552b697d5be9184144ec4f8f6ea9815a0e12420a4c6
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.21.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.21.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fbd8d63b76ae1f3a0394dfd4c09e724627ce656a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f719914491c7941474c1b6efa5a79541ade54eff71a6d65a28dcff17baeacd89
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.21.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.21.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3199f31825d84cf98169a9ac8361fd01195c513a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21e70d0275306b0d766b533780955602dc9d5163028c509745120b4e9dd070d1
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.21.attention_norm.weight b/triton_models/interactive/1/weights/layers.21.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..ace9b471c09970005b6d8dcb34406ac8671f3340
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f5b37279d734e53f01e524b941104c4a2a0794819cb443255e46130190eb060
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..93ad736f2b44139c784864069aece4a59db96543
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7298a7ea1a9a2f16bfcca14510dce8da6342ceaccf48354e63945a00c86a8887
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a7e502a74af20d234730806f84f0ee0fbec81a3d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90e896e7361f2fde100ee9cbf4591ba2509c11ad2e06ff9150614c28f39f6cc7
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e129776d2c3518130aa1688eefa5ce1d57e1f1cb
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0415c4da6fb2feb289a75e84a73c525272f0098ee5c14faf5544454178576f62
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..60435a424658f628b48358ed84954acb2782b727
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ff5c969303a6b351d8bb80064aad2c92e8c5c32d85bff840317ca0739ced463
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.21.ffn_norm.weight b/triton_models/interactive/1/weights/layers.21.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6655336998857a70516ff902b71f61175fd1a6c3
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8042770bf17c4b7520332fdeeef3decf2eb77871e6d80a2fcfe79e850827faae
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.21.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.21.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..68bb063c7fe76ee11dc858fe2552eff20f89fc06
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.21.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:babef4e3b7889042e89f865f3c8bb53f6191e2c9329e3eb418e0627256b4bbf7
+size 16
diff --git a/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..26e5e328af67eb6995b4eccd4f3f47e2a5572bbb
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3845fa57cee6ae1adc7c640c17820f11d196a86138e3ab1b26d1fcdb5a12d480
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..25e896649de6e4eebef3fb52b4695e66834ea627
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60a8fb6d26d3741fbf2dbd24d9e96a689ce0d8311349bc7b7d487a94ffae7309
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.22.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.22.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..30d513ba9872686a172b2e5bb54d7dc19c89b18b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e8c0a44652ccfbbb876d6c56c552653b788b14188b48f41b957d17036111f93
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.22.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.22.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..63489132ff37547f3c5a7082e39f7d6e60d99e2f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cf24c066812a6a36df8eec192b40520df7d10573d5a2bfd2327ddaecf6e938a
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.22.attention_norm.weight b/triton_models/interactive/1/weights/layers.22.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..67e9beee3472ac10efd53bef75c3678f86f0287a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87603494aa61475dfc747464841436f303bcf654dc27b1a07564f53558ebc0e8
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a6f81f752873c957d60d333f567fcf45dc101888
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37604a1d32f8001155e15ab4e13282b050da543ad0d0a25b759081246fdbdb15
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7fc132bdca2ee4128bec7e863686fdca2f7aebf4
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06d1aced0b15076b9f26d4ea4f4f6b732368d7b373e7a588635da39cb9db5f39
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2679586d03d73f48a045c13e8c8b19ad6eaa9b50
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15b2a9ac0ae91a96deefa360ba92e79339705410d925b2356b9815692ea31061
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7216f3454da54e1117fd4e92befe84b4c8b46a1a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a99b63ab8c94e4d8f81bc8cab1561f47e3c2bac9f6e13f0b23d9438e02d7d1e
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.22.ffn_norm.weight b/triton_models/interactive/1/weights/layers.22.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..4d71b5ceacf9dcc9afaaf1adf8978c2911ea951f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:309c8793e4e6d01a426ded64878ab5bb81fc897a4369e2e12e180067d9e2f97f
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.22.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.22.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..265569647dc54011c0c7aa312cda60679eddf224
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.22.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a792b8d14741661477851bbe77b6f5dc4fecf7ce07009fb7d6bd25090b2ad2b
+size 16
diff --git a/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3c4b6c3a2d7fa4c456839afe2c5df63b4801cf29
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2a664f7c9133d9a3d3f013ae68b7c826124f0ce8ee3e2a8b7a3d412fc4ce18c
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..6980321a22d78892613c341246abfd4fa6a6ec1b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d1caf7d6d040d5052d79ec08aa4282d486d3fd63e54ce73293b62776d97cc01
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.23.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.23.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a959f9c51c2010dee1865544214aa31aca8e384b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:019ccc843a3257c4a7b36900f96de821382e2847851af142ae89a9238b434b20
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.23.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.23.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..63ad5cf1b74567dc10825bf3797cef1aeaf45b20
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80a82f597426b697fe58ed646f41dd9a6f4514d8d93e7f2791fac932dac100ca
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.23.attention_norm.weight b/triton_models/interactive/1/weights/layers.23.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..95ac563b56807e330af49708f5e09a5b5d763971
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d621b52a30d8a04c1866972255522c844eebd9f0b57ee2b90fd4f8e5e7ba07a
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..070dac5924104453edc840b81f83c3af7c79534c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e95a18e90a00cd47b6fce45cb8c1eeedb6ec2b8fed6f0cd8de85f36cfd5dedee
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..53c5e980f8815c039d907e5466820c61f9d1076c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae6d90f0468717c0bf1b22ab4914319697011c4ee53f13241c0ca1970acc3331
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3dbd1908961ec50661072cfe35a0e65123ee0522
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1036d81bd9d055c59bed34241ec3328c1035676dbcd78a0186946147c58af98b
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..377898876f13249c94c85b69c632e4edbf89ca0d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f354eef95b3a2007598e99428488351bc81e825cc08c8a22beea2a74432f0e91
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.23.ffn_norm.weight b/triton_models/interactive/1/weights/layers.23.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6034309e63a873c266790385d8a50379dff8c851
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36a712b30e1f4b920e2bf0e553bf62898650a968b94cb544d4c0cb45dd9724ba
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.23.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.23.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..2054dd9b5bac4cc5f3947a6a29b0a00ee9c8f9c6
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.23.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:362bc48a1da392c1d9c1404743b87e700f048e91e2236c0f23136126cbd17a42
+size 16
diff --git a/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..90ca332aa05b52f6a6c1174451a057235aeec1f3
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5cb069457b3e48f9401929077bc5a44b988b7741941ed8157cf23fc0af8fa2
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c424c3a6af59cdb2e6cd3d2acdd6fa6b8585e46b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b47c34802342bd2a02dc98d311924169d7abdc703e43279cffdcf1422243038d
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.24.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.24.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..183cbc95eb079e344c88e1fa4774f568a66dbbd9
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6468f6b524dabe33d4487522c605b92a5c91eaaa9d6b39433dd31588bfd09215
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.24.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.24.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c435ad2044cc72cc87bf58ea590aea7b6e463349
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59fa63a2023ffc20a936686267ae08fe6c793889ca330e0fb0a44ab2b5fe8041
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.24.attention_norm.weight b/triton_models/interactive/1/weights/layers.24.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..dccff49fb462091aab55a0c4eb163652123ff7d5
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d38dd18c9fe84631f30cb2b7cb92efc25473d4ba1c438a7817690ed3bbaabd8
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f0bea0526b3fe332953eeee191fd4d279f3a8286
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db478db4b91a673763d0252f233423fa31c7a562f80cbc6c106931886d56e253
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d233c239c539161b7c5f0b5f890f196d9c544c2
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5329cd85fc6390d7fc596abdb5907e3c2576c2fb6fc87d7c0dc2dbae326a826
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d4c99dfed4f5fd009c04c0693ddd1253dadfb80e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78e4b556d2c58615b1f3bcbfe8780a1217bc0420383b55afbf6767315ca09e66
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d61abbf087e7f17d99482529ceb6649e5f98e4b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9861b1f0dcf30259bc7a9d1c02969f271b805981c696d49b1dcdd939a7ff504b
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.24.ffn_norm.weight b/triton_models/interactive/1/weights/layers.24.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a5247850bcab46ee044a136c8ca64f1223e6f1a7
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f64ff3faab2a3c58cde1f351d57bef281660b552a9dbb9c0aa49bff00dcd6719
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.24.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.24.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3a9a25a5c3ba55692571909bb40b460b6ed82ade
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.24.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d2ab419befc2e7b0391b3b7e7bfa13bf728db0d6cba53136aedc0802a4fcc8c
+size 16
diff --git a/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..41c3344f95ab3594af8a3648d644979c8b8a3e84
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0971d51d3ac5fa3cb80bf7adb2616878c3921d6810a7b8c312f2c5edfc20ba2b
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..380f67b6fde572f2eecd73076b154bb56c631ceb
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd9d2322fc1ac860eeeb0ae4f57b15011ca5728cab0c2de14ad0734c813b1070
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.25.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.25.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..58a080a5403fbc6975a8c92d3d8890d106c41f32
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42757d1b84d12da08d617496b557df5dc43260ad03444559342e57effdeff897
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.25.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.25.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a623dfbef7759c22ba42888f23b6af5e7c88703c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc49597aa705026d30a172bcee0421ded59135ee57d2d1a38d511274fd00db51
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.25.attention_norm.weight b/triton_models/interactive/1/weights/layers.25.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e330398be316b3c7d2b4e8091847c876352631d0
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f978aa26bb24bbd527a1e949719d548e1c7bf7d30f04b02f0f28d1343053132
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..941b657818aee3d6c553e08ef74566cd98e55321
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:063a4b6c0bb854f67986762bafa9651778da009fd725fe723fa47306a99a845f
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4df2b6e64935f05f8ec6ea3db6b9723c6ca0a7bd
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4a77dbd2274b6de3cfb89254d1cb2c0af54b304bb9134a280cbe9b620a361a9
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a2a36f211eb8cebc2e1ce26bbd4bcd9a806cee31
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1626e0d17ba4f05b0f1e65537f46ada22bef2d00deb136c30dd6bb481b617d58
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..09e7a5b567087d78bfcd3614b11b21106f5f8f59
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d9b0e50a31c6c29d57500a64edf731ea04db50967219bfdcb0853730c574333
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.25.ffn_norm.weight b/triton_models/interactive/1/weights/layers.25.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..026c4beed926345148e983d57a1eb89a25c4fd1c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0eea4a26418b7a503c71abf443da9d784c2adca6551e4f1b998f94d6145d696
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.25.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.25.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..67871afaf8d1df47fbde1f4a65674ded07d4a864
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.25.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cad249894548c60911d6d65a7d5846938c1e479698b4466d4cc6e03d2444922
+size 16
diff --git a/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..8e3258b77728a5579d15c2a374b61be41a2afa09
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3b88ded4b32bf8ff5ab7fa3616ab98f1bfea6fd86f37b729ad69ffe89d33e97
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cb16882090f73a8651b55899be0c7b66b7d89aef
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1303373a67371e1e2f3ed25bc8cd8e559b9503bc5b4fdc37bfaf758cd26acfb3
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.26.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.26.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f65b33bea38f966cd6cd26980998df21898fad28
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da07e11c5ce840df7eaa7de1ddff66356a2995b93b6d1cdefe1d96f6d4eb62a6
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.26.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.26.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e34f9fbc1e33e117eb223353e64a0d03c3a1ce09
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec446a339a8b88e9d35b0feb0dc82c82f64420cc45aa67b0730bc6fdfeb33b24
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.26.attention_norm.weight b/triton_models/interactive/1/weights/layers.26.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..bd89d7d2bb2a10e4537def6bc6550ddf681db645
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:452e37de79706d39a7fddbbd901e8353363bb41bb1178eebb42b0a9aad1998fc
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..ef1f200bdb37b79404804e211dddd09441a90cfb
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fac2317afed02f28c9f68eae5e04821f1fea2d7553bd4ce30b68b9a7e896be65
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3613b7754b7de11bd7146b2f99bbb2aabad43346
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e892079f260d62e05e5169a508c1b50c3beffc1e568e189b358850a9596863ac
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..42508b0d05c03cfe54875df80e5848f92e3a2148
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b2ab3bee38aee899c1454a69dc424ae61b6d14d67438c307369be02f6460085
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..6078af07ebbfebda87b1016fd58cdcffbb0b4c73
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:552933cb4c5ad88c47fcfc8c8982e8a9d6c2bcf4975d0a1ff17f85a0de9a72a0
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.26.ffn_norm.weight b/triton_models/interactive/1/weights/layers.26.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..659727ca29164c591b4db04c441375c79e981fce
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a474d6dce328dea51c94d84fde68d4472d68dbbf19ce347181b5956b98d41847
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.26.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.26.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..db316b10f011519fdc39c70e40706bb6499001f4
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.26.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d995b27407d7307c6a5b4a4fa7f6247eac5d8c1cc62c066c9bd4395d0455a939
+size 16
diff --git a/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2b398a0b63fe43f5bd6467e9001673b60b3d8b76
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb11cc9d2229d99f45200d53d2430007eca65a120d988a8ace070a0e3754128
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..36269d2bb210deac5bfb20fc68c3a3c0ba2430d9
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b885790c722268908e56129344337198b0c0e4b3bf5e21a7f091d0846a5d30
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.27.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.27.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..75c54cf768728053f1051c6d1260296c943bc2cd
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d46493db19a5dc9a8d01151f769f22f10733969cad257ff2372fe9ef169efdc7
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.27.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.27.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..91523912e1e6240ee472d551a8422724c7f9396f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f80605e605d11e0f5a9e470c80c72859f9651f99f3db043b9eab3989fffd647
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.27.attention_norm.weight b/triton_models/interactive/1/weights/layers.27.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..92e464dfb802dd2cde189e137b6e908acaec5c38
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b48e7db8fe774bd46f4eecc92ef7f6bde3cb8e3ba66836e6cae00572ea0e14e
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e7392da13e07a3f00396eb1965e2c22daece98a8
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a634ce6c3f2743a5e0fa245a0adf32df70a41dc7c969d40b1a3197f0436cdf5
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4fadfc7e45425848c37d17c3f39ffbbb822a8c78
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc138f3c7e31e1be2b6e2a57d7d5a2ffab4fa52343122dd272e41ac4bfd9096e
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..aae88c0abda360c16b47ef75abda1c4077edf25e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9052da467e48c0c4138fd3769e456cb753464bb30a03a4942846a5b3877131f
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3b2fa2b516a8c83d6eed1702e517e005ac19f281
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e1f67441bf5d4f5ca51f1f289e07a3c59907d324265741f76ad966bf1755749
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.27.ffn_norm.weight b/triton_models/interactive/1/weights/layers.27.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..c15c40329868b970cca611aff6e2bbe13d48abf0
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fda3309eb353c9341280ab8f2a516011494cba8b769560e91cd0c9d27fc6561
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.27.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.27.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..d8710f2aebc08c7c65db4a66ef9daeba362df5ce
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.27.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2367dba495b15a673a5e8f907f19e98254caa8845195d88897b3ecc36d7c794
+size 16
diff --git a/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..11c1eafa7f15149287cd144977ef8e5a42645397
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1f9e7857882c7a56236572f8a03d72222b257c8d9ed6e2efa1d66c6b5e21fb1
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f725cdf5914a0af48485baa5a948fb90c3030913
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da00a72b006477cacf5f86157b6206faefb0b9a1945fed4e5f2a2f9fc9846f55
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.28.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.28.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..294eeaef86a93508f7f8b171fb8a303bcfb5602c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:626eff3b0dc5215c6954f774fc8116aa989824ab9c971a3782d8bce5ad31d0a8
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.28.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.28.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..90a1002de820fee0fabb5d5081cde6d434fa08dc
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5faf82a3313ab0b53237e677fa72b3b44137a47ab5f26d401a3bf43f5beb1bd8
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.28.attention_norm.weight b/triton_models/interactive/1/weights/layers.28.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..1ec94894ca9c51e452e351065e83a91a22a1d264
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac4a8732ba2c28970db1dc7e821bd6c8b0e4de12f8de1b6bc6692840154562a4
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2ad5905fe8ebd68dafedb5c0bbe70d34f3f8c71d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f375cdf0cd1a60d7c9d00319853242606c44be5322598f91dbff37284f0ab67
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f8676ba3b145e257dc1c75c1f9d9dd86413bc37d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f57f5b0745ad5281aa67d83c0da6f1ebc7539dff487ae1345761bf995aedb1c
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e3532b664b06cd727ceb44f27462084bddb160c3
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:393b972c36770d253df01db59d0c889a018a26ec7a18cf1e69617828344e2ed4
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9cba65bef1506cf3787aac95439d21334e5424fa
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f4650f45c05fbd9d52eade717d47d32b1127ad57db10133ba490f5af3843551
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.28.ffn_norm.weight b/triton_models/interactive/1/weights/layers.28.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0a50537a8d1863c6ea2bf1177d91c15f67d42dec
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26ab58696d625c79d618dd907bbeefb29dcb441a358411ed99c0f88e8649e74b
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.28.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.28.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..335aa2710f889028753142ad7c1c770b5aaece8c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.28.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be67c63310802e47b331969149928657a52d9caadc4dcd0599f0ed63fa8fe4c3
+size 16
diff --git a/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f7fb2a0c283d5309b0acac81e3f78bf535e119e0
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:964846927bb91f85e501fe1626e8958dba12656845d1c2963d6f0d31ba0e6fe9
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e4616ace3831b1353261ce821a222788574a6a7e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59389b1002ea4286ef68d6a28a48de0070a8fe63bb33881a4ea5b4d4824b586a
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.29.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.29.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c339b504ad1ca7893a586fe0fbab27e0414733d4
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a9f306da7ef17418be8aa9f47f97e653aeab2c155aaf1f32ea93c6e3e424c19
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.29.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.29.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..880d7d9c3c95158609d1215b2f6bba14a3a6c655
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1370f068209c9ab1f42b6657508b06a3511d1d2d8d2c5b5988f4d58591d40279
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.29.attention_norm.weight b/triton_models/interactive/1/weights/layers.29.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..dc3408e864d2f349f03d2ea9f976241c0dd4ae19
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0136d8df649cc27c395128240a43f899929866414704347f851202cc638b9ec0
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..12bd5dfc4141909486de6f81eb5de2cd0541f243
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90f34915975f77f41c0057ec1ddc7e83098a74c6efe44d5cfcbd6252f7483773
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..92ba76313e8ccbbbbf563a230bc24e60c122fbbb
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56814e27f2fc6ea900d3623c77d1df558ea69fe154c99fe57fd45b6567a62186
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..850b76dcf051ec7876aa7626f2aee3c02df70a73
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95e520a4a76d63d5f4cfad6bb9577ab1343c24d563ee6491b0120e8b8f605a24
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d8434eea29d62735d93ec7d3ed91e73a56773a5
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a45ecef0ec7bb53ccdd1499338dfc1590c5b4d4e64ca01119d8e2eac40c5249
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.29.ffn_norm.weight b/triton_models/interactive/1/weights/layers.29.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..cf3ccd85ec2a836282f95d8ffa96f001a6c78bfb
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80000d50b78aad7b0076bc159838fbc0e679d1b07aa00f374142e40c5fcbba01
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.29.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.29.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..135fea41df0db406183c0c705ee1bf4e15b3d938
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.29.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2334dc6b4e2acee8b2c60625419023d8b5cb9692341970a8cb0cb0950658940d
+size 16
diff --git a/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..741f2dbe9906898116ac1c0bcf6b6f1305ac0c7d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b023e843f1b897e2768f8aa9d1f18e1a2fcb8a17ee904981117c3822cafda263
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..58882890a176f4e5d124ddfbdce381fc920d5b9d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02c5a27de7ab84dc800a722021cefc12233818ba708f7ef20abed96d1efa3b29
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.3.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.3.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..28835af03e975d2a253d1b43e9094dcef5665859
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:961c0e6293f13ca0eb880f274fcf96b1394f554b645856d99f898ae03ba05ab1
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.3.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.3.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4941d02a83a0dab878ad6795511df8e08e216ce0
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6a94458f402b8342d3936d5c436bcc1125e642d5216c1cf70ad7850d134dbdf
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.3.attention_norm.weight b/triton_models/interactive/1/weights/layers.3.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..fee571b50c58b11c6d17e7daaf1a1796af101e8a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e702523cc2696abf9ea5f86ca0c3b8110cbc92f9074f3573cd0935519da7f326
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6576fcc897f882a63b4376d2366b8a16b75529b2
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec59414d327ec0ca8adf200f8593102b1cbef09d5a97e88f7e6f3d1d941e32d7
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..39bfc8b9158d17ace10985a0aefa5ed9b27c830f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:592014759039919238673a2d601e2d397b3eb60f2b684d06201310dc35e6f870
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a2dc182c2e093651d77ac65087453506558cc6df
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c67555a8eae4e6cc55420ec37ea21933418f802190fc809bb33855011f8ec82a
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b12f9eae6cb382f2ef562f1e7dad7d8f2c7f4f48
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b8d6409835e70b1c0fdf81979b61995fb90f43381277f9e457070df5a91229c
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.3.ffn_norm.weight b/triton_models/interactive/1/weights/layers.3.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..1ac16014018db6a631b37da0836ea438c9d2fdaa
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b485c2892ea53a76f21e84c2ed42436b05a41f5dab146fab77f25d2b506ae53
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.3.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.3.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..86f8adc521ad298ee51185ebf02afa53325facc9
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.3.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76bf77db19b1d0234ee2da545c98ee3d5921030e6deaa8b2742d4e9d400d7207
+size 16
diff --git a/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..087b322573894903eb8e5cf81dc0e4962ccbb4bb
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b085323586c5f61228e43ec3cf935799c983d169abd417a55a6c3f82cd255a1
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..de17498ac115e410694314f9e590322ecc3140ef
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:602a6e94ab5a7bda70167414ea1e71c46be0e7b46a69689d093f991dc6930079
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.30.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.30.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e9eddf6db391e55430e3ca4f04fc6966cdb3bc10
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5864869bc2f57778cafb236ed45dbcacce36836e1c8b3dd94fd1375829174baa
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.30.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.30.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f810acf8fcee1cdadd5b34adde32f9c37b177343
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c899fc162f4dbec0809e3059f9ed0ba9d3004a75d31841ade9aaf16df93493e
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.30.attention_norm.weight b/triton_models/interactive/1/weights/layers.30.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..ad23a4893d3cffe2d398058b89dc78f528c91053
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:683f799d6ecb59ef5b47ee78d4d1653b6a49da4dc6c6865734f2832457ad888e
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b61119e589e6b7759f74e927ba8c5a5286eb965f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb97c170f0415eeb563dfaab343a6b7c736fb302b605cf65ac29e190d485f03a
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3f892216a36905289e63b4b93c0eaf050e7acc02
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:debf89602b57cf687b1f434d484beefd647c3ea0e8305484658248c8238a347f
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d0743b7b13a262d47d3c95ff5f00bcf70dca3937
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00143d530f528cfdded636568772b1ac564990d10d52c943463e8198b0f45b22
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..649ffe4f3c74051e77a62d2bd111b1c8956635a4
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6881934dda1754f8b7bdb5619bed9e9ec7cd819080a5080d36c545274e7563bd
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.30.ffn_norm.weight b/triton_models/interactive/1/weights/layers.30.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..19611f78c82d05c2fa778fc4099462db96768018
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c07830c7b5e53981d0d97e28af650885ba42b1395e88e2a8b553c080258be805
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.30.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.30.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..ebf0f2ce5ad46a9897b292cf74ea4074253d9e00
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.30.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7a7079eaefe501289467f67ff3ec35deb358c17022eff2a2d77c011d87a7485
+size 16
diff --git a/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..33f1f7e919ab93f0f093697cc6564c8041cf7c9a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42e8c9373e34e9f38c5aa5b7f9e7282f283dd138fa488699361a998289d4f0b8
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..51b423248b2e8762a232cb9f6524cc2d2882e6a1
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e74870d817de1f15c0b372de19d9049754192d574290aa47cc2da4114e02fbe3
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.31.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.31.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..7976fa7add831d946d9634761ff8db4d07f69a6b
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:882c11872607c376a08d0e7ab4025ebae8050ca0a958b4678fa7c5f5fe34af8c
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.31.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.31.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..87b74517a018f5d65e974fc575140a80f0cf2f63
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:780d8a3fc0d41d7e42ab7524e0e8eb3a5044627584cb749954a08d74e8889cc2
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.31.attention_norm.weight b/triton_models/interactive/1/weights/layers.31.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..9e1759f5a7b8ce3bcbdf54ac4a167aa2a3836eeb
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13b79fca3496315c35d45be930b96ac34c0616ae9bb69018d41d4fe7d77fa1c3
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fa724a72baf441d9817165d242ae54e77b819e7d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d6490623b97868d9d81417ecbbc40bbcf24f872882ca23b74a76f6f384082cd
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4e046750532412be4588ab28e7285c8f68bccf2f
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b03dd848d3c92adda40904bb369f812d1a2de1d72e53600bdf89cf3002aa5e4
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..7954c17e1c4aac980fc31bc92786998b66007879
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f491d3ff06bae3646c8cabbf8c8b6e14963e909e5a3f2cadd84931bb1acc076
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..1f95fe4038958211cbda9224b4161cae99e0c2e5
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7895c436da989422f207c0631685485aada8b0cf45d0db3bbf0cb18b8573d8f4
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.31.ffn_norm.weight b/triton_models/interactive/1/weights/layers.31.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..491eadebff5c76dbdda444c927fd0bb153d54dbd
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b010068e8df791fcfd32ddefe46198f72adc5cb104f59512820541ed232ed52
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.31.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.31.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..9ed6ce58e195ff81f658649f8fbf99311dad0183
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.31.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcd30ad8a1a6ae548b3b6cdbe2b3693c1d260fcf73e63e4cb201f4ff3a9216e8
+size 16
diff --git a/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9efa7ae8526ee807be03ca3903436c1c4e096b2a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd04897e691fff067678bfb5826f8c0dae0914c4a822266312a9fd08f9c8dfb9
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b717a0bccf881f43c4dd4849aa9abac991f829b7
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a4e0a9b4313f6f28361952f5e1c00250e0bc8d8e348238f634679cc9983d4b0
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.4.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.4.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bbc885705f67c282413e4e10b430177fa24c64d1
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83ef42f037338f04aa63a71554b631e20e2cc1f4c44d0498061891de5d46dfec
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.4.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.4.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..0dea56a4d1087a93efcf6c1d4c45d4eddcffd41d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92669ba1e130035258630c4bb58a6ae23088baa4c818edb89d18126368fdd2b1
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.4.attention_norm.weight b/triton_models/interactive/1/weights/layers.4.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..85901d7d4381bcdd1d25c69d8652668e9e82e4d7
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4392ba124c790351e1e804e3f6954b04df59cabe55918fb2ab208b9fcb1a25d4
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2eecef389220ebcbbb1b399d81d28d5c7123895d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efab7d32785919b64059b2e20f610eae03ee8a2ba95bcd5c2d786e3074f66875
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..203aad693c83911b91ea533a372c2414914f0c33
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:624fd673a1cb8d5eed0814f7d0ebcfa6de1f0933f2c808a43fe9915863d06992
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..22624a1646b9f3bc812053a3e4eccd3aa066e8cc
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2a9bc1f9a857eb51f12e913af082a9d065232ad278a46bf3312fee70b57c929
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ba1d032b1632c72d516bf607d69ef9d858ec3f69
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f5a160ff8d293e97b6037541c207caf6ea4b15e625bd94dba7be81f1aa3052f
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.4.ffn_norm.weight b/triton_models/interactive/1/weights/layers.4.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..10fdc6cff9055cfb29be992fd58fec67e3a1e156
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7584bdc2460f81e60ad3db90f314b1c3c0bb458b724ad5a8ef2f6b87991871f
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.4.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.4.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8ab0548585972c0f9a19539e4f0246ed192f0042
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.4.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:734c894776290dd532cb25f542e38b56c9151c45fb751e1d58f5aba3c1cf86ce
+size 16
diff --git a/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..04ab0a16f4f6b5b500d30b4b27152a073d6efffb
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76f7240f7f94715ffc2e22da1e1986a7738b3a81d2803a89fa8d467ab37d52f3
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..35b017f6b8442ef2ed28b4f1d7f2aab7e6c8f3d4
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f30a98755d5e88115a8343930c20bbfd34ef8095694f4c0709b299e0ee587b25
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.5.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.5.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..4b270cc9d0768c5834bf5dee3db2ae53b9d1a2db
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c2c8b87162bc3f8d4c6044cbbba5bff1a0b4d484418966d683cd8edd5ffe289
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.5.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.5.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..2170f6316f894a43c57df7c6f3b6435d6d290e59
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8a0bc293e079e00c8fb29ea166613fb81fc7a51dfae01bda404298bd3541858
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.5.attention_norm.weight b/triton_models/interactive/1/weights/layers.5.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e56c76ec2f895f4ab09e315bcb026a0cd110898e
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e322bf9e96c707a007b6cf18e95291034a7b4acc28cc9c868ba72a2067f42a4a
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c0603e429404aebb532d112009658a498d6a25d2
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b111a37c3e4700a7ac8bcc755e22baf0cdd205a4f64cce28587b12e6bf542fa5
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..54720e241e1c6574c937ac39760a84933da14ee8
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ccbdd88d473982cb63c5daa191f2956e0826feff876c6303ad46054ce474a9f3
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f69f281b519e24e86576e49e914a3f29b9833837
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d055b75469902bb480fb2470766fc359100caf6f512e030d846c895cb23501e
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..39d27ba627be29fdb76869d39b5a02b38030a6a9
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf2b8068885689ca049003d3dff4bc8e68b47ddb9be7d7fdd56b39582b7fd61e
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.5.ffn_norm.weight b/triton_models/interactive/1/weights/layers.5.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8f90bb2bd06c0ff2405bb8ca61c65441dc384653
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c886bfe39172273f70831164b7b87f48054c0da65cd1724be839673c817009b9
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.5.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.5.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0032439aec9359a437391315477b7201d232b7ba
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.5.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b479855806803e6c485764401a2ed76b362ac09f2606a6d58fbba9b134ee186
+size 16
diff --git a/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..08c09cae235117db0cf2be801f075c4236bd6ba2
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebf9ddd2465c02a1a37bafe82e009127d6cbbcf0bec3b323eece36934bb6eeff
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..78b67e25716cf86de09b47dc537db6ec420fd21a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b39acb9cc4de067c3ef5b0128c253ad0b646756445766d91f2421ca30ab6e272
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.6.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.6.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2d2cd5ddae6f67b08f6610fd6bfd8fe17ff43ad7
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81ad5a0787961305a05ec9b7c0fb89cc2aa70589a36efea39557a8ff33be93c9
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.6.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.6.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..380b6dedbd40afe6240e0271cfd0000ef9f17b01
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edadc4493b3568ab5ebe758a1aedc2ef5fefcd688f5a78eb1866379967ca1cd6
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.6.attention_norm.weight b/triton_models/interactive/1/weights/layers.6.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..68cf1e82a5f3d60ef2c37bde39437efe411c0263
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5dcd4367593812ecec39d8b1ff7cd21912c1283686db24be488384fd2453162c
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f66c0c431c68905f3cc431d2b266b628bcc1f9b1
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3cc20446684f9b809fd52c40bda9d32c115789c650575c0e54f5ab030b7ceed
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..d158d234d215899f80ded95207cff364e20e0c1d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f01f13b1cd0cd8080d7c4906d71e44200b8053aa605a37069f1a9e1034a81f93
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..0bee7d213091341bc193cd21b808a3776987b7dd
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95865a00e74b9d37ba9c21241922979b4f26eb06b78b84b25be12bcfba617657
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..80f3f7257450ba5de9d4dabaa61b516c7c807046
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0dcaefa2acb86a25aedc25d60558af179bbf8968f1fd023b20343dad73b0184
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.6.ffn_norm.weight b/triton_models/interactive/1/weights/layers.6.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..b56799656e38d049d14d02b2d7e4ab1e470bac6d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e458ef7058c9d7734737447072dc2908dea9ebf64a2ebcef932e4d6832057f5b
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.6.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.6.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..70c460d32701c69c43ce43977e55d4c5e407b1c8
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.6.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa3e886e06b35057d676139206ed116fafd8c8dd29244eff07cf1221837e8807
+size 16
diff --git a/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..4bd1b6da8292c5b10b20dbee8e2ee7e95a46637d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0c4ca025a4e163c0dc2da98d463549125001a9cc93654f37907cce2a9882d52
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8846088f9a04128c3626ebdde6d6747d1d663587
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c086c5de28164657905ed6eaed423d6244ae0368c6180aa26fc0a6eb89724a83
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.7.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.7.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c4891059c086711d0200456b57dc31f93418ba81
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efcb8926a09d3f78acbff4e19e2e5bafad04172d17321a6af2b4fe7974c40fe1
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.7.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.7.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a08abb8652ecda43c661807290bbefa793fb0160
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c0cdf8402670c6998b317082c140f0eb51c4bb0b41ca4e6386c6f1648f56a76
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.7.attention_norm.weight b/triton_models/interactive/1/weights/layers.7.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..37c18cd18f7054a248d6352d4d5a25ac9a4175e5
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28cf5e25d536f7d9180c2eb1d7dcfd7d4bb749816849f75c5e09f0210cdbc417
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9b50669a9dc81bf91e567a299ee57d333907a007
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0581fd7f812265f9b47b8eab7621664a046c4c6f98279676df767aaf339eee7
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..93d6f40d2e5bcd8b2a2da3d12418121279963070
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f86e5d5f8bd7d8eded5bf5a5cbefc9b1b3242cdb2b486f6b1b0289d75f4df828
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9d07164c18362f5b0879cc88dbb43ef395f284f2
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b02b881d979d0fb77a4d705ed4bc68ca58e7cfa84a504d90b9e816ddd99a6b0
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b95f34d475e6c10781aca4639fbcadc9e706fc5a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0c7e60168198f2ac9347ac8eb4fc59ea42fe0380e24550cd4fa2e989a2d90b4
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.7.ffn_norm.weight b/triton_models/interactive/1/weights/layers.7.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..7669f396fbea22312892ecc7e69f5847e3e3d0f7
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bce0233aef9e8401ea7eaddce5b44f2a28b6fd1018023ec3f2cae495f4d205b6
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.7.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.7.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..d2b299db6620c0abf87b67b228dd03b696854499
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.7.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae08ed15fa296e998f7e93b866fb5536103b357ca8fd0e8ee44423c4fe3ea4d3
+size 16
diff --git a/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9a071d9e1c24a362c04a0f4335000d1eeeadbfea
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:312a5231076c36e023c30c18761d4793c7aaf2d1658f740a4ed6fe3ab9fb9532
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b756258fc2694a8580c1d6d55d73c1aae4f88737
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:045eb164e9d18487951013b4a69dab786f034139e232a0c079e6c6de0b84d445
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.8.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.8.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..79dcacb0bc5ed37629a105bb0afdc20c383e1736
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:917ac6b4102a88cb5fe47a13834f30fb45329e8234e6bf4a6d5def09acfca138
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.8.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.8.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3f21f5d05d73002cb0251350fce183ec3b6f82cc
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:075ca25071e36779993618787bcad51f47a6210b5c7efb13836b9f0c39113c7b
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.8.attention_norm.weight b/triton_models/interactive/1/weights/layers.8.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6441edc914d86ab07b46c530e63df5e212499fbf
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7645c5cc08248a97031708e37a8869793e72e86be7d529ee2d38214aa125f326
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6b623d7f4ebef4670369d48905c1f66aa9b3fd94
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0a76bb17ba96c365a1bf660f901c21c3fc1d15165b0532e97c7ad86158513f0
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f7b56f5fefdb81227823903289604a2f9e33cbf6
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f6cc9bf35da7c08e89248a2d1151ca84f97e0d44fda2f474fbe090fa2b71bc6
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c837700cdf510ee1df94f861174695bb0e1ccfc8
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67d6a461146ce6fca245beab647f837c7718f50c1ae6d48f852becd4b88ecd68
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..63ba13362b7c68d37224b01f241452a27cf8717a
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22f763f7c06275a5821c55ab0428986c7982da93d02ec561c4c1cf0bc83cb82a
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.8.ffn_norm.weight b/triton_models/interactive/1/weights/layers.8.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..c4ec482ee099d1dd8d7b2633b38f9546642f8c04
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97f607d08fdcc7d4a7048194e994afa25c34242bddec4d56534a779484534dec
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.8.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.8.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..dae30d205782945d230c044159736e88b8c261e0
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.8.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55e7e6e9663622f872cb332c414eac32a102e97ffdf3f5a2b6afa6f8371e1a5f
+size 16
diff --git a/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..502cfce88cfb73bd839f1fb667fba672259c4294
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ad1c9bfda707333f5860de8512ec7db789721d5f17e96ec0c1f79f98533c42c
+size 12582912
diff --git a/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..47605d66d4acddffb2885150c9d68d184f94a9c6
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b5179dc3fba3abadb58abf409bfef33b382dc7373a002c3c43da9785c86f614
+size 786432
diff --git a/triton_models/interactive/1/weights/layers.9.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.9.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..0c3613bd080dd0fe0abbe07c8a567bf85e48e33d
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:535eb0ed2a008590448c38ddcfcf990219dd0c1752e28d11fe3310cdf4039d57
+size 8388608
diff --git a/triton_models/interactive/1/weights/layers.9.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.9.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..bc68d0462949d41fb22495d6fc4d8a2c6c21b6a6
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee2d02d009e36ca78d86a48ea408c2017c21903b64400397a77f437f495d936c
+size 524288
diff --git a/triton_models/interactive/1/weights/layers.9.attention_norm.weight b/triton_models/interactive/1/weights/layers.9.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8493ee9741dd897107d9fe3cea7c2d01fdd4dee5
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcacb811b4cf62144e1ac2d3eadbafab30083e3420c46a92df1ab21840b29fe5
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bcb62122ef3b2bf1d13099eb7e64cd4f6266f02c
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aca67258bcd3c39f17fb15a14b72cfe8ca597aeb30e0f4f298efa5eb093abcf3
+size 58720256
diff --git a/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3e0e6af0add56eeb2e1cf7bc0142e52be7a5ae29
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4b60ceaccc0af57c36de7cd69acf05d8c307f2d6d27a7e765e0f132ae95d17a
+size 3670016
diff --git a/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..399c1fc8d6cc43a27e802ca067c88fc4f9a3bc73
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e07e422f44ddda11dc7404b257cacd675b2b7f44491941e6754155df3a31d2e
+size 29360128
diff --git a/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9509fd872d04e11bf53f07f99129e785b2056187
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cc346804097116087236c77f2e2c018922efba4f2e32d8a71ddf8b026c9d34d
+size 1835008
diff --git a/triton_models/interactive/1/weights/layers.9.ffn_norm.weight b/triton_models/interactive/1/weights/layers.9.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..185031880012c613c2cf8937d4aa159e1c93a4c0
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98119ccde8c54eacba56311e43a7c74e62e30e0d7302b011202dea6a6348ba66
+size 8192
diff --git a/triton_models/interactive/1/weights/layers.9.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.9.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0ec9f90c9c5be11398b7b1bdba1df5b0975ab0d4
--- /dev/null
+++ b/triton_models/interactive/1/weights/layers.9.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62cf0a7960b56038dd17b81e2a1c38a016c2b78bd7272299dee18ae8e53e5c92
+size 16
diff --git a/triton_models/interactive/1/weights/norm.weight b/triton_models/interactive/1/weights/norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..906361178f72cf7bd1f01447accc35bf0e1b633a
--- /dev/null
+++ b/triton_models/interactive/1/weights/norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efcd3fb0c1c5225c17e0eeb5b46068bb7311f716a4908d5a39d79b37985b58e7
+size 8192
diff --git a/triton_models/interactive/1/weights/output.weight b/triton_models/interactive/1/weights/output.weight
new file mode 100644
index 0000000000000000000000000000000000000000..04e8f86f0b46051b3db62d5eefcbebda87641472
--- /dev/null
+++ b/triton_models/interactive/1/weights/output.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b0ed41b4df8f91647fc8bdd2aa61f55c39e09b6e063c8bd509b591797293919
+size 758120448
diff --git a/triton_models/interactive/1/weights/tok_embeddings.weight b/triton_models/interactive/1/weights/tok_embeddings.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0b3edbd16fbb690f7c781043ea905fd4380e5f04
--- /dev/null
+++ b/triton_models/interactive/1/weights/tok_embeddings.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8986115ad7e59813a41c88c0d601235fa36138d6c15e5657a050cf4ec40fb037
+size 758120448
diff --git a/triton_models/interactive/config.pbtxt b/triton_models/interactive/config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f139d5b2234c0dfa94e3792dda985f9e8034a5a8
--- /dev/null
+++ b/triton_models/interactive/config.pbtxt
@@ -0,0 +1,293 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "turbomind"
+backend: "turbomind"
+default_model_filename: "weights"
+max_batch_size: 1
+
+model_transaction_policy {
+ decoupled: True
+}
+
+instance_group [
+ {
+ # max concurrent instances
+ count: 48
+ kind: KIND_CPU
+ }
+]
+
+input [
+ {
+ name: "input_ids"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ # allow_ragged_batch: true
+ },
+ {
+ name: "input_lengths"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ },
+ {
+ name: "request_output_len"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ },
+ {
+ name: "input_embeddings"
+ data_type: TYPE_INT8
+ dims: [ -1 ]
+ optional: true
+ },
+ {
+ name: "input_embedding_ranges"
+ data_type: TYPE_UINT32
+ dims: [ -1, 2 ]
+ optional: true
+ },
+ {
+ name: "step"
+ data_type: TYPE_INT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "session_len"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "runtime_top_k"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "runtime_top_p"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "beam_search_diversity_rate"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "temperature"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "len_penalty"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "repetition_penalty"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "random_seed"
+ data_type: TYPE_UINT64
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "is_return_log_probs"
+ data_type: TYPE_BOOL
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "beam_width"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "start_id"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "end_id"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "bad_words_list"
+ data_type: TYPE_INT32
+ dims: [ 2, -1 ]
+ optional: true
+ },
+ {
+ name: "stop_words_list"
+ data_type: TYPE_INT32
+ dims: [ 2, -1 ]
+ optional: true
+ },
+ {
+ name: "prompt_learning_task_name_ids"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "top_p_decay"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "top_p_min"
+ data_type: TYPE_FP32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "top_p_reset_ids"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "START"
+ data_type: TYPE_INT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "END"
+ data_type: TYPE_INT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "STOP"
+ data_type: TYPE_INT32
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ },
+ {
+ name: "CORRID"
+ data_type: TYPE_UINT64
+ dims: [ 1 ]
+ reshape: { shape: [ ] }
+ optional: true
+ }
+]
+output [
+ {
+ name: "output_ids"
+ data_type: TYPE_UINT32
+ dims: [ -1, -1 ]
+ },
+ {
+ name: "sequence_length"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ },
+ {
+ name: "cum_log_probs"
+ data_type: TYPE_FP32
+ dims: [ -1 ]
+ },
+ {
+ name: "output_log_probs"
+ data_type: TYPE_FP32
+ dims: [ -1, -1 ]
+ }
+]
+
+parameters {
+ key: "pipeline_para_size"
+ value: {
+ string_value: "1"
+ }
+}
+parameters {
+ key: "data_type"
+ value: {
+ string_value: "fp16"
+ }
+}
+parameters {
+ key: "model_type"
+ value: {
+ string_value: "Llama"
+ }
+}
+
+parameters {
+ key: "enable_custom_all_reduce"
+ value: {
+ string_value: "0"
+ }
+}
+parameters {
+ key: "tensor_para_size"
+ value: {
+ string_value: "1"
+ }
+}
+parameters {
+ key: "model_name"
+ value: {
+ string_value: "internlm2-chat-7b"
+ }
+}
diff --git a/triton_models/postprocessing/1/__pycache__/model.cpython-310.pyc b/triton_models/postprocessing/1/__pycache__/model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa0ac1382a7864add3a9bb04e6b328fa6995f67d
Binary files /dev/null and b/triton_models/postprocessing/1/__pycache__/model.cpython-310.pyc differ
diff --git a/triton_models/postprocessing/1/model.py b/triton_models/postprocessing/1/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..20de97595195da5dedc044a31c6086c1f49892da
--- /dev/null
+++ b/triton_models/postprocessing/1/model.py
@@ -0,0 +1,129 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+from pathlib import Path
+
+import numpy as np
+import triton_python_backend_utils as pb_utils
+
+# This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served
+# by triton inference server, it has to be converted first by running
+# `python lmdeploy/serve/turbomind/deploy.py`. Then
+# `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py`
+from .tokenizer.tokenizer import Tokenizer
+
+
+class TritonPythonModel:
+ """Your Python model must use the same class name.
+
+ Every Python model that is created must have "TritonPythonModel" as the
+ class name.
+ """
+
+ def initialize(self, args):
+ """`initialize` is called only once when the model is being loaded.
+ Implementing `initialize` function is optional. This function allows
+ the model to initialize any state associated with this model.
+ Parameters
+ ----------
+ args : dict
+ Both keys and values are strings. The dictionary keys and values are:
+ * model_config: A JSON string containing the model configuration
+ * model_instance_kind: A string containing model instance kind
+ * model_instance_device_id: A string containing model instance device
+ ID
+ * model_repository: Model repository path
+ * model_version: Model version
+ * model_name: Model name
+ """
+ # Parse model configs
+ self.model_config = model_config = json.loads(args['model_config'])
+
+ # Parse model output configs
+ output_config = pb_utils.get_output_config_by_name(
+ model_config, 'OUTPUT')
+
+ # Convert Triton types to numpy types
+ self.output_dtype = pb_utils.triton_string_to_numpy(
+ output_config['data_type'])
+
+ # The tokenizer is built from the `tokenizer_path` model parameter,
+ # joined onto the folder that contains this file.
+ cur_folder = Path(__file__).parent
+
+ self.tokenizer = Tokenizer(
+ osp.join(
+ cur_folder, self.model_config['parameters']['tokenizer_path']
+ ['string_value']))
+
+ def execute(self, requests):
+ """`execute` must be implemented in every Python model. `execute`
+ function receives a list of pb_utils.InferenceRequest as the only
+ argument. This function is called when an inference is requested
+ for this model. Depending on the batching configuration (e.g. Dynamic
+ Batching) used, `requests` may contain multiple requests. Every
+ Python model, must create one pb_utils.InferenceResponse for every
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
+ set the error argument when creating a pb_utils.InferenceResponse.
+ Parameters
+ ----------
+ requests : list
+ A list of pb_utils.InferenceRequest
+ Returns
+ -------
+ list
+ A list of pb_utils.InferenceResponse. The length of this list must
+ be the same as `requests`
+ """
+
+ responses = []
+
+ # Every Python backend must iterate over every one of the requests
+ # and create a pb_utils.InferenceResponse for each of them.
+ # NOTE: `idx` is not used inside the loop body.
+ for idx, request in enumerate(requests):
+ # Get input tensors
+ tokens_batch = pb_utils.get_input_tensor_by_name(
+ request, 'TOKENS_BATCH').as_numpy()
+ sequence_length = pb_utils.get_input_tensor_by_name(
+ request, 'sequence_length').as_numpy()
+
+ # Postprocessing output data.
+ outputs = self._postprocessing(tokens_batch.tolist(),
+ sequence_length)
+
+ # Create output tensors. You need pb_utils.Tensor
+ # objects to create pb_utils.InferenceResponse.
+ output_tensor = pb_utils.Tensor(
+ 'OUTPUT',
+ np.array(outputs).astype(self.output_dtype))
+
+ # Create InferenceResponse. You can set an error here in case
+ # there was a problem with handling this inference request.
+ # Below is an example of how you can set errors in inference
+ # response:
+ #
+ # pb_utils.InferenceResponse(
+ # output_tensors=..., TritonError("An error occurred"))
+ inference_response = pb_utils.InferenceResponse(
+ output_tensors=[output_tensor])
+ responses.append(inference_response)
+
+ # You should return a list of pb_utils.InferenceResponse. Length
+ # of this list must match the length of `requests` list.
+ return responses
+
+ def finalize(self):
+ """`finalize` is called only once when the model is being unloaded.
+
+ Implementing `finalize` function is optional. This function allows the
+ model to perform any necessary clean ups before exit.
+ """
+ print('Cleaning up...')
+
+ def _postprocessing(self, tokens_batch, sequence_length):
+ """Decode token ids into UTF-8 encoded texts.
+
+ The two outer levels of `tokens_batch` are walked in lockstep with the
+ two outer levels of `sequence_length`; every innermost pair is passed
+ to `self.tokenizer.decode(tokens, _len)`.
+
+ Parameters
+ ----------
+ tokens_batch : list
+ Nested list of token ids (the caller passes `ndarray.tolist()`).
+ sequence_length : iterable
+ Nested values paired element-wise with `tokens_batch`.
+ Returns
+ -------
+ list
+ A flat list of `bytes`, one utf8-encoded text per innermost pair.
+ """
+ outputs = []
+ for beam_tokens, beam_len in zip(tokens_batch, sequence_length):
+ for tokens, _len in zip(beam_tokens, beam_len):
+ # NOTE(review): whether the second argument of `decode` is a
+ # length or an offset depends on `Tokenizer` - confirm.
+ output = self.tokenizer.decode(tokens, _len)
+ output = output.encode('utf8')
+ outputs.append(output)
+ return outputs
diff --git a/triton_models/postprocessing/1/tokenizer/config.json b/triton_models/postprocessing/1/tokenizer/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..84235b8a1a9618cc0ac265caf61ea4088780e3b1
--- /dev/null
+++ b/triton_models/postprocessing/1/tokenizer/config.json
@@ -0,0 +1,37 @@
+{
+ "_name_or_path": "/root/psy/internlm2-7b/work_dirs/internlm2_chat_7b_qlora_oasst1_512_e3_copy/hf_2/merge",
+ "architectures": [
+ "InternLM2ForCausalLM"
+ ],
+ "attn_implementation": "eager",
+ "auto_map": {
+ "AutoConfig": "configuration_internlm.InternLMConfig",
+ "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
+ "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
+ },
+ "bias": false,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "fp16": true,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 32768,
+ "model_type": "internlm",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 8,
+ "pad_token_id": 2,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 2.0,
+ "type": "dynamic"
+ },
+ "rope_theta": 1000000,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.37.2",
+ "use_cache": false,
+ "vocab_size": 92544
+}
diff --git a/triton_models/postprocessing/1/tokenizer/configuration_internlm.py b/triton_models/postprocessing/1/tokenizer/configuration_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d013582feaa1f9970a4256c4a0f77000fa645de
--- /dev/null
+++ b/triton_models/postprocessing/1/tokenizer/configuration_internlm.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" InternLM model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class InternLMConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
+ an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 32000):
+ Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`InternLMModel`]
+ hidden_size (`int`, *optional*, defaults to 4096):
+ Dimension of the hidden representations.
+ intermediate_size (`int`, *optional*, defaults to 11008):
+ Dimension of the MLP representations.
+ num_hidden_layers (`int`, *optional*, defaults to 32):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ num_key_value_heads (`int`, *optional*):
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+ `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+ by meanpooling all the original heads within that group. For more details checkout [this
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+ `num_attention_heads`.
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+ The non-linear activation function (function or string) in the decoder.
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the rms normalization layers.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+ Whether to tie weight embeddings
+ Example:
+
+ ```python
+ >>> from transformers import InternLMModel, InternLMConfig
+
+ >>> # Initializing a InternLM internlm-7b style configuration
+ >>> configuration = InternLMConfig()
+
+ >>> # Initializing a model from the internlm-7b style configuration
+ >>> model = InternLMModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "internlm"
+ _auto_class = "AutoConfig"
+
+ def __init__( # pylint: disable=W0102
+ self,
+ vocab_size=103168,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=0,
+ bos_token_id=1,
+ eos_token_id=2,
+ tie_word_embeddings=False,
+ bias=True,
+ rope_theta=10000,
+ rope_scaling=None,
+ attn_implementation="eager",
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.bias = bias
+
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+ self.num_key_value_heads = num_key_value_heads
+
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+
+ self.attn_implementation = attn_implementation
+ if self.attn_implementation is None:
+ self.attn_implementation = "eager"
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ raise ValueError(
+ "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+ f"got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+ raise ValueError(
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+ )
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
+ raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
diff --git a/triton_models/postprocessing/1/tokenizer/generation_config.json b/triton_models/postprocessing/1/tokenizer/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc5efeadd3bf2caa4462a3be79d580690f410668
--- /dev/null
+++ b/triton_models/postprocessing/1/tokenizer/generation_config.json
@@ -0,0 +1,7 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 2,
+ "transformers_version": "4.37.2"
+}
diff --git a/triton_models/postprocessing/1/tokenizer/modeling_internlm2.py b/triton_models/postprocessing/1/tokenizer/modeling_internlm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..39d6f71d2933385988ec05f845d3f6386c97f74b
--- /dev/null
+++ b/triton_models/postprocessing/1/tokenizer/modeling_internlm2.py
@@ -0,0 +1,1385 @@
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/modeling_llama.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch InternLM2 model."""
+import math
+import queue
+import threading
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from einops import rearrange
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+ SequenceClassifierOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+
+try:
+ from transformers.generation.streamers import BaseStreamer
+except: # noqa # pylint: disable=bare-except
+ BaseStreamer = None
+
+from .configuration_internlm import InternLMConfig as InternLM2Config
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "InternLM2Config"
+
+flash_attn_func, flash_attn_varlen_func = None, None
+pad_input, index_first_axis, unpad_input = None, None, None
+def _import_flash_attn():
+ # Import flash_attn on demand and publish its entry points through the
+ # module-level names assigned just above this function (initially None).
+ # Raises ImportError when the flash_attn imports fail.
+ global flash_attn_func, flash_attn_varlen_func
+ global pad_input, index_first_axis, unpad_input
+ try:
+ from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func
+ from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input
+ flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
+ pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
+ except ImportError:
+ raise ImportError("flash_attn is not installed.")
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+ # Derive three values from `attention_mask`:
+ # - indices: positions of the non-zero entries of the flattened mask
+ # - cu_seqlens: int32 cumulative sum of the last-dim mask sums, with a leading 0
+ # - max_seqlen_in_batch: the largest last-dim mask sum, as a Python number
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
+ # Pad one element on the left so the cumulative sums start at 0.
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+ return (
+ indices,
+ cu_seqlens,
+ max_seqlen_in_batch,
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make the causal mask used for self-attention.
+
+ An entry is 0 where the column index is <= the row index and the minimum value of `dtype` elsewhere. When
+ `past_key_values_length > 0`, that many all-zero columns are prepended. The result is expanded to
+ `[bsz, 1, tgt_len, tgt_len + past_key_values_length]`.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ # Zero out the lower triangle including the diagonal (column < row + 1).
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+
+ `tgt_len` defaults to the source length. In the returned tensor, every non-zero entry of `1.0 - mask` is
+ replaced by the minimum value of `dtype`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ # Invert so that positions where the input mask is 0 become 1 and get filled below.
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2
+class InternLM2RMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ InternLM2RMSNorm is equivalent to T5LayerNorm
+
+ Holds a learnable `weight` of size `hidden_size` (initialised to ones) and the epsilon `eps` added to the
+ variance in `forward`.
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ # The normalisation is computed in float32; the result is cast back to the
+ # input dtype before being multiplied by `weight`.
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ # Mean of squares over the last dimension.
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2
+class InternLM2RotaryEmbedding(nn.Module):
+ # Keeps non-persistent `cos_cached` / `sin_cached` buffers built from `inv_freq`
+ # and returns their first `seq_len` rows from `forward`.
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ # (Re)build the cos/sin tables for positions 0 .. seq_len - 1.
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ # NOTE(review): `seq_len` defaults to None but is compared with an int below;
+ # the callers visible in this file always pass it explicitly.
+ # Rebuild the cached tables when a longer sequence is requested.
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32)
+
+ return (
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
+ )
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2
+class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
+ """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ # Set before calling the parent constructor: the parent builds the cache,
+ # and `_set_cos_sin_cache` below reads `self.scaling_factor`.
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ # Linear scaling: divide the position indices by the scaling factor.
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2
+class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
+ """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
+ Credits to the Reddit users /u/bloc97 and /u/emozilla.
+ """
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ # Set before calling the parent constructor: the parent builds the cache,
+ # and `_set_cos_sin_cache` below may read `self.scaling_factor`.
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
+ # Beyond `max_position_embeddings`, recompute the base (and with it `inv_freq`)
+ # from `scaling_factor` and `seq_len` before rebuilding the tables.
+ if seq_len > self.max_position_embeddings:
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+ """Rotates half the hidden dims of the input.
+
+ Splits the last dimension of `x` at its midpoint into `x1` (first part) and `x2` (second part) and returns
+ the concatenation `(-x2, x1)` along that dimension.
+ """
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ `cos` and `sin` are indexed with `position_ids` and then unsqueezed at `unsqueeze_dim` before being combined
+ with `q` and `k`. Returns the tuple `(q_embed, k_embed)`.
+ """
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class InternLM2MLP(nn.Module):
+ # Gated feed-forward block built from three bias-free linear layers:
+ # `w1` and `w3` map hidden_size -> intermediate_size, `w2` maps back.
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ # Activation looked up by name from `config.hidden_act`.
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ # act_fn(w1(x)) is multiplied element-wise with w3(x), then projected by w2.
+ down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x))
+
+ return down_proj
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+
+ When `n_rep == 1` the input tensor is returned unchanged.
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ # Insert a new axis after the head axis, expand it to n_rep, then merge it into the head axis.
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaAttention
+class InternLM2Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: InternLM2Config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ # Number of query heads per key/value head.
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.is_causal = True
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+
+ # One fused projection; `forward` splits its output into query, key and value states.
+ self.wqkv = nn.Linear(
+ self.hidden_size,
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
+ bias=config.bias,
+ )
+
+ self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
+ self._init_rope()
+
+ def _init_rope(self):
+ # Choose the rotary embedding variant from `config.rope_scaling`:
+ # None -> unscaled, otherwise by its "type" ("dynamic" or "linear").
+ if self.config.rope_scaling is None:
+ self.rotary_emb = InternLM2RotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "dynamic":
+ self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ scaling_factor=scaling_factor,
+ )
+ elif scaling_type == "linear":
+ self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ scaling_factor=scaling_factor,
+ )
+ else:
+ raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.")
+ return self.rotary_emb
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ # View as (bsz, seq_len, num_heads, head_dim) and swap the seq_len and head axes.
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # Returns (attn_output, attn_weights, past_key_value); attn_weights is None unless
+ # `output_attentions` is set, and past_key_value is None unless `use_cache` is set.
+ # A `padding_mask` keyword only triggers the warning below; it is not used otherwise.
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.wqkv(hidden_states)
+
+ # Unpack the fused projection into groups of (num_key_value_groups + 2) slots of size head_dim.
+ qkv_states = rearrange(
+ qkv_states,
+ "b q (h gs d) -> b q h gs d",
+ gs=2 + self.num_key_value_groups,
+ d=self.head_dim,
+ )
+
+ # Within each group: the first num_key_value_groups slots are queries,
+ # the second-to-last slot is the key and the last slot is the value.
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
+ query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
+ key_states = qkv_states[..., -2, :]
+ value_states = qkv_states[..., -1, :]
+
+ # Move the head axis in front of the sequence axis.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ # Total key/value length includes the cached positions, if any.
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ # Repeat key/value heads so their head count matches the query heads.
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ # Scaled dot-product scores.
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ # The mask is additive.
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ # Merge the heads back into the hidden dimension before the output projection.
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ attn_output = self.wo(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaFlashAttention2
+class InternLM2FlashAttention2(InternLM2Attention):
+ """
+ InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any of them.
+ """
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # InternLM2FlashAttention2 attention does not support output_attentions
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ # overwrite attention_mask with padding_mask
+ attention_mask = kwargs.pop("padding_mask")
+
+ output_attentions = False
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.wqkv(hidden_states)
+
+ qkv_states = rearrange(
+ qkv_states,
+ "b q (h gs d) -> b q h gs d",
+ gs=2 + self.num_key_value_groups,
+ d=self.head_dim,
+ )
+
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
+ query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
+ key_states = qkv_states[..., -2, :]
+ value_states = qkv_states[..., -1, :]
+
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ attn_output = self._flash_attention_forward(
+ query_states, key_states, value_states, attention_mask, q_len
+ )
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+ attn_output = self.wo(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+ def _flash_attention_forward(
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+ ):
+ """
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+ first unpad the input, then computes the attention scores and pad the final attention scores.
+
+ Args:
+ query_states (`torch.Tensor`):
+ Input query states to be passed to Flash Attention API
+ key_states (`torch.Tensor`):
+ Input key states to be passed to Flash Attention API
+ value_states (`torch.Tensor`):
+ Input value states to be passed to Flash Attention API
+ attention_mask (`torch.Tensor`):
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+ position of padding tokens and 1 for the position of non-padding tokens.
+ dropout (`int`, *optional*):
+ Attention dropout
+ softmax_scale (`float`, *optional*):
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+ """
+ # Contains at least one padding token in the sequence
+ causal = self.is_causal and query_length != 1
+ if attention_mask is not None:
+ batch_size = query_states.shape[0]
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
+ query_states, key_states, value_states, attention_mask, query_length
+ )
+
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states,
+ key_states,
+ value_states,
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_k=cu_seqlens_k,
+ max_seqlen_q=max_seqlen_in_batch_q,
+ max_seqlen_k=max_seqlen_in_batch_k,
+ dropout_p=dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ )
+
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+ else:
+ attn_output = flash_attn_func(
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+ )
+
+ return attn_output
+
+ def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+ key_layer = index_first_axis(
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+ value_layer = index_first_axis(
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+
+ if query_length == kv_seq_len:
+ query_layer = index_first_axis(
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+ )
+ cu_seqlens_q = cu_seqlens_k
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
+ indices_q = indices_k
+ elif query_length == 1:
+ max_seqlen_in_batch_q = 1
+ cu_seqlens_q = torch.arange(
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
+ ) # There is a memcpy here, that is very bad.
+ indices_q = cu_seqlens_q[:-1]
+ query_layer = query_layer.squeeze(1)
+ else:
+ # The -q_len: slice assumes left padding.
+ attention_mask = attention_mask[:, -query_length:]
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+ return (
+ query_layer,
+ key_layer,
+ value_layer,
+ indices_q.to(torch.int64),
+ (cu_seqlens_q, cu_seqlens_k),
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+ )
+
+INTERNLM2_ATTENTION_CLASSES = {
+ "eager": InternLM2Attention,
+ "flash_attention_2": InternLM2FlashAttention2,
+}
+
+# Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer
+class InternLM2DecoderLayer(nn.Module):
+ def __init__(self, config: InternLM2Config):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+
+ self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config)
+
+ self.feed_forward = InternLM2MLP(config)
+ self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ **kwargs,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*):
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+ query_sequence_length, key_sequence_length)` if default attention is used.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ residual = hidden_states
+
+ hidden_states = self.attention_norm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.attention(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ **kwargs,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.ffn_norm(hidden_states)
+ hidden_states = self.feed_forward(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+# Shared class-level documentation text; it is handed to `add_start_docstrings` on the InternLM2 model classes
+# in this file. The text below is a runtime string, so it is kept exactly as is.
+InternLM2_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`InternLM2Config`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2
+@add_start_docstrings(
+ "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2PreTrainedModel(PreTrainedModel):
+ config_class = InternLM2Config
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["InternLM2DecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+
+# Argument documentation attached to the models' `forward` methods through
+# `add_start_docstrings_to_model_forward`. The text below is a runtime string, so it is kept exactly as is.
+# NOTE(review): the two "head is masked / not masked" bullets at the end of the `attention_mask` entry do not
+# describe any argument listed here; they look like leftovers from an upstream template -- confirm before editing.
+InternLM2_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
+ when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Modified from transformers.model.llama.modeling_llama.LlamaModel
+@add_start_docstrings(
+ "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2Model(InternLM2PreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLM2DecoderLayer`]
+
+ Args:
+ config: InternLM2Config
+ """
+
+ _auto_class = "AutoModel"
+
+ def __init__(self, config: InternLM2Config):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.config = config
+
+ self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+
+ self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ self.tok_embeddings = value
+
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.config.attn_implementation == "flash_attention_2":
+ _import_flash_attn()
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape[:2]
+ elif inputs_embeds is not None:
+ batch_size, seq_length = inputs_embeds.shape[:2]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.tok_embeddings(input_ids)
+
+ if self.config.attn_implementation == "flash_attention_2":
+ # 2d mask is passed through the layers
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+ else:
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ # embed positions
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # None for past_key_value
+ return module(*inputs, output_attentions, None)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ None,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+# Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM
+class InternLM2ForCausalLM(InternLM2PreTrainedModel):
+ _auto_class = "AutoModelForCausalLM"
+
+ _tied_weights_keys = ["output.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = InternLM2Model(config)
+ self.vocab_size = config.vocab_size
+ self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ self.model.tok_embeddings = value
+
+ def get_output_embeddings(self):
+ return self.output
+
+ def set_output_embeddings(self, new_embeddings):
+ self.output = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, InternLM2ForCausalLM
+
+ >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ logits = self.output(hidden_states)
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values is not None:
+ past_length = past_key_values[0][0].shape[2]
+
+ # Some generation methods already pass only the last input ID
+ if input_ids.shape[1] > past_length:
+ remove_prefix_length = past_length
+ else:
+ # Default to old behavior: keep only final ID
+ remove_prefix_length = input_ids.shape[1] - 1
+
+ input_ids = input_ids[:, remove_prefix_length:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -input_ids.shape[1] :]
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+ def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
+ prompt = ""
+ if meta_instruction:
+ prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
+ else:
+ prompt += ""
+ for record in history:
+ prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
+ prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
+ return tokenizer([prompt], return_tensors="pt")
+
+ @torch.no_grad()
+ def chat(
+ self,
+ tokenizer,
+ query: str,
+ history: List[Tuple[str, str]] = [],
+ streamer: Optional[BaseStreamer] = None,
+ max_new_tokens: int = 1024,
+ do_sample: bool = True,
+ temperature: float = 0.8,
+ top_p: float = 0.8,
+ meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+ "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+ "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.",
+ **kwargs,
+ ):
+ inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
+ inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
+ # also add end-of-assistant token in eos token id to avoid unnecessary generation
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
+ outputs = self.generate(
+ **inputs,
+ streamer=streamer,
+ max_new_tokens=max_new_tokens,
+ do_sample=do_sample,
+ temperature=temperature,
+ top_p=top_p,
+ eos_token_id=eos_token_id,
+ **kwargs,
+ )
+ outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
+ response = tokenizer.decode(outputs, skip_special_tokens=True)
+ response = response.split("<|im_end|>")[0]
+ history = history + [(query, response)]
+ return response, history
+
+ @torch.no_grad()
+ def stream_chat(
+ self,
+ tokenizer,
+ query: str,
+ history: List[Tuple[str, str]] = [],
+ max_new_tokens: int = 1024,
+ do_sample: bool = True,
+ temperature: float = 0.8,
+ top_p: float = 0.8,
+ **kwargs,
+ ):
+ """
+ Return a generator in format: (response, history)
+ Eg.
+ ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
+ ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
+ """
+ if BaseStreamer is None:
+ raise ModuleNotFoundError(
+ "The version of `transformers` is too low. Please make sure "
+ "that you have installed `transformers>=4.28.0`."
+ )
+
+ response_queue = queue.Queue(maxsize=20)
+
+ class ChatStreamer(BaseStreamer):
+ def __init__(self, tokenizer) -> None:
+ super().__init__()
+ self.tokenizer = tokenizer
+ self.queue = response_queue
+ self.query = query
+ self.history = history
+ self.response = ""
+ self.received_inputs = False
+ self.queue.put((self.response, history + [(self.query, self.response)]))
+
+ def put(self, value):
+ if len(value.shape) > 1 and value.shape[0] > 1:
+ raise ValueError("ChatStreamer only supports batch size 1")
+ elif len(value.shape) > 1:
+ value = value[0]
+
+ if not self.received_inputs:
+ # The first received value is input_ids, ignore here
+ self.received_inputs = True
+ return
+
+ token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
+ if token.strip() != "<|im_end|>":
+ self.response = self.response + token
+ history = self.history + [(self.query, self.response)]
+ self.queue.put((self.response, history))
+
+ def end(self):
+ self.queue.put(None)
+
+ def stream_producer():
+ return self.chat(
+ tokenizer=tokenizer,
+ query=query,
+ streamer=ChatStreamer(tokenizer=tokenizer),
+ history=history,
+ max_new_tokens=max_new_tokens,
+ do_sample=do_sample,
+ temperature=temperature,
+ top_p=top_p,
+ **kwargs,
+ )
+
+ def consumer():
+ producer = threading.Thread(target=stream_producer)
+ producer.start()
+ while True:
+ res = response_queue.get()
+ if res is None:
+ return
+ yield res
+
+ return consumer()
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
+@add_start_docstrings(
+ """
+ The InternLM2 Model transformer with a sequence classification head on top (linear layer).
+
+ [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification,
+ as other causal models (e.g. GPT-2) do.
+
+ Since it does classification on the last token, it requires to know the position of the last token. If a
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+ each row of the batch).
+ """,
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2ForSequenceClassification(InternLM2PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = InternLM2Model(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ self.model.tok_embeddings = value
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to(
+ logits.device
+ )
+ else:
+ sequence_lengths = -1
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
diff --git a/triton_models/postprocessing/1/tokenizer/placeholder b/triton_models/postprocessing/1/tokenizer/placeholder
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/triton_models/postprocessing/1/tokenizer/pytorch_model.bin.index.json b/triton_models/postprocessing/1/tokenizer/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..7d95cf180df4c423e817c55f30f5ce93ac80e220
--- /dev/null
+++ b/triton_models/postprocessing/1/tokenizer/pytorch_model.bin.index.json
@@ -0,0 +1,554 @@
+{
+ "metadata": {
+ "total_size": 5251801088
+ },
+ "weight_map": {
+ "model.layers.0.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.2.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.20.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.3.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.30.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.4.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.tok_embeddings.weight": "pytorch_model-00001-of-00003.bin",
+ "output.weight": "pytorch_model-00003-of-00003.bin"
+ }
+}
diff --git a/triton_models/postprocessing/1/tokenizer/special_tokens_map.json b/triton_models/postprocessing/1/tokenizer/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..492d4b2966a1763442d426d880dbc29f94906e4c
--- /dev/null
+++ b/triton_models/postprocessing/1/tokenizer/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/triton_models/postprocessing/1/tokenizer/tokenization_internlm.py b/triton_models/postprocessing/1/tokenizer/tokenization_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9792349c7fed6fc64476eabdd9dad7a84640c3ee
--- /dev/null
+++ b/triton_models/postprocessing/1/tokenizer/tokenization_internlm.py
@@ -0,0 +1,240 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ """Tokenization classes for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging
+
+ # Module-level logger obtained from the transformers logging utilities.
+ logger = logging.get_logger(__name__)
+
+ # Maps the tokenizer resource name to its file name. Exposed on the tokenizer
+ # class as `vocab_files_names` and reused by `save_vocabulary` to build the
+ # output path.
+ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+
+ # Left empty in this module; exposed on the tokenizer class as
+ # `pretrained_vocab_files_map`.
+ PRETRAINED_VOCAB_FILES_MAP = {}
+
+
+class InternLMTokenizer(PreTrainedTokenizer):
+ """
+ Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ model_input_names = ["input_ids", "attention_mask"]
+ _auto_class = "AutoTokenizer"
+
+ def __init__(
+ self,
+ vocab_file,
+ unk_token="",
+ bos_token="",
+ eos_token="",
+ pad_token="",
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ add_bos_token=True,
+ add_eos_token=False,
+ decode_with_prefix_space=False,
+ clean_up_tokenization_spaces=False,
+ **kwargs,
+ ):
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.decode_with_prefix_space = decode_with_prefix_space
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+ self._no_prefix_space_tokens = None
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ **kwargs,
+ )
+
+ """ Initialization"""
+
+ @property
+ def no_prefix_space_tokens(self):
+ if self._no_prefix_space_tokens is None:
+ vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+ self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
+ return self._no_prefix_space_tokens
+
+ @property
+ def vocab_size(self):
+ """Returns vocab size"""
+ return self.sp_model.get_piece_size()
+
+ @property
+ def bos_token_id(self) -> Optional[int]:
+ return self.sp_model.bos_id()
+
+ @property
+ def eos_token_id(self) -> Optional[int]:
+ return self.sp_model.eos_id()
+
+ def get_vocab(self):
+ """Returns vocab as a dict"""
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def _tokenize(self, text):
+ """Returns a tokenized string."""
+ return self.sp_model.encode(text, out_type=str)
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.sp_model.piece_to_id(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ token = self.sp_model.IdToPiece(index)
+ return token
+
+ def _maybe_add_prefix_space(self, tokens, decoded):
+ if tokens and tokens[0] not in self.no_prefix_space_tokens:
+ return " " + decoded
+ else:
+ return decoded
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for token in tokens:
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special:
+ out_string += " "
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.sp_model.decode(current_sub_tokens)
+ out_string = self.clean_up_tokenization(out_string)
+ out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+ return out_string[1:]
+
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ """
+ Save the vocabulary and special tokens file to a directory.
+
+ Args:
+ save_directory (`str`):
+ The directory in which to save the vocabulary.
+
+ Returns:
+ `Tuple(str)`: Paths to the files saved.
+ """
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ if self.add_bos_token:
+ bos_token_ids = [self.bos_token_id]
+ else:
+ bos_token_ids = []
+
+ output = bos_token_ids + token_ids_0
+
+ if token_ids_1 is not None:
+ output = output + token_ids_1
+
+ if self.add_eos_token:
+ output = output + [self.eos_token_id]
+
+ return output
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is None:
+ return [1] + ([0] * len(token_ids_0)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
+ use of token type ids, therefore a list of zeros is returned.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of zeros.
+ """
+ eos = [self.eos_token_id]
+
+ if token_ids_1 is None:
+ return len(token_ids_0 + eos) * [0]
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
diff --git a/triton_models/postprocessing/1/tokenizer/tokenizer.model b/triton_models/postprocessing/1/tokenizer/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408
--- /dev/null
+++ b/triton_models/postprocessing/1/tokenizer/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
+size 1477754
diff --git a/triton_models/postprocessing/1/tokenizer/tokenizer.py b/triton_models/postprocessing/1/tokenizer/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..db936a5501cb07d33d56083656dbd734ba7431bf
--- /dev/null
+++ b/triton_models/postprocessing/1/tokenizer/tokenizer.py
@@ -0,0 +1,400 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os
+import os.path as osp
+from collections import deque
+from typing import List, Optional, Sequence, Union
+
+import torch
+
+from lmdeploy.utils import get_logger
+
+ # This file will be copied to the triton server, so make sure every
+ # import starts from the package root `lmdeploy`.
+
+
+class SentencePieceTokenizer:
+ """Tokenizer of sentencepiece.
+
+ Args:
+ model_file (str): the path of the tokenizer model
+ """
+
+ def __init__(self, model_file: str):
+ from sentencepiece import SentencePieceProcessor
+ self.model = SentencePieceProcessor(model_file=model_file)
+ self._prefix_space_tokens = None
+ # for stop words
+ self._maybe_decode_bytes: bool = None
+ # TODO maybe lack a constant.py
+ self._indexes_tokens_deque = deque(maxlen=10)
+ self.max_indexes_num = 5
+ self.logger = get_logger('lmdeploy')
+
+ @property
+ def vocab_size(self):
+ """vocabulary size."""
+ return self.model.vocab_size()
+
+ @property
+ def bos_token_id(self):
+ """begine of the sentence token id."""
+ return self.model.bos_id()
+
+ @property
+ def eos_token_id(self):
+ """end of the sentence token id."""
+ return self.model.eos_id()
+
+ @property
+ def prefix_space_tokens(self):
+ """tokens without prefix space."""
+ if self._prefix_space_tokens is None:
+ vocab = self.model.IdToPiece(list(range(self.vocab_size)))
+ self._prefix_space_tokens = {
+ i
+ for i, tok in enumerate(vocab) if tok.startswith('▁')
+ }
+ return self._prefix_space_tokens
+
+ def _maybe_add_prefix_space(self, tokens, decoded):
+ """maybe add prefix space for incremental decoding."""
+ if len(tokens) and not decoded.startswith(' ') and\
+ tokens[0] in self.prefix_space_tokens:
+ return ' ' + decoded
+ else:
+ return decoded
+
+ def indexes_containing_token(self, token: str):
+ """Return all the possible indexes, whose decoding output may contain
+ the input token."""
+ # traversing vocab is time consuming, can not be accelerated with
+ # multi threads (computation) or multi process (can't pickle tokenizer)
+ # so, we maintain latest 10 stop words and return directly if matched
+ for _token, _indexes in self._indexes_tokens_deque:
+ if token == _token:
+ return _indexes
+ if token == ' ': # ' ' is special
+ token = '▁'
+ vocab = self.model.IdToPiece(list(range(self.vocab_size)))
+ indexes = [i for i, voc in enumerate(vocab) if token in voc]
+ if len(indexes) > self.max_indexes_num:
+ indexes = self.encode(token, add_bos=False)[-1:]
+ self.logger.warning(
+ f'There are too many(>{self.max_indexes_num}) possible '
+ f'indexes may decoding {token}, we will use {indexes} only')
+ self._indexes_tokens_deque.append((token, indexes))
+ return indexes
+
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
+ """Tokenize a prompt.
+
+ Args:
+ s (str): a prompt
+ Returns:
+ list[int]: token ids
+ """
+ return self.model.Encode(s, add_bos=add_bos, **kwargs)
+
+ def decode(self, t: Sequence[int], offset: Optional[int] = None):
+ """De-tokenize.
+
+ Args:
+ t (List[int]): a list of token ids
+ offset (int): for incrementally decoding. Default to None, which
+ means not applied.
+ Returns:
+ str: text of decoding tokens
+ """
+ if isinstance(t, torch.Tensor):
+ t = t.tolist()
+ t = t[offset:]
+ out_string = self.model.Decode(t)
+ if offset:
+ out_string = self._maybe_add_prefix_space(t, out_string)
+ return out_string
+
+ def __call__(self, s: Union[str, Sequence[str]]):
+ """Tokenize prompts.
+
+ Args:
+ s (str): prompts
+ Returns:
+ list[int]: token ids
+ """
+ import addict
+ add_bos = False
+ add_eos = False
+
+ input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
+ return addict.Addict(input_ids=input_ids)
+
+
+class HuggingFaceTokenizer:
+ """Tokenizer of sentencepiece.
+
+ Args:
+ model_dir (str): the directory of the tokenizer model
+ """
+
+ def __init__(self, model_dir: str):
+ from transformers import AutoTokenizer
+ model_file = osp.join(model_dir, 'tokenizer.model')
+ backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
+ model_file_exists = osp.exists(model_file)
+ self.logger = get_logger('lmdeploy')
+ if not osp.exists(backend_tokenizer_file) and model_file_exists:
+ self.logger.warning(
+ 'Can not find tokenizer.json. '
+ 'It may take long time to initialize the tokenizer.')
+ self.model = AutoTokenizer.from_pretrained(model_dir,
+ trust_remote_code=True)
+ self._prefix_space_tokens = None
+ # save tokenizer.json to reuse
+ if not osp.exists(backend_tokenizer_file) and model_file_exists:
+ if hasattr(self.model, 'backend_tokenizer'):
+ if os.access(model_dir, os.W_OK):
+ self.model.backend_tokenizer.save(backend_tokenizer_file)
+
+ if self.model.eos_token_id is None:
+ generation_config_file = osp.join(model_dir,
+ 'generation_config.json')
+ if osp.exists(generation_config_file):
+ with open(generation_config_file, 'r') as f:
+ cfg = json.load(f)
+ self.model.eos_token_id = cfg['eos_token_id']
+ elif hasattr(self.model, 'eod_id'): # Qwen remote
+ self.model.eos_token_id = self.model.eod_id
+
+ # for stop words
+ self._vocab_size_with_added: int = None
+ self._maybe_decode_bytes: bool = None
+ # TODO maybe lack a constant.py
+ self._indexes_tokens_deque = deque(maxlen=10)
+ self.max_indexes_num = 5
+ self.token2id = {}
+
+ @property
+ def vocab_size(self):
+ """vocabulary size."""
+ return self.model.vocab_size
+
+ @property
+ def vocab_size_with_added(self):
+ """vocabulary size with added vocab."""
+ if self._vocab_size_with_added is not None:
+ return self._vocab_size_with_added
+ self._vocab_size_with_added = len(self.model.get_vocab())
+ return self._vocab_size_with_added
+
+ @property
+ def bos_token_id(self):
+ """begine of the sentence token id."""
+ return self.model.bos_token_id
+
+ @property
+ def eos_token_id(self):
+ """end of the sentence token id."""
+ return self.model.eos_token_id
+
+ @property
+ def prefix_space_tokens(self):
+ """tokens without prefix space."""
+ if self._prefix_space_tokens is None:
+ vocab = self.model.convert_ids_to_tokens(
+ list(range(self.vocab_size)))
+ self._prefix_space_tokens = {
+ i
+ for i, tok in enumerate(vocab)
+ if tok.startswith('▁' if isinstance(tok, str) else b' ')
+ }
+ return self._prefix_space_tokens
+
+ def _maybe_add_prefix_space(self, tokens: List[int], decoded: str):
+ """maybe add prefix space for incremental decoding."""
+ if len(tokens) and not decoded.startswith(' ') and\
+ tokens[0] in self.prefix_space_tokens:
+ return ' ' + decoded
+ else:
+ return decoded
+
+ @property
+ def maybe_decode_bytes(self):
+ """Check if self.model.convert_ids_to_tokens return not a str value."""
+ if self._maybe_decode_bytes is None:
+ self._maybe_decode_bytes = False
+ vocab = self.model.convert_ids_to_tokens(
+ list(range(self.vocab_size)))
+ for tok in vocab:
+ if not isinstance(tok, str):
+ self._maybe_decode_bytes = True
+ break
+ return self._maybe_decode_bytes
+
+ def indexes_containing_token(self, token: str):
+ """Return all the possible indexes, whose decoding output may contain
+ the input token."""
+ # traversing vocab is time consuming, can not be accelerated with
+ # multi threads (computation) or multi process (can't pickle tokenizer)
+ # so, we maintain latest 10 stop words and return directly if matched
+ for _token, _indexes in self._indexes_tokens_deque:
+ if token == _token:
+ return _indexes
+
+ if self.token2id == {}:
+ # decode is slower than convert_ids_to_tokens
+ if self.maybe_decode_bytes:
+ self.token2id = {
+ self.model.decode(i): i
+ for i in range(self.vocab_size)
+ }
+ else:
+ self.token2id = {
+ self.model.convert_ids_to_tokens(i): i
+ for i in range(self.vocab_size)
+ }
+ if token == ' ': # ' ' is special
+ token = '▁'
+ indexes = [i for _token, i in self.token2id.items() if token in _token]
+ if len(indexes) > self.max_indexes_num:
+ indexes = self.encode(token, add_bos=False)[-1:]
+ self.logger.warning(
+ f'There are too many(>{self.max_indexes_num}) possible '
+ f'indexes may decoding {token}, we will use {indexes} only')
+ # there might be token id that exceeds self.vocab_size
+ if len(indexes) == 0:
+ indexes = self.encode(token, False)
+ if len(indexes) != 1:
+ self.logger.warning(
+ f'The token {token}, its length of indexes {indexes} is '
+ 'not 1. Currently, it can not be used as stop words')
+ indexes = []
+ self._indexes_tokens_deque.append((token, indexes))
+ return indexes
+
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
+ """Tokenize a prompt.
+
+ Args:
+ s (str): a prompt
+ Returns:
+ list[int]: token ids
+ """
+ encoded = self.model.encode(s, **kwargs)
+ if not add_bos:
+ # in the middle of a session
+ if len(encoded) and encoded[0] == self.bos_token_id:
+ encoded = encoded[1:]
+ return encoded
+
+ def decode(self, t: Sequence[int], offset: Optional[int] = None):
+ """De-tokenize.
+
+ Args:
+ t (List[int]): a list of token ids
+ offset (int): for incrementally decoding. Default to None, which
+ means not applied.
+ Returns:
+ str: text of decoding tokens
+ """
+ skip_special_tokens = True
+ t = t[offset:]
+ out_string = self.model.decode(t,
+ skip_special_tokens=skip_special_tokens)
+ if offset:
+ out_string = self._maybe_add_prefix_space(t, out_string)
+ return out_string
+
+ def __call__(self, s: Union[str, Sequence[str]]):
+ """Tokenize prompts.
+
+ Args:
+ s (str): prompts
+ Returns:
+ list[int]: token ids
+ """
+ add_special_tokens = False
+ return self.model(s, add_special_tokens=add_special_tokens)
+
+
+class Tokenizer:
+ """Tokenize prompts or de-tokenize tokens into texts.
+
+ Args:
+ model_file (str): the path of the tokenizer model
+ """
+
+ def __init__(self, model_file: str):
+ if model_file.endswith('.model'):
+ model_folder = osp.split(model_file)[0]
+ else:
+ model_folder = model_file
+ model_file = osp.join(model_folder, 'tokenizer.model')
+ tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')
+
+ model_file_exists = osp.exists(model_file)
+ config_exists = osp.exists(tokenizer_config_file)
+ use_hf_model = config_exists or not model_file_exists
+ self.logger = get_logger('lmdeploy')
+ if not use_hf_model:
+ self.model = SentencePieceTokenizer(model_file)
+ else:
+ self.model = HuggingFaceTokenizer(model_folder)
+
+ @property
+ def vocab_size(self):
+ """vocabulary size."""
+ return self.model.vocab_size
+
+ @property
+ def bos_token_id(self):
+ """begine of the sentence token id."""
+ return self.model.bos_token_id
+
+ @property
+ def eos_token_id(self):
+ """end of the sentence token id."""
+ return self.model.eos_token_id
+
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
+ """Tokenize a prompt.
+
+ Args:
+ s (str): a prompt
+ Returns:
+ list[int]: token ids
+ """
+ return self.model.encode(s, add_bos, **kwargs)
+
+ def decode(self, t: Sequence[int], offset: Optional[int] = None):
+ """De-tokenize.
+
+ Args:
+ t (List[int]): a list of token ids
+ offset (int): for incrementally decoding. Default to None, which
+ means not applied.
+ Returns:
+ str: text of decoding tokens
+ """
+ return self.model.decode(t, offset)
+
+ def __call__(self, s: Union[str, Sequence[str]]):
+ """Tokenize prompts.
+
+ Args:
+ s (str): prompts
+ Returns:
+ list[int]: token ids
+ """
+ return self.model(s)
+
+ def indexes_containing_token(self, token):
+ """Return all the possible indexes, whose decoding output may contain
+ the input token."""
+ encoded = self.encode(token, add_bos=False)
+ if len(encoded) > 1:
+ self.logger.warning(
+ f'The token {token}, its length of indexes {encoded} is over '
+ 'than 1. Currently, it can not be used as stop words')
+ return []
+ return self.model.indexes_containing_token(token)
diff --git a/triton_models/postprocessing/1/tokenizer/tokenizer_config.json b/triton_models/postprocessing/1/tokenizer/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f133449013be570f08fdf7c70f1a2c8ccb4724da
--- /dev/null
+++ b/triton_models/postprocessing/1/tokenizer/tokenizer_config.json
@@ -0,0 +1,90 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92538": {
+ "content": "<|plugin|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92539": {
+ "content": "<|interpreter|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92540": {
+ "content": "<|action_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92541": {
+ "content": "<|action_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92542": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92543": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_internlm.InternLMTokenizer",
+ null
+ ]
+ },
+ "bos_token": "<s>",
+ "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "tokenizer_class": "InternLMTokenizer",
+ "unk_token": "<unk>"
+}
diff --git a/triton_models/postprocessing/config.pbtxt b/triton_models/postprocessing/config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4c3fd1041dcd03dc5c18b3fc28533cb82ac5653
--- /dev/null
+++ b/triton_models/postprocessing/config.pbtxt
@@ -0,0 +1,36 @@
+name: "postprocessing"
+backend: "python"
+max_batch_size: 1
+input [
+ {
+ name: "TOKENS_BATCH"
+ data_type: TYPE_UINT32
+ dims: [ -1, -1 ]
+ },
+ {
+ name: "sequence_length"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ }
+]
+output [
+ {
+ name: "OUTPUT"
+ data_type: TYPE_STRING
+ dims: [ -1, -1 ]
+ }
+]
+
+instance_group [
+ {
+ count: 16
+ kind: KIND_CPU
+ }
+]
+
+parameters {
+ key: "tokenizer_path"
+ value: {
+ string_value: "tokenizer/tokenizer.model"
+ }
+}
diff --git a/triton_models/preprocessing/1/__pycache__/model.cpython-310.pyc b/triton_models/preprocessing/1/__pycache__/model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..447bea773ddcc3daff21ef636ce8437c6632fed8
Binary files /dev/null and b/triton_models/preprocessing/1/__pycache__/model.cpython-310.pyc differ
diff --git a/triton_models/preprocessing/1/model.py b/triton_models/preprocessing/1/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e659fbae01737bd0a83980faf0e1eff9e607c3f
--- /dev/null
+++ b/triton_models/preprocessing/1/model.py
@@ -0,0 +1,151 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+from pathlib import Path
+
+import numpy as np
+import torch
+import triton_python_backend_utils as pb_utils
+from torch.nn.utils.rnn import pad_sequence
+
+# This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served
+# by triton inference server, it has to be converted first by running
+# `python lmdeploy/serve/turbomind/deploy.py`. Then
+# `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py`
+from .tokenizer.tokenizer import Tokenizer
+
+
+class TritonPythonModel:
+ """Your Python model must use the same class name.
+
+ Every Python model that is created must have "TritonPythonModel" as the
+ class name.
+ """
+
+ def initialize(self, args):
+ """`initialize` is called only once when the model is being loaded.
+ Implementing `initialize` function is optional. This function allows
+ the model to initialize any state associated with this model.
+ Parameters
+ ----------
+ args : dict
+ Both keys and values are strings. The dictionary keys and values are:
+ * model_config: A JSON string containing the model configuration
+ * model_instance_kind: A string containing model instance kind
+ * model_instance_device_id: A string containing model instance device
+ ID
+ * model_repository: Model repository path
+ * model_version: Model version
+ * model_name: Model name
+ """
+ # Parse model configs
+ self.model_config = model_config = json.loads(args['model_config'])
+
+ # Parse model output configs and convert Triton types to numpy types
+ input_names = ['INPUT_ID', 'REQUEST_INPUT_LEN']
+ for input_name in input_names:
+ setattr(
+ self,
+ input_name.lower() + '_dtype',
+ pb_utils.triton_string_to_numpy(
+ pb_utils.get_output_config_by_name(
+ model_config, input_name)['data_type']))
+
+ cur_folder = Path(__file__).parent
+ self.tokenizer = Tokenizer(
+ osp.join(
+ cur_folder, self.model_config['parameters']['tokenizer_path']
+ ['string_value']))
+ self.start_id = self.tokenizer.bos_token_id
+ self.end_id = self.tokenizer.eos_token_id
+
+ def execute(self, requests):
+ """`execute` must be implemented in every Python model. `execute`
+ function receives a list of pb_utils.InferenceRequest as the only
+ argument. This function is called when an inference is requested
+ for this model. Depending on the batching configuration (e.g. Dynamic
+ Batching) used, `requests` may contain multiple requests. Every
+ Python model, must create one pb_utils.InferenceResponse for every
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
+ set the error argument when creating a pb_utils.InferenceResponse.
+ Parameters
+ ----------
+ requests : list
+ A list of pb_utils.InferenceRequest
+ Returns
+ -------
+ list
+ A list of pb_utils.InferenceResponse. The length of this list must
+ be the same as `requests`
+ """
+
+ responses = []
+
+ # Every Python backend must iterate over every one of the requests
+ # and create a pb_utils.InferenceResponse for each of them.
+ for idx, request in enumerate(requests):
+ # Get input tensors
+ query = pb_utils.get_input_tensor_by_name(request,
+ 'QUERY').as_numpy()
+
+ # Preprocessing input data.
+ input_id, request_input_len = self._create_request(query)
+
+ # Create output tensors. You need pb_utils.Tensor
+ # objects to create pb_utils.InferenceResponse.
+ input_id_tensor = pb_utils.Tensor(
+ 'INPUT_ID',
+ np.array(input_id).astype(self.input_id_dtype))
+ request_input_len_tensor = pb_utils.Tensor(
+ 'REQUEST_INPUT_LEN',
+ np.array(request_input_len).astype(
+ self.request_input_len_dtype))
+
+ # Create InferenceResponse. You can set an error here in case
+ # there was a problem with handling this inference request.
+ # Below is an example of how you can set errors in inference
+ # response:
+ #
+ # pb_utils.InferenceResponse(
+ # output_tensors=..., TritonError("An error occurred"))
+ inference_response = pb_utils.InferenceResponse(
+ output_tensors=[input_id_tensor, request_input_len_tensor])
+ responses.append(inference_response)
+
+ # You should return a list of pb_utils.InferenceResponse. Length
+ # of this list must match the length of `requests` list.
+ return responses
+
+ def finalize(self):
+ """`finalize` is called only once when the model is being unloaded.
+
+ Implementing `finalize` function is optional. This function allows the
+ model to perform any necessary clean ups before exit.
+ """
+ print('Cleaning up...')
+
+ def _create_request(self, query):
+ """Tokenize prompts and return the token ids and their length.
+
+ Args:
+ query (List[str]): a list of prompt
+ Returns:
+ tuple: token ids and their length
+ """
+ start_ids = []
+ for s in query:
+ _s = s[0].decode()
+ if _s == '':
+ start_id = [self.start_id
+ ] if self.start_id is not None else [-1]
+ elif _s == '':
+ start_id = [self.end_id] if self.end_id is not None else [-1]
+ else:
+ start_id = self.tokenizer.encode(_s)
+ start_ids.append(torch.IntTensor(start_id))
+
+ start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
+ start_ids = pad_sequence(start_ids,
+ batch_first=True,
+ padding_value=self.end_id)
+ return start_ids, start_lengths
diff --git a/triton_models/preprocessing/1/tokenizer/config.json b/triton_models/preprocessing/1/tokenizer/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..84235b8a1a9618cc0ac265caf61ea4088780e3b1
--- /dev/null
+++ b/triton_models/preprocessing/1/tokenizer/config.json
@@ -0,0 +1,37 @@
+{
+ "_name_or_path": "/root/psy/internlm2-7b/work_dirs/internlm2_chat_7b_qlora_oasst1_512_e3_copy/hf_2/merge",
+ "architectures": [
+ "InternLM2ForCausalLM"
+ ],
+ "attn_implementation": "eager",
+ "auto_map": {
+ "AutoConfig": "configuration_internlm.InternLMConfig",
+ "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
+ "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
+ },
+ "bias": false,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "fp16": true,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 32768,
+ "model_type": "internlm",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 8,
+ "pad_token_id": 2,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 2.0,
+ "type": "dynamic"
+ },
+ "rope_theta": 1000000,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.37.2",
+ "use_cache": false,
+ "vocab_size": 92544
+}
diff --git a/triton_models/preprocessing/1/tokenizer/configuration_internlm.py b/triton_models/preprocessing/1/tokenizer/configuration_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d013582feaa1f9970a4256c4a0f77000fa645de
--- /dev/null
+++ b/triton_models/preprocessing/1/tokenizer/configuration_internlm.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" InternLM model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class InternLMConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
+ an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 103168):
+ Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`InternLMModel`]
+ hidden_size (`int`, *optional*, defaults to 4096):
+ Dimension of the hidden representations.
+ intermediate_size (`int`, *optional*, defaults to 11008):
+ Dimension of the MLP representations.
+ num_hidden_layers (`int`, *optional*, defaults to 32):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ num_key_value_heads (`int`, *optional*):
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+ by meanpooling all the original heads within that group. For more details checkout [this
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+ `num_attention_heads`.
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+ The non-linear activation function (function or string) in the decoder.
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+ The epsilon used by the rms normalization layers.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+ Whether to tie weight embeddings
+ Example:
+
+ ```python
+ >>> from transformers import InternLMModel, InternLMConfig
+
+ >>> # Initializing a InternLM internlm-7b style configuration
+ >>> configuration = InternLMConfig()
+
+ >>> # Initializing a model from the internlm-7b style configuration
+ >>> model = InternLMModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "internlm"
+ _auto_class = "AutoConfig"
+
+ def __init__( # pylint: disable=W0102
+ self,
+ vocab_size=103168,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=0,
+ bos_token_id=1,
+ eos_token_id=2,
+ tie_word_embeddings=False,
+ bias=True,
+ rope_theta=10000,
+ rope_scaling=None,
+ attn_implementation="eager",
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.bias = bias
+
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+ self.num_key_value_heads = num_key_value_heads
+
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+
+ self.attn_implementation = attn_implementation
+ if self.attn_implementation is None:
+ self.attn_implementation = "eager"
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ raise ValueError(
+ "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+ f"got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+ raise ValueError(
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+ )
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
+ raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
diff --git a/triton_models/preprocessing/1/tokenizer/generation_config.json b/triton_models/preprocessing/1/tokenizer/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc5efeadd3bf2caa4462a3be79d580690f410668
--- /dev/null
+++ b/triton_models/preprocessing/1/tokenizer/generation_config.json
@@ -0,0 +1,7 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 2,
+ "transformers_version": "4.37.2"
+}
diff --git a/triton_models/preprocessing/1/tokenizer/modeling_internlm2.py b/triton_models/preprocessing/1/tokenizer/modeling_internlm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..39d6f71d2933385988ec05f845d3f6386c97f74b
--- /dev/null
+++ b/triton_models/preprocessing/1/tokenizer/modeling_internlm2.py
@@ -0,0 +1,1385 @@
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/modeling_llama.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch InternLM2 model."""
+import math
+import queue
+import threading
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from einops import rearrange
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+ SequenceClassifierOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+
+try:
+ from transformers.generation.streamers import BaseStreamer
+except: # noqa # pylint: disable=bare-except
+ BaseStreamer = None
+
+from .configuration_internlm import InternLMConfig as InternLM2Config
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "InternLM2Config"
+
+flash_attn_func, flash_attn_varlen_func = None, None
+pad_input, index_first_axis, unpad_input = None, None, None
+def _import_flash_attn():
+ global flash_attn_func, flash_attn_varlen_func
+ global pad_input, index_first_axis, unpad_input
+ try:
+ from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func
+ from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input
+ flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
+ pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
+ except ImportError:
+ raise ImportError("flash_attn is not installed.")
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+ return (
+ indices,
+ cu_seqlens,
+ max_seqlen_in_batch,
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for uni-directional (causal) self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2
+class InternLM2RMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ InternLM2RMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2
+class InternLM2RotaryEmbedding(nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32)
+
+ return (
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
+ )
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2
+class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
+ """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2
+class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
+ """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
+ Credits to the Reddit users /u/bloc97 and /u/emozilla.
+ """
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
+ if seq_len > self.max_position_embeddings:
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+# Copied from transformers.model.llama.modeling_llama.rotate_half
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors."""
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class InternLM2MLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x))
+
+ return down_proj
+
+
+# Copied from transformers.model.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Modified from transformers.model.llama.modeling_llama.LlamaAttention
+class InternLM2Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: InternLM2Config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.is_causal = True
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+
+ self.wqkv = nn.Linear(
+ self.hidden_size,
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
+ bias=config.bias,
+ )
+
+ self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = InternLM2RotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "dynamic":
+ self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ scaling_factor=scaling_factor,
+ )
+ elif scaling_type == "linear":
+ self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ scaling_factor=scaling_factor,
+ )
+ else:
+ raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.")
+ return self.rotary_emb
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Compute self-attention with explicit matmul/softmax over a fused QKV projection.
+
+ Args:
+ hidden_states: Input that is unpacked below as `(bsz, q_len, hidden)`.
+ attention_mask: Optional mask added to the raw attention scores; it must have size
+ `(bsz, 1, q_len, kv_seq_len)`.
+ position_ids: Forwarded unchanged to `apply_rotary_pos_emb`.
+ past_key_value: Optional `(key, value)` pair from earlier steps; new keys/values are
+ concatenated to it along dim 2.
+ output_attentions: When false, the returned attention weights are `None`.
+ use_cache: When true, the (concatenated) `(key, value)` pair is returned.
+ **kwargs: Only inspected for the deprecated `padding_mask` key, which triggers a warning.
+
+ Returns:
+ The tuple `(attn_output, attn_weights, past_key_value)`.
+
+ Raises:
+ ValueError: If the attention scores, the mask, or the attention output do not have
+ the expected size.
+ """
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ # One fused projection produces queries, keys and values together.
+ qkv_states = self.wqkv(hidden_states)
+
+ # Split the fused channel axis into (group h, slot gs, head_dim d). Each group holds
+ # `num_key_value_groups` query slots followed by one key slot and one value slot.
+ qkv_states = rearrange(
+ qkv_states,
+ "b q (h gs d) -> b q h gs d",
+ gs=2 + self.num_key_value_groups,
+ d=self.head_dim,
+ )
+
+ # Leading slots are queries (flattened back into a single head axis); the last two
+ # slots are the key and the value of the group.
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
+ query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
+ key_states = qkv_states[..., -2, :]
+ value_states = qkv_states[..., -1, :]
+
+ # Move the head axis in front of the sequence axis.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ # Total key/value length = new tokens + tokens already held in the cache.
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ # The cache is captured before `repeat_kv` expands keys/values below.
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ # Scaled dot-product scores.
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ # The mask is additive: it is summed with the scores before the softmax.
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ # Merge the heads back into one hidden axis before the output projection.
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ attn_output = self.wo(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaFlashAttention2
+class InternLM2FlashAttention2(InternLM2Attention):
+ """
+ InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any of them.
+ """
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Compute self-attention through `_flash_attention_forward`.
+
+ Args:
+ hidden_states: Input that is unpacked below as `(bsz, q_len, hidden)`.
+ attention_mask: Optional mask handed to `_flash_attention_forward`; it is replaced
+ by the deprecated `padding_mask` keyword when that keyword is supplied.
+ position_ids: Forwarded unchanged to `apply_rotary_pos_emb`.
+ past_key_value: Optional `(key, value)` pair from earlier steps; new keys/values are
+ concatenated to it along dim 2.
+ output_attentions: Ignored; it is overwritten with `False` in the body.
+ use_cache: When true, the (concatenated) `(key, value)` pair is returned.
+ **kwargs: Only inspected for the deprecated `padding_mask` key.
+
+ Returns:
+ The tuple `(attn_output, None, past_key_value)`; attention weights are never returned.
+ """
+ # InternLM2FlashAttention2 attention does not support output_attentions
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ # overwrite attention_mask with padding_mask
+ attention_mask = kwargs.pop("padding_mask")
+
+ output_attentions = False
+
+ bsz, q_len, _ = hidden_states.size()
+
+ # One fused projection produces queries, keys and values together.
+ qkv_states = self.wqkv(hidden_states)
+
+ # Split the fused channel axis into (group h, slot gs, head_dim d). Each group holds
+ # `num_key_value_groups` query slots followed by one key slot and one value slot.
+ qkv_states = rearrange(
+ qkv_states,
+ "b q (h gs d) -> b q h gs d",
+ gs=2 + self.num_key_value_groups,
+ d=self.head_dim,
+ )
+
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
+ query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
+ key_states = qkv_states[..., -2, :]
+ value_states = qkv_states[..., -1, :]
+
+ # Head axis first, as used by the rotary embedding and the cache concatenation below.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ # Total key/value length = new tokens + tokens already held in the cache.
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ # Swap back to sequence-before-heads, the layout passed to `_flash_attention_forward`.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ attn_output = self._flash_attention_forward(
+ query_states, key_states, value_states, attention_mask, q_len
+ )
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+ attn_output = self.wo(attn_output)
+
+ # `output_attentions` was forced to False above, so this assignment always runs.
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+ def _flash_attention_forward(
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+ ):
+ """
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+ first unpad the input, then computes the attention scores and pad the final attention scores.
+
+ Args:
+ query_states (`torch.Tensor`):
+ Input query states to be passed to Flash Attention API
+ key_states (`torch.Tensor`):
+ Input key states to be passed to Flash Attention API
+ value_states (`torch.Tensor`):
+ Input value states to be passed to Flash Attention API
+ attention_mask (`torch.Tensor`):
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+ position of padding tokens and 1 for the position of non-padding tokens.
+ query_length (`int`):
+ Length of the query sequence; used to decide causality and to re-pad the output.
+ dropout (`float`, *optional*):
+ Attention dropout
+ softmax_scale (`float`, *optional*):
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+ """
+ # Causal masking is skipped when there is a single query token.
+ causal = self.is_causal and query_length != 1
+ # A mask is present: strip padding, run the variable-length kernel, then re-pad.
+ if attention_mask is not None:
+ batch_size = query_states.shape[0]
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
+ query_states, key_states, value_states, attention_mask, query_length
+ )
+
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states,
+ key_states,
+ value_states,
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_k=cu_seqlens_k,
+ max_seqlen_q=max_seqlen_in_batch_q,
+ max_seqlen_k=max_seqlen_in_batch_k,
+ dropout_p=dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ )
+
+ # Scatter the unpadded rows back into a `(batch_size, query_length, ...)` tensor.
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+ else:
+ # No mask: call the dense kernel directly.
+ attn_output = flash_attn_func(
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+ )
+
+ return attn_output
+
+ def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+ """Remove padded positions from query/key/value ahead of the variable-length kernel.
+
+ Returns:
+ A 6-tuple `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`, with `indices_q` cast to int64.
+ """
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+ # Flatten batch and sequence into one axis, then keep only the rows selected by `indices_k`.
+ key_layer = index_first_axis(
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+ value_layer = index_first_axis(
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+
+ if query_length == kv_seq_len:
+ # Queries cover the same positions as the keys: reuse the key-side indices and lengths.
+ query_layer = index_first_axis(
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+ )
+ cu_seqlens_q = cu_seqlens_k
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
+ indices_q = indices_k
+ elif query_length == 1:
+ # One query token per batch row: cumulative lengths are simply 0..batch_size.
+ max_seqlen_in_batch_q = 1
+ cu_seqlens_q = torch.arange(
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
+ ) # There is a memcpy here, that is very bad.
+ indices_q = cu_seqlens_q[:-1]
+ query_layer = query_layer.squeeze(1)
+ else:
+ # The -q_len: slice assumes left padding.
+ attention_mask = attention_mask[:, -query_length:]
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+ return (
+ query_layer,
+ key_layer,
+ value_layer,
+ indices_q.to(torch.int64),
+ (cu_seqlens_q, cu_seqlens_k),
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+ )
+
+# Maps a `config.attn_implementation` value to the attention module class that
+# `InternLM2DecoderLayer` instantiates.
+INTERNLM2_ATTENTION_CLASSES = {
+ "eager": InternLM2Attention,
+ "flash_attention_2": InternLM2FlashAttention2,
+}
+
+# Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer
+class InternLM2DecoderLayer(nn.Module):
+ def __init__(self, config: InternLM2Config):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+
+ self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config)
+
+ self.feed_forward = InternLM2MLP(config)
+ self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ **kwargs,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*):
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+ query_sequence_length, key_sequence_length)` if default attention is used.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ residual = hidden_states
+
+ hidden_states = self.attention_norm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.attention(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ **kwargs,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.ffn_norm(hidden_states)
+ hidden_states = self.feed_forward(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+InternLM2_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`InternLM2Config`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2
+@add_start_docstrings(
+ "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2PreTrainedModel(PreTrainedModel):
+ config_class = InternLM2Config
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["InternLM2DecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+
+InternLM2_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
+ when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Modified from transformers.model.llama.modeling_llama.LlamaModel
+@add_start_docstrings(
+ "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2Model(InternLM2PreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLM2DecoderLayer`]
+
+ Args:
+ config: InternLM2Config
+ """
+
+ _auto_class = "AutoModel"
+
+ def __init__(self, config: InternLM2Config):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.config = config
+
+ self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+
+ self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ """Return the token embedding module (`tok_embeddings`)."""
+ return self.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ """Replace the token embedding module (`tok_embeddings`) with `value`."""
+ self.tok_embeddings = value
+
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ # Runs the embedding, every decoder layer and the final norm. Returns either a
+ # `BaseModelOutputWithPast` or, when `return_dict` is false, a tuple of the non-None
+ # values among (hidden_states, cache, all_hidden_states, all_self_attns).
+ # Raises ValueError when both or neither of `input_ids` / `inputs_embeds` are given.
+
+ # Any flag left as None falls back to the value stored on the config.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.config.attn_implementation == "flash_attention_2":
+ _import_flash_attn()
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape[:2]
+ elif inputs_embeds is not None:
+ batch_size, seq_length = inputs_embeds.shape[:2]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ # Cached length is read from dim 2 of the first layer's first cached tensor.
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ # Default positions continue counting from the end of the cached prefix.
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.tok_embeddings(input_ids)
+
+ if self.config.attn_implementation == "flash_attention_2":
+ # 2d mask is passed through the layers
+ # It is dropped entirely (set to None) when it contains no zero entry.
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+ else:
+ # Eager path: default to an all-ones mask, then build the combined 4-D mask.
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ # embed positions
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ # Accumulators stay None when the corresponding output was not requested.
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ # Record the input of each layer; the final (normed) output is appended after the loop.
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ # Wrapper that appends `output_attentions` and a trailing None after the
+ # positional tensors passed to `checkpoint`.
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # None for past_key_value
+ return module(*inputs, output_attentions, None)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ None,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ # The cache entry sits after the attention weights when those are also returned.
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+# Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM
+class InternLM2ForCausalLM(InternLM2PreTrainedModel):
+ _auto_class = "AutoModelForCausalLM"
+
+ _tied_weights_keys = ["output.weight"]
+
+ def __init__(self, config):
+ """Wrap an `InternLM2Model` with a bias-free linear head projecting hidden states to vocabulary size."""
+ super().__init__(config)
+ self.model = InternLM2Model(config)
+ self.vocab_size = config.vocab_size
+ # Language-model head; the attribute is named `output` (see `_tied_weights_keys`).
+ self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ """Return the wrapped model's token embedding module."""
+ return self.model.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ """Replace the wrapped model's token embedding module with `value`."""
+ self.model.tok_embeddings = value
+
+ def get_output_embeddings(self):
+ """Return the output projection (`self.output`)."""
+ return self.output
+
+ def set_output_embeddings(self, new_embeddings):
+ """Replace the output projection (`self.output`) with `new_embeddings`."""
+ self.output = new_embeddings
+
+ def set_decoder(self, decoder):
+ """Replace the wrapped decoder (`self.model`) with `decoder`."""
+ self.model = decoder
+
+ def get_decoder(self):
+ """Return the wrapped decoder (`self.model`)."""
+ return self.model
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, InternLM2ForCausalLM
+
+ >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+
+ # Flags left as None fall back to the config. `use_cache` is passed through as-is and
+ # resolved inside the wrapped model.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ # Index 0 is the final hidden state for both the tuple and the dataclass output.
+ hidden_states = outputs[0]
+ logits = self.output(hidden_states)
+ # Logits are always returned in float32.
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ # Tuple output: (loss?, logits, *remaining model outputs); loss is omitted when None.
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values is not None:
+ past_length = past_key_values[0][0].shape[2]
+
+ # Some generation methods already pass only the last input ID
+ if input_ids.shape[1] > past_length:
+ remove_prefix_length = past_length
+ else:
+ # Default to old behavior: keep only final ID
+ remove_prefix_length = input_ids.shape[1] - 1
+
+ input_ids = input_ids[:, remove_prefix_length:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -input_ids.shape[1] :]
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+ def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
+ prompt = ""
+ if meta_instruction:
+ prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
+ else:
+ prompt += ""
+ for record in history:
+ prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
+ prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
+ return tokenizer([prompt], return_tensors="pt")
+
+ @torch.no_grad()
+ def chat(
+ self,
+ tokenizer,
+ query: str,
+ history: List[Tuple[str, str]] = [],
+ streamer: Optional[BaseStreamer] = None,
+ max_new_tokens: int = 1024,
+ do_sample: bool = True,
+ temperature: float = 0.8,
+ top_p: float = 0.8,
+ meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+ "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+ "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.",
+ **kwargs,
+ ):
+ inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
+ inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
+ # also add end-of-assistant token in eos token id to avoid unnecessary generation
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
+ outputs = self.generate(
+ **inputs,
+ streamer=streamer,
+ max_new_tokens=max_new_tokens,
+ do_sample=do_sample,
+ temperature=temperature,
+ top_p=top_p,
+ eos_token_id=eos_token_id,
+ **kwargs,
+ )
+ outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
+ response = tokenizer.decode(outputs, skip_special_tokens=True)
+ response = response.split("<|im_end|>")[0]
+ history = history + [(query, response)]
+ return response, history
+
+    @torch.no_grad()
+    def stream_chat(
+        self,
+        tokenizer,
+        query: str,
+        history: List[Tuple[str, str]] = [],
+        max_new_tokens: int = 1024,
+        do_sample: bool = True,
+        temperature: float = 0.8,
+        top_p: float = 0.8,
+        **kwargs,
+    ):
+        """
+        Stream a chat response while it is being generated.
+
+        Generation runs in a background thread through `self.chat`. A streamer attached to that
+        call pushes every partial result into a bounded queue, and the generator returned here
+        drains that queue.
+
+        Args:
+            tokenizer: Tokenizer used to decode generated token ids; also forwarded to `self.chat`.
+            query: The user message for this turn.
+            history: Earlier `(query, response)` pairs. The list is never mutated here; a new
+                list is built for every yielded item.
+            max_new_tokens, do_sample, temperature, top_p, **kwargs: Forwarded unchanged to
+                `self.chat`.
+
+        Returns:
+            A generator yielding `(response, history)` tuples. The first item carries an empty
+            response; later items carry the response accumulated so far, e.g.
+            ('Hello, how can I help you', [('Hello', 'Hello, how can I help you')])
+            ('Hello, how can I help you?', [('Hello', 'Hello, how can I help you?')])
+
+        Raises:
+            ModuleNotFoundError: If `BaseStreamer` is `None` (raised when this method is called).
+            Exception: Any exception raised by the background `self.chat` call is re-raised
+                from the generator instead of leaving the consumer blocked on the queue.
+        """
+        if BaseStreamer is None:
+            raise ModuleNotFoundError(
+                "The version of `transformers` is too low. Please make sure "
+                "that you have installed `transformers>=4.28.0`."
+            )
+
+        # Bounded so the producer thread blocks instead of running far ahead of the consumer.
+        response_queue = queue.Queue(maxsize=20)
+
+        class ChatStreamer(BaseStreamer):
+            """Streamer that turns each generated token into a `(response, history)` queue item."""
+
+            def __init__(self, tokenizer) -> None:
+                super().__init__()
+                self.tokenizer = tokenizer
+                self.queue = response_queue
+                self.query = query
+                self.history = history
+                self.response = ""
+                self.received_inputs = False
+                # Emit an initial item with an empty response before any token arrives.
+                self.queue.put((self.response, history + [(self.query, self.response)]))
+
+            def put(self, value):
+                if len(value.shape) > 1 and value.shape[0] > 1:
+                    raise ValueError("ChatStreamer only supports batch size 1")
+                elif len(value.shape) > 1:
+                    value = value[0]
+
+                if not self.received_inputs:
+                    # The first received value is input_ids, ignore here
+                    self.received_inputs = True
+                    return
+
+                token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
+                if token.strip() != "<|im_end|>":
+                    self.response = self.response + token
+                    history = self.history + [(self.query, self.response)]
+                    self.queue.put((self.response, history))
+
+            def end(self):
+                # `None` is the end-of-stream sentinel understood by `consumer`.
+                self.queue.put(None)
+
+        def stream_producer():
+            try:
+                return self.chat(
+                    tokenizer=tokenizer,
+                    query=query,
+                    streamer=ChatStreamer(tokenizer=tokenizer),
+                    history=history,
+                    max_new_tokens=max_new_tokens,
+                    do_sample=do_sample,
+                    temperature=temperature,
+                    top_p=top_p,
+                    **kwargs,
+                )
+            except Exception as exc:
+                # Without this, a failure before `end()` would never enqueue the sentinel and
+                # the consumer would wait on the queue forever. Hand the error over instead.
+                response_queue.put(exc)
+
+        def consumer():
+            producer = threading.Thread(target=stream_producer)
+            producer.start()
+            while True:
+                res = response_queue.get()
+                if res is None:
+                    return
+                if isinstance(res, Exception):
+                    # Surface the producer thread's failure to the caller.
+                    raise res
+                yield res
+
+        return consumer()
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
+@add_start_docstrings(
+    """
+    The InternLM2 Model transformer with a sequence classification head on top (linear layer).
+
+    [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification,
+    as other causal models (e.g. GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    InternLM2_START_DOCSTRING,
+)
+class InternLM2ForSequenceClassification(InternLM2PreTrainedModel):
+    # Backbone (`self.model`) plus a bias-free linear head (`self.score`) that maps each hidden
+    # state to `num_labels` logits; `forward` keeps only the logits of one position per row.
+
+    def __init__(self, config):
+        """Build the backbone and the classification head from `config`.
+
+        Reads `config.num_labels` and `config.hidden_size`.
+        """
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = InternLM2Model(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the backbone's token embedding module (`self.model.tok_embeddings`)."""
+        return self.model.tok_embeddings
+
+    def set_input_embeddings(self, value):
+        """Replace the backbone's token embedding module with `value`."""
+        self.model.tok_embeddings = value
+
+    @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        # Score every position; a single position per row is selected below.
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            # No way to locate padding: take the last position of each row.
+            sequence_lengths = -1
+        elif input_ids is not None:
+            # Index of the token just before the first pad token in each row.
+            sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+            # A row without any pad token gives argmax == 0, i.e. -1 here. The modulo maps that
+            # to the explicit last index so the gather below never depends on negative indexing
+            # (upstream transformers does the same, citing ONNX export compatibility).
+            sequence_lengths = sequence_lengths % input_ids.shape[-1]
+            sequence_lengths = sequence_lengths.to(logits.device)
+        else:
+            # Padding cannot be detected from `inputs_embeds`: take the last position.
+            sequence_lengths = -1
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                # Infer the problem type once from the label count and dtype, and store it on
+                # the config so later calls reuse it.
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            # Tuple output: (loss?, pooled_logits, *remaining backbone outputs).
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
diff --git a/triton_models/preprocessing/1/tokenizer/placeholder b/triton_models/preprocessing/1/tokenizer/placeholder
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/triton_models/preprocessing/1/tokenizer/pytorch_model.bin.index.json b/triton_models/preprocessing/1/tokenizer/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..7d95cf180df4c423e817c55f30f5ce93ac80e220
--- /dev/null
+++ b/triton_models/preprocessing/1/tokenizer/pytorch_model.bin.index.json
@@ -0,0 +1,554 @@
+{
+ "metadata": {
+ "total_size": 5251801088
+ },
+ "weight_map": {
+ "model.layers.0.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.2.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.20.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.3.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.30.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.4.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.tok_embeddings.weight": "pytorch_model-00001-of-00003.bin",
+ "output.weight": "pytorch_model-00003-of-00003.bin"
+ }
+}
diff --git a/triton_models/preprocessing/1/tokenizer/special_tokens_map.json b/triton_models/preprocessing/1/tokenizer/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..492d4b2966a1763442d426d880dbc29f94906e4c
--- /dev/null
+++ b/triton_models/preprocessing/1/tokenizer/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/triton_models/preprocessing/1/tokenizer/tokenization_internlm.py b/triton_models/preprocessing/1/tokenizer/tokenization_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9792349c7fed6fc64476eabdd9dad7a84640c3ee
--- /dev/null
+++ b/triton_models/preprocessing/1/tokenizer/tokenization_internlm.py
@@ -0,0 +1,240 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization classes for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging
+
+# Module-level logger obtained through the `transformers` logging helpers.
+logger = logging.get_logger(__name__)
+
+# File name used for the SentencePiece model; `save_vocabulary` appends it
+# to the target directory when writing the vocabulary out.
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+
+# No pretrained vocabulary locations are registered for this tokenizer.
+PRETRAINED_VOCAB_FILES_MAP = {}
+
+
+class InternLMTokenizer(PreTrainedTokenizer):
+    """
+    Construct an InternLM tokenizer backed by a SentencePiece model.
+
+    Args:
+        vocab_file (`str`):
+            Path to the SentencePiece model file; it is loaded in `__init__`.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            Unknown token, forwarded to `PreTrainedTokenizer`.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            Beginning-of-sequence token, forwarded to `PreTrainedTokenizer`.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            End-of-sequence token, forwarded to `PreTrainedTokenizer`.
+        pad_token (`str`, *optional*, defaults to `"</s>"`):
+            Padding token, forwarded to `PreTrainedTokenizer`.
+        sp_model_kwargs (`Dict[str, Any]`, *optional*):
+            Keyword arguments passed to `spm.SentencePieceProcessor`.
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether `build_inputs_with_special_tokens` prepends the BOS id.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether `build_inputs_with_special_tokens` appends the EOS id.
+        decode_with_prefix_space (`bool`, *optional*, defaults to `False`):
+            Stored on the instance; not read anywhere else in this class.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Forwarded to `PreTrainedTokenizer`.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    model_input_names = ["input_ids", "attention_mask"]
+    _auto_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="</s>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.decode_with_prefix_space = decode_with_prefix_space
+        # The SentencePiece model is loaded before the base-class constructor
+        # runs; presumably the base class may query the vocabulary during
+        # its own initialisation, so keep this ordering.
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        self._no_prefix_space_tokens = None
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    @property
+    def no_prefix_space_tokens(self):
+        """Ids of vocabulary pieces that do not start with "▁" (built lazily, then cached)."""
+        if self._no_prefix_space_tokens is None:
+            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
+        return self._no_prefix_space_tokens
+
+    @property
+    def vocab_size(self):
+        """Returns the number of pieces in the SentencePiece model."""
+        return self.sp_model.get_piece_size()
+
+    @property
+    def bos_token_id(self) -> Optional[int]:
+        """Beginning-of-sequence id as reported by the SentencePiece model."""
+        return self.sp_model.bos_id()
+
+    @property
+    def eos_token_id(self) -> Optional[int]:
+        """End-of-sequence id as reported by the SentencePiece model."""
+        return self.sp_model.eos_id()
+
+    def get_vocab(self):
+        """Returns the vocabulary as a `{token: id}` dict, including added tokens."""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text):
+        """Splits `text` into SentencePiece pieces (a list of strings)."""
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)
+
+    def _maybe_add_prefix_space(self, tokens, decoded):
+        """Prepend a space to `decoded` unless `tokens[0]` is in `no_prefix_space_tokens`."""
+        # NOTE(review): `convert_tokens_to_string` passes string tokens here
+        # while `no_prefix_space_tokens` holds integer ids, so the membership
+        # test looks like it can never match for that caller -- confirm
+        # before changing, because the caller strips one leading character
+        # afterwards and relies on the space having been added.
+        if tokens and tokens[0] not in self.no_prefix_space_tokens:
+            return " " + decoded
+        return decoded
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings) into a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        # Flush whatever regular pieces follow the last special token.
+        out_string += self.sp_model.decode(current_sub_tokens)
+        out_string = self.clean_up_tokenization(out_string)
+        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+        # Drop the first character (the space added by the helper above).
+        return out_string[1:]
+
+    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Save the vocabulary file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*):
+                Prefix joined to the vocabulary file name with a "-".
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved, or `None` (after logging an error) when
+            `save_directory` is not a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            # The source file is gone: serialise the in-memory model instead.
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs as `[BOS] + token_ids_0 (+ token_ids_1) + [EOS]`.
+
+        BOS is included only when `add_bos_token` is set and EOS only when `add_eos_token` is set.
+        """
+        if self.add_bos_token:
+            bos_token_ids = [self.bos_token_id]
+        else:
+            bos_token_ids = []
+
+        output = bos_token_ids + token_ids_0
+
+        if token_ids_1 is not None:
+            output = output + token_ids_1
+
+        if self.add_eos_token:
+            output = output + [self.eos_token_id]
+
+        return output
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve the special-token mask for a token list that has no special tokens added yet.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        # Mirror `build_inputs_with_special_tokens` exactly (optional BOS, all
+        # sequence tokens, optional EOS) so the mask always has the same
+        # length as the ids that method produces.
+        mask = [1] if self.add_bos_token else []
+        mask += [0] * len(token_ids_0)
+        if token_ids_1 is not None:
+            mask += [0] * len(token_ids_1)
+        if self.add_eos_token:
+            mask.append(1)
+        return mask
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create token type ids for the given sequence(s). This tokenizer does not distinguish segments, so a list
+        of zeros is returned.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of zeros, one per id produced by `build_inputs_with_special_tokens`.
+        """
+        # Size the result from the real built input so it stays aligned with
+        # the `add_bos_token` / `add_eos_token` settings.
+        return [0] * len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))
diff --git a/triton_models/preprocessing/1/tokenizer/tokenizer.model b/triton_models/preprocessing/1/tokenizer/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408
--- /dev/null
+++ b/triton_models/preprocessing/1/tokenizer/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
+size 1477754
diff --git a/triton_models/preprocessing/1/tokenizer/tokenizer.py b/triton_models/preprocessing/1/tokenizer/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..db936a5501cb07d33d56083656dbd734ba7431bf
--- /dev/null
+++ b/triton_models/preprocessing/1/tokenizer/tokenizer.py
@@ -0,0 +1,400 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os
+import os.path as osp
+from collections import deque
+from typing import List, Optional, Sequence, Union
+
+import torch
+
+from lmdeploy.utils import get_logger
+
+# This file will be copied to the Triton server, so make sure every
+# import starts from the package root `lmdeploy`.
+
+
+class SentencePieceTokenizer:
+    """Tokenizer of sentencepiece.
+
+    Args:
+        model_file (str): the path of the tokenizer model
+    """
+
+    def __init__(self, model_file: str):
+        from sentencepiece import SentencePieceProcessor
+        self.model = SentencePieceProcessor(model_file=model_file)
+        # lazily-built set of ids whose piece starts with '▁'
+        self._prefix_space_tokens = None
+        # for stop words
+        self._maybe_decode_bytes: Optional[bool] = None
+        # TODO maybe lack a constant.py
+        # cache of the latest (token, indexes) lookups
+        self._indexes_tokens_deque = deque(maxlen=10)
+        self.max_indexes_num = 5
+        self.logger = get_logger('lmdeploy')
+
+    @property
+    def vocab_size(self):
+        """vocabulary size."""
+        return self.model.vocab_size()
+
+    @property
+    def bos_token_id(self):
+        """beginning of the sentence token id."""
+        return self.model.bos_id()
+
+    @property
+    def eos_token_id(self):
+        """end of the sentence token id."""
+        return self.model.eos_id()
+
+    @property
+    def prefix_space_tokens(self):
+        """ids of the tokens that start with the prefix space marker '▁'."""
+        if self._prefix_space_tokens is None:
+            vocab = self.model.IdToPiece(list(range(self.vocab_size)))
+            self._prefix_space_tokens = {
+                i
+                for i, tok in enumerate(vocab) if tok.startswith('▁')
+            }
+        return self._prefix_space_tokens
+
+    def _maybe_add_prefix_space(self, tokens, decoded):
+        """maybe add prefix space for incremental decoding."""
+        if len(tokens) and not decoded.startswith(' ') and\
+                tokens[0] in self.prefix_space_tokens:
+            return ' ' + decoded
+        else:
+            return decoded
+
+    def indexes_containing_token(self, token: str):
+        """Return all the possible indexes, whose decoding output may contain
+        the input token.
+
+        Args:
+            token (str): the token (e.g. a stop word) to look up
+        Returns:
+            list[int]: matching vocabulary indexes. When more than
+                ``max_indexes_num`` pieces match, only the last id of the
+                encoded token is returned and a warning is logged.
+        """
+        # traversing vocab is time consuming, can not be accelerated with
+        # multi threads (computation) or multi process (can't pickle tokenizer)
+        # so, we maintain latest 10 stop words and return directly if matched
+        for _token, _indexes in self._indexes_tokens_deque:
+            if token == _token:
+                return _indexes
+        # ' ' is special: it is searched as '▁'. The cache entry is stored
+        # under the caller's original token so that a repeated lookup of ' '
+        # hits the cache instead of traversing the vocab again.
+        piece = '▁' if token == ' ' else token
+        vocab = self.model.IdToPiece(list(range(self.vocab_size)))
+        indexes = [i for i, voc in enumerate(vocab) if piece in voc]
+        if len(indexes) > self.max_indexes_num:
+            indexes = self.encode(piece, add_bos=False)[-1:]
+            self.logger.warning(
+                f'There are too many(>{self.max_indexes_num}) possible '
+                f'indexes may decoding {piece}, we will use {indexes} only')
+        self._indexes_tokens_deque.append((token, indexes))
+        return indexes
+
+    def encode(self, s: str, add_bos: bool = True, **kwargs):
+        """Tokenize a prompt.
+
+        Args:
+            s (str): a prompt
+            add_bos (bool): forwarded to the sentencepiece ``Encode`` call
+        Returns:
+            list[int]: token ids
+        """
+        return self.model.Encode(s, add_bos=add_bos, **kwargs)
+
+    def decode(self, t: Sequence[int], offset: Optional[int] = None):
+        """De-tokenize.
+
+        Args:
+            t (List[int]): a list of token ids
+            offset (int): for incrementally decoding. Default to None, which
+                means not applied.
+        Returns:
+            str: text of decoding tokens
+        """
+        if isinstance(t, torch.Tensor):
+            t = t.tolist()
+        # slicing with ``None`` keeps the whole sequence
+        t = t[offset:]
+        out_string = self.model.Decode(t)
+        if offset:
+            out_string = self._maybe_add_prefix_space(t, out_string)
+        return out_string
+
+    def __call__(self, s: Union[str, Sequence[str]]):
+        """Tokenize prompts.
+
+        Args:
+            s (str): prompts
+        Returns:
+            addict.Addict: an object whose ``input_ids`` holds the token ids;
+                neither bos nor eos is added
+        """
+        import addict
+        add_bos = False
+        add_eos = False
+
+        input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
+        return addict.Addict(input_ids=input_ids)
+
+
class HuggingFaceTokenizer:
    """Tokenizer backed by a HuggingFace ``transformers`` tokenizer.

    The wrapped tokenizer is loaded with ``AutoTokenizer.from_pretrained``
    and exposed through the same ``encode`` / ``decode`` / ``__call__``
    surface that the sibling tokenizer classes in this file offer.

    Args:
        model_dir (str): the directory of the tokenizer model
    """

    def __init__(self, model_dir: str):
        from transformers import AutoTokenizer
        model_file = osp.join(model_dir, 'tokenizer.model')
        backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
        model_file_exists = osp.exists(model_file)
        self.logger = get_logger('lmdeploy')
        if not osp.exists(backend_tokenizer_file) and model_file_exists:
            self.logger.warning(
                'Can not find tokenizer.json. '
                'It may take long time to initialize the tokenizer.')
        self.model = AutoTokenizer.from_pretrained(model_dir,
                                                   trust_remote_code=True)
        self._prefix_space_tokens = None
        # save tokenizer.json to reuse
        if not osp.exists(backend_tokenizer_file) and model_file_exists:
            if hasattr(self.model, 'backend_tokenizer'):
                # only write when the directory is writable
                if os.access(model_dir, os.W_OK):
                    self.model.backend_tokenizer.save(backend_tokenizer_file)

        # fall back to other sources when the tokenizer has no eos id
        if self.model.eos_token_id is None:
            generation_config_file = osp.join(model_dir,
                                              'generation_config.json')
            if osp.exists(generation_config_file):
                # JSON is UTF-8 by specification; do not depend on the locale
                with open(generation_config_file, 'r',
                          encoding='utf-8') as f:
                    cfg = json.load(f)
                    self.model.eos_token_id = cfg['eos_token_id']
            elif hasattr(self.model, 'eod_id'):  # Qwen remote
                self.model.eos_token_id = self.model.eod_id

        # for stop words
        self._vocab_size_with_added: Optional[int] = None
        self._maybe_decode_bytes: Optional[bool] = None
        # TODO maybe lack a constant.py
        # cache of the most recent (token, indexes) lookups
        self._indexes_tokens_deque = deque(maxlen=10)
        self.max_indexes_num = 5
        # lazily built map from token text to token id
        self.token2id = {}

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size

    @property
    def vocab_size_with_added(self):
        """vocabulary size with added vocab (computed once, then cached)."""
        if self._vocab_size_with_added is not None:
            return self._vocab_size_with_added
        self._vocab_size_with_added = len(self.model.get_vocab())
        return self._vocab_size_with_added

    @property
    def bos_token_id(self):
        """begin of the sentence token id."""
        return self.model.bos_token_id

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_token_id

    @property
    def prefix_space_tokens(self):
        """ids of the tokens whose text starts with the prefix-space marker
        (``'▁'`` for str tokens, ``b' '`` for bytes tokens)."""
        if self._prefix_space_tokens is None:
            vocab = self.model.convert_ids_to_tokens(
                list(range(self.vocab_size)))
            self._prefix_space_tokens = {
                i
                for i, tok in enumerate(vocab)
                if tok.startswith('▁' if isinstance(tok, str) else b' ')
            }
        return self._prefix_space_tokens

    def _maybe_add_prefix_space(self, tokens: List[int], decoded: str):
        """maybe add prefix space for incremental decoding.

        A space is prepended when ``tokens`` is non-empty, ``decoded`` does
        not already start with one, and the first token is one of
        ``prefix_space_tokens``.
        """
        if len(tokens) and not decoded.startswith(' ') and\
                tokens[0] in self.prefix_space_tokens:
            return ' ' + decoded
        else:
            return decoded

    @property
    def maybe_decode_bytes(self):
        """Check if self.model.convert_ids_to_tokens return not a str value."""
        if self._maybe_decode_bytes is None:
            vocab = self.model.convert_ids_to_tokens(
                list(range(self.vocab_size)))
            self._maybe_decode_bytes = any(not isinstance(tok, str)
                                           for tok in vocab)
        return self._maybe_decode_bytes

    def indexes_containing_token(self, token: str):
        """Return all the possible indexes, whose decoding output may contain
        the input token.

        Args:
            token (str): the text to look for, e.g. a stop word
        Returns:
            list[int]: candidate token ids; empty when the token cannot be
                mapped to ids usable as a stop word
        """
        # traversing vocab is time consuming, can not be accelerated with
        # multi threads (computation) or multi process (can't pickle tokenizer)
        # so, we maintain latest 10 stop words and return directly if matched
        for _token, _indexes in self._indexes_tokens_deque:
            if token == _token:
                return _indexes

        # The cache is searched with the caller's spelling, so it must be
        # stored under that spelling too; `token` may be rewritten below.
        cache_key = token

        if not self.token2id:
            # decode is slower than convert_ids_to_tokens
            if self.maybe_decode_bytes:
                self.token2id = {
                    self.model.decode(i): i
                    for i in range(self.vocab_size)
                }
            else:
                self.token2id = {
                    self.model.convert_ids_to_tokens(i): i
                    for i in range(self.vocab_size)
                }
        if token == ' ':  # ' ' is special
            token = '▁'
        indexes = [i for _token, i in self.token2id.items() if token in _token]
        if len(indexes) > self.max_indexes_num:
            indexes = self.encode(token, add_bos=False)[-1:]
            self.logger.warning(
                f'There are too many(>{self.max_indexes_num}) possible '
                f'indexes may decoding {token}, we will use {indexes} only')
        # there might be token id that exceeds self.vocab_size
        if len(indexes) == 0:
            indexes = self.encode(token, False)
            if len(indexes) != 1:
                self.logger.warning(
                    f'The token {token}, its length of indexes {indexes} is '
                    'not 1. Currently, it can not be used as stop words')
                indexes = []
        self._indexes_tokens_deque.append((cache_key, indexes))
        return indexes

    def encode(self, s: str, add_bos: bool = True, **kwargs):
        """Tokenize a prompt.

        Args:
            s (str): a prompt
            add_bos (bool): when False, a leading BOS id produced by the
                wrapped tokenizer is dropped
            **kwargs: forwarded to ``self.model.encode``
        Returns:
            list[int]: token ids
        """
        encoded = self.model.encode(s, **kwargs)
        if not add_bos:
            # in the middle of a session
            if len(encoded) and encoded[0] == self.bos_token_id:
                encoded = encoded[1:]
        return encoded

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): for incrementally decoding. Default to None, which
                means not applied.
        Returns:
            str: text of decoding tokens
        """
        skip_special_tokens = True
        t = t[offset:]
        out_string = self.model.decode(t,
                                       skip_special_tokens=skip_special_tokens)
        if offset:
            # restore the leading space when decoding a continuation
            out_string = self._maybe_add_prefix_space(t, out_string)
        return out_string

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts without adding special tokens.

        Args:
            s (str | Sequence[str]): prompts
        Returns:
            the value returned by calling the wrapped tokenizer with
            ``add_special_tokens=False``
        """
        add_special_tokens = False
        return self.model(s, add_special_tokens=add_special_tokens)
+
+
class Tokenizer:
    """Facade that picks a concrete tokenizer and forwards every call to it.

    Args:
        model_file (str): either the path of a ``*.model`` tokenizer file or
            the folder that contains the tokenizer files
    """

    def __init__(self, model_file: str):
        points_at_model = model_file.endswith('.model')
        if points_at_model:
            model_folder = osp.dirname(model_file)
        else:
            # a folder was given: look for the conventional file name in it
            model_folder = model_file
            model_file = osp.join(model_folder, 'tokenizer.model')
        tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')

        has_model_file = osp.exists(model_file)
        has_config = osp.exists(tokenizer_config_file)
        self.logger = get_logger('lmdeploy')
        # the HF tokenizer wins when a tokenizer_config.json is present or
        # when there is no tokenizer.model to load
        if has_config or not has_model_file:
            self.model = HuggingFaceTokenizer(model_folder)
        else:
            self.model = SentencePieceTokenizer(model_file)

    @property
    def vocab_size(self):
        """Vocabulary size reported by the wrapped tokenizer."""
        return self.model.vocab_size

    @property
    def bos_token_id(self):
        """Begin-of-sentence token id of the wrapped tokenizer."""
        return self.model.bos_token_id

    @property
    def eos_token_id(self):
        """End-of-sentence token id of the wrapped tokenizer."""
        return self.model.eos_token_id

    def encode(self, s: str, add_bos: bool = True, **kwargs):
        """Turn a prompt into token ids.

        Args:
            s (str): a prompt
            add_bos (bool): forwarded positionally to the wrapped tokenizer
            **kwargs: forwarded to the wrapped tokenizer
        Returns:
            list[int]: token ids
        """
        token_ids = self.model.encode(s, add_bos, **kwargs)
        return token_ids

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """Turn token ids back into text.

        Args:
            t (List[int]): a list of token ids
            offset (int): for incrementally decoding. Default to None, which
                means not applied.
        Returns:
            str: text of decoding tokens
        """
        text = self.model.decode(t, offset)
        return text

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts by calling the wrapped tokenizer.

        Args:
            s (str | Sequence[str]): prompts
        Returns:
            whatever the wrapped tokenizer returns when called with ``s``
        """
        return self.model(s)

    def indexes_containing_token(self, token):
        """Return all the possible indexes, whose decoding output may contain
        the input token.

        A token that encodes to more than one id is rejected with a warning
        and an empty list.
        """
        encoded = self.encode(token, add_bos=False)
        if len(encoded) <= 1:
            return self.model.indexes_containing_token(token)
        self.logger.warning(
            f'The token {token}, its length of indexes {encoded} is over '
            'than 1. Currently, it can not be used as stop words')
        return []
diff --git a/triton_models/preprocessing/1/tokenizer/tokenizer_config.json b/triton_models/preprocessing/1/tokenizer/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f133449013be570f08fdf7c70f1a2c8ccb4724da
--- /dev/null
+++ b/triton_models/preprocessing/1/tokenizer/tokenizer_config.json
@@ -0,0 +1,90 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92538": {
+ "content": "<|plugin|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92539": {
+ "content": "<|interpreter|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92540": {
+ "content": "<|action_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92541": {
+ "content": "<|action_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92542": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92543": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_internlm.InternLMTokenizer",
+ null
+ ]
+ },
+ "bos_token": "<s>",
+ "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "tokenizer_class": "InternLMTokenizer",
+ "unk_token": "<unk>"
+}
diff --git a/triton_models/preprocessing/config.pbtxt b/triton_models/preprocessing/config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..997ba399ba04f1f521bdbf088815d1dd3c26f696
--- /dev/null
+++ b/triton_models/preprocessing/config.pbtxt
@@ -0,0 +1,37 @@
+name: "preprocessing"
+backend: "python"
+max_batch_size: 1
+
+input [
+ {
+ name: "QUERY"
+ data_type: TYPE_STRING
+ dims: [ -1 ]
+ }
+]
+output [
+ {
+ name: "INPUT_ID"
+ data_type: TYPE_UINT32
+ dims: [ -1 ]
+ },
+ {
+ name: "REQUEST_INPUT_LEN"
+ data_type: TYPE_UINT32
+ dims: [ 1 ]
+ }
+]
+
+instance_group [
+ {
+ count: 4
+ kind: KIND_CPU
+ }
+]
+
+parameters {
+ key: "tokenizer_path"
+ value: {
+ string_value: "tokenizer/tokenizer.model"
+ }
+}
diff --git a/triton_models/tokenizer/config.json b/triton_models/tokenizer/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..84235b8a1a9618cc0ac265caf61ea4088780e3b1
--- /dev/null
+++ b/triton_models/tokenizer/config.json
@@ -0,0 +1,37 @@
+{
+ "_name_or_path": "/root/psy/internlm2-7b/work_dirs/internlm2_chat_7b_qlora_oasst1_512_e3_copy/hf_2/merge",
+ "architectures": [
+ "InternLM2ForCausalLM"
+ ],
+ "attn_implementation": "eager",
+ "auto_map": {
+ "AutoConfig": "configuration_internlm.InternLMConfig",
+ "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
+ "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
+ },
+ "bias": false,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "fp16": true,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 32768,
+ "model_type": "internlm",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 8,
+ "pad_token_id": 2,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 2.0,
+ "type": "dynamic"
+ },
+ "rope_theta": 1000000,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.37.2",
+ "use_cache": false,
+ "vocab_size": 92544
+}
diff --git a/triton_models/tokenizer/configuration_internlm.py b/triton_models/tokenizer/configuration_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d013582feaa1f9970a4256c4a0f77000fa645de
--- /dev/null
+++ b/triton_models/tokenizer/configuration_internlm.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" InternLM model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
class InternLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
    an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the InternLM-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 103168):
            Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`InternLMModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id, forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning-of-sequence token id, forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            End-of-sequence token id, forwarded to [`PretrainedConfig`].
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        bias (`bool`, *optional*, defaults to `True`):
            Stored as `config.bias`.
        rope_theta (`float`, *optional*, defaults to 10000):
            Stored as `config.rope_theta`.
        rope_scaling (`dict`, *optional*):
            Either `None` or a dictionary with exactly two fields: `type` (`"linear"` or `"dynamic"`) and
            `factor` (a number >= 1). Validated at construction time.
        attn_implementation (`str`, *optional*, defaults to `"eager"`):
            Stored as `config.attn_implementation`; `None` is replaced by `"eager"`.
        Example:

    ```python
    >>> from transformers import InternLMModel, InternLMConfig

    >>> # Initializing a InternLM internlm-7b style configuration
    >>> configuration = InternLMConfig()

    >>> # Initializing a model from the internlm-7b style configuration
    >>> model = InternLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "internlm"
    _auto_class = "AutoConfig"

    def __init__(  # pylint: disable=W0102
        self,
        vocab_size=103168,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        bias=True,
        rope_theta=10000,
        rope_scaling=None,
        attn_implementation="eager",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.bias = bias

        # without an explicit value every attention head gets its own kv head
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # fail early on a malformed `rope_scaling`
        self._rope_scaling_validation()

        self.attn_implementation = attn_implementation
        if self.attn_implementation is None:
            self.attn_implementation = "eager"
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: if `rope_scaling` is neither `None` nor a two-field dict, if its `type` is not
                `"linear"` or `"dynamic"`, or if its `factor` is not a number >= 1.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        # Integer factors such as 2 are legitimate (JSON writers may drop the ".0");
        # bool is excluded explicitly because it is a subclass of int.
        factor_is_number = isinstance(rope_scaling_factor, (int, float)) and not isinstance(
            rope_scaling_factor, bool
        )
        if not factor_is_number or rope_scaling_factor < 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor}")
diff --git a/triton_models/tokenizer/generation_config.json b/triton_models/tokenizer/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc5efeadd3bf2caa4462a3be79d580690f410668
--- /dev/null
+++ b/triton_models/tokenizer/generation_config.json
@@ -0,0 +1,7 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 2,
+ "transformers_version": "4.37.2"
+}
diff --git a/triton_models/tokenizer/modeling_internlm2.py b/triton_models/tokenizer/modeling_internlm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..39d6f71d2933385988ec05f845d3f6386c97f74b
--- /dev/null
+++ b/triton_models/tokenizer/modeling_internlm2.py
@@ -0,0 +1,1385 @@
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/modeling_llama.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch InternLM2 model."""
+import math
+import queue
+import threading
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from einops import rearrange
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPast,
+ CausalLMOutputWithPast,
+ SequenceClassifierOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+
+try:
+ from transformers.generation.streamers import BaseStreamer
+except: # noqa # pylint: disable=bare-except
+ BaseStreamer = None
+
+from .configuration_internlm import InternLMConfig as InternLM2Config
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "InternLM2Config"
+
+flash_attn_func, flash_attn_varlen_func = None, None
+pad_input, index_first_axis, unpad_input = None, None, None
def _import_flash_attn():
    """Import flash-attn on demand and publish its entry points as module globals.

    On success the module-level names ``flash_attn_func``,
    ``flash_attn_varlen_func``, ``pad_input``, ``index_first_axis`` and
    ``unpad_input`` are rebound to the imported callables.

    Raises:
        ImportError: if the imports from ``flash_attn`` fail.
    """
    global flash_attn_func, flash_attn_varlen_func
    global pad_input, index_first_axis, unpad_input
    try:
        from flash_attn import flash_attn_func as fa_func
        from flash_attn import flash_attn_varlen_func as fa_varlen_func
        from flash_attn.bert_padding import index_first_axis as fa_index_first_axis
        from flash_attn.bert_padding import pad_input as fa_pad_input
        from flash_attn.bert_padding import unpad_input as fa_unpad_input
    except ImportError:
        raise ImportError("flash_attn is not installed.")
    else:
        flash_attn_func = fa_func
        flash_attn_varlen_func = fa_varlen_func
        pad_input = fa_pad_input
        index_first_axis = fa_index_first_axis
        unpad_input = fa_unpad_input
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    """Summarise an attention mask for unpadded (variable-length) processing.

    Returns a 3-tuple:
        * positions of the non-zero entries in the flattened mask,
        * cumulative sums (int32) of the per-row mask sums with a leading 0,
        * the largest per-row mask sum as a Python number.
    """
    tokens_per_row = attention_mask.sum(dim=-1, dtype=torch.int32)
    flat_positions = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    longest_row = tokens_per_row.max().item()
    running_total = torch.cumsum(tokens_per_row, dim=0, dtype=torch.int32)
    # prepend a zero so that entry i marks where row i starts
    cu_seqlens = F.pad(running_total, (1, 0))
    return flat_positions, cu_seqlens, longest_row
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(
    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
    """
    Build an additive causal mask of shape `[bsz, 1, tgt_len, tgt_len + past_key_values_length]`.

    Entries for positions a query may look at (itself, earlier positions and every past
    key/value slot) are 0; entries for later positions hold `torch.finfo(dtype).min`.
    """
    bsz, tgt_len = input_ids_shape
    fill_value = torch.tensor(torch.finfo(dtype).min, device=device)
    blocked = torch.full((tgt_len, tgt_len), fill_value, device=device)
    # keep the fill value strictly above the diagonal, zero everywhere else
    causal = torch.triu(blocked, diagonal=1).to(dtype)

    if past_key_values_length > 0:
        # cached positions are always visible
        visible_past = torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device)
        causal = torch.cat([visible_past, causal], dim=-1)
    return causal[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Turn a `[bsz, seq_len]` attention mask into an additive `[bsz, 1, tgt_seq_len, src_seq_len]` mask.

    Positions where `mask` is 1 become 0; positions where it is 0 become `torch.finfo(dtype).min`.
    `tgt_len` defaults to the source length.
    """
    bsz, src_len = mask.size()
    if tgt_len is None:
        tgt_len = src_len

    broadcast = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    # 1 where attention must be blocked, 0 where it is allowed
    blocked = 1.0 - broadcast
    return blocked.masked_fill(blocked.to(torch.bool), torch.finfo(dtype).min)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2
class InternLM2RMSNorm(nn.Module):
    """Root-mean-square layer norm with a learnable per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        InternLM2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise over the last dimension in float32, then scale by `weight`."""
        original_dtype = hidden_states.dtype
        as_fp32 = hidden_states.to(torch.float32)
        mean_square = as_fp32.pow(2).mean(-1, keepdim=True)
        normalised = as_fp32 * torch.rsqrt(mean_square + self.variance_epsilon)
        # cast back before applying the scale, as the reference code does
        return self.weight * normalised.to(original_dtype)
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2
class InternLM2RotaryEmbedding(nn.Module):
    """Caches the cos/sin tables used for rotary position embeddings."""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
        self.register_buffer("inv_freq", 1.0 / (self.base ** exponents), persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cached cos/sin tables for `seq_len` positions."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        angles = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the first `seq_len` rows of the cos and sin tables, cast to `x.dtype`."""
        # x: [bs, num_attention_heads, seq_len, head_size]
        needs_growth = seq_len > self.max_seq_len_cached
        if needs_growth:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32)

        cos = self.cos_cached[:seq_len].to(dtype=x.dtype)
        sin = self.sin_cached[:seq_len].to(dtype=x.dtype)
        return (cos, sin)
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2
class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
    """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # must be set first: the parent constructor already builds the cache
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Build the cos/sin tables with position indices divided by `scaling_factor`."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        scaled_positions = positions / self.scaling_factor

        angles = torch.outer(scaled_positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)
+
+
+# Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2
class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
    """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
    Credits to the Reddit users /u/bloc97 and /u/emozilla.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # must be set first: the parent constructor already builds the cache
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Build the cos/sin tables, recomputing `inv_freq` with an enlarged base
        once `seq_len` exceeds `max_position_embeddings`."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            length_ratio = self.scaling_factor * seq_len / self.max_position_embeddings
            growth = (length_ratio - (self.scaling_factor - 1)) ** (self.dim / (self.dim - 2))
            base = self.base * growth
            exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
            self.register_buffer("inv_freq", 1.0 / (base ** exponents), persistent=False)

        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        angles = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)
+
+
+# Copied from transformers.model.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the half that moves to the front."""
    midpoint = x.shape[-1] // 2
    front = x[..., :midpoint]
    back = x[..., midpoint:]
    return torch.cat((-back, front), dim=-1)
+
+
+# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    `cos` and `sin` are indexed with `position_ids`, given an extra axis at
    `unsqueeze_dim`, and then combined with `q` and `k` in the same way.
    Returns the pair `(q_embed, k_embed)`.
    """
    cos_at_pos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin_at_pos = sin[position_ids].unsqueeze(unsqueeze_dim)

    def _rotate(states):
        return (states * cos_at_pos) + (rotate_half(states) * sin_at_pos)

    return _rotate(q), _rotate(k)
+
+
class InternLM2MLP(nn.Module):
    """Gated feed-forward block: `w2(act_fn(w1(x)) * w3(x))`, all projections bias-free."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        hidden, inner = self.hidden_size, self.intermediate_size
        self.w1 = nn.Linear(hidden, inner, bias=False)
        self.w3 = nn.Linear(hidden, inner, bias=False)
        self.w2 = nn.Linear(inner, hidden, bias=False)
        # activation looked up by name from the config
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated projection to `x` and project back to `hidden_size`."""
        gate = self.act_fn(self.w1(x))
        up = self.w3(x)
        return self.w2(gate * up)
+
+
+# Copied from transformers.model.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head axis.

    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim). With `n_rep == 1`
    the input tensor itself is returned.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # add a repeat axis right after the head axis, broadcast it, then fold it into the heads
    repeated = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return repeated.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
+
+
+# Modified from transformers.model.llama.modeling_llama.LlamaAttention
+class InternLM2Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: InternLM2Config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.is_causal = True
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+
+ self.wqkv = nn.Linear(
+ self.hidden_size,
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
+ bias=config.bias,
+ )
+
+ self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = InternLM2RotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "dynamic":
+ self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ scaling_factor=scaling_factor,
+ )
+ elif scaling_type == "linear":
+ self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.config.rope_theta,
+ scaling_factor=scaling_factor,
+ )
+ else:
+ raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.")
+ return self.rotary_emb
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Eager (matmul + softmax) attention over the fused `wqkv` projection.
+
+ Args:
+ hidden_states: 3-D input; its three sizes are unpacked as (bsz, q_len, _).
+ attention_mask: optional additive mask; must have size (bsz, 1, q_len, kv_seq_len).
+ position_ids: forwarded to `apply_rotary_pos_emb`.
+ past_key_value: optional (key, value) pair that is concatenated in front of the new keys/values on dim 2.
+ output_attentions: when False the returned attention weights are None.
+ use_cache: when True the (key, value) pair, including the cached part, is returned.
+
+ Returns:
+ `(attn_output, attn_weights, past_key_value)`.
+
+ Raises:
+ ValueError: if the attention weights, the mask or the attention output have an unexpected size.
+ """
+ # `padding_mask` only triggers the deprecation warning here; its value is not read.
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.wqkv(hidden_states)
+
+ # Split the fused last dimension into `h` groups of (num_key_value_groups + 2) slots, each `head_dim` wide.
+ qkv_states = rearrange(
+ qkv_states,
+ "b q (h gs d) -> b q h gs d",
+ gs=2 + self.num_key_value_groups,
+ d=self.head_dim,
+ )
+
+ # Within each group: the first num_key_value_groups slots are queries, the last two are key and value.
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
+ query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
+ key_states = qkv_states[..., -2, :]
+ value_states = qkv_states[..., -1, :]
+
+ # Swap the sequence and head axes (axes 1 and 2).
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ # Total key length = new keys plus any cached keys.
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ # NOTE(review): repeat_kv is defined elsewhere; presumably it expands the KV heads to match the
+ # query heads (the size check below expects num_heads) - confirm.
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ # The mask is additive: it is summed with the raw scores before the softmax.
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ # Move heads back behind the sequence axis and merge them into hidden_size.
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ attn_output = self.wo(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaFlashAttention2
+class InternLM2FlashAttention2(InternLM2Attention):
+ """
+ InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any of them.
+ """
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # InternLM2FlashAttention2 attention does not support output_attentions
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ # overwrite attention_mask with padding_mask
+ attention_mask = kwargs.pop("padding_mask")
+
+ output_attentions = False
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.wqkv(hidden_states)
+
+ qkv_states = rearrange(
+ qkv_states,
+ "b q (h gs d) -> b q h gs d",
+ gs=2 + self.num_key_value_groups,
+ d=self.head_dim,
+ )
+
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
+ query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
+ key_states = qkv_states[..., -2, :]
+ value_states = qkv_states[..., -1, :]
+
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ attn_output = self._flash_attention_forward(
+ query_states, key_states, value_states, attention_mask, q_len
+ )
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+ attn_output = self.wo(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+ def _flash_attention_forward(
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+ ):
+ """
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+ first unpad the input, then computes the attention scores and pad the final attention scores.
+
+ Args:
+ query_states (`torch.Tensor`):
+ Input query states to be passed to Flash Attention API
+ key_states (`torch.Tensor`):
+ Input key states to be passed to Flash Attention API
+ value_states (`torch.Tensor`):
+ Input value states to be passed to Flash Attention API
+ attention_mask (`torch.Tensor`):
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+ position of padding tokens and 1 for the position of non-padding tokens.
+ dropout (`int`, *optional*):
+ Attention dropout
+ softmax_scale (`float`, *optional*):
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+ """
+ # Contains at least one padding token in the sequence
+ causal = self.is_causal and query_length != 1
+ if attention_mask is not None:
+ batch_size = query_states.shape[0]
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
+ query_states, key_states, value_states, attention_mask, query_length
+ )
+
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states,
+ key_states,
+ value_states,
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_k=cu_seqlens_k,
+ max_seqlen_q=max_seqlen_in_batch_q,
+ max_seqlen_k=max_seqlen_in_batch_k,
+ dropout_p=dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ )
+
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+ else:
+ attn_output = flash_attn_func(
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+ )
+
+ return attn_output
+
+ def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+ key_layer = index_first_axis(
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+ value_layer = index_first_axis(
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+
+ if query_length == kv_seq_len:
+ query_layer = index_first_axis(
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+ )
+ cu_seqlens_q = cu_seqlens_k
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
+ indices_q = indices_k
+ elif query_length == 1:
+ max_seqlen_in_batch_q = 1
+ cu_seqlens_q = torch.arange(
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
+ ) # There is a memcpy here, that is very bad.
+ indices_q = cu_seqlens_q[:-1]
+ query_layer = query_layer.squeeze(1)
+ else:
+ # The -q_len: slice assumes left padding.
+ attention_mask = attention_mask[:, -query_length:]
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+ return (
+ query_layer,
+ key_layer,
+ value_layer,
+ indices_q.to(torch.int64),
+ (cu_seqlens_q, cu_seqlens_k),
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+ )
+
+# Lookup table from an attention-implementation name to the attention module class implementing it.
+INTERNLM2_ATTENTION_CLASSES = {
+ "eager": InternLM2Attention,
+ "flash_attention_2": InternLM2FlashAttention2,
+}
+
+# Modified from transformers.models.llama.modeling_llama.LlamaDecoderLayer
+class InternLM2DecoderLayer(nn.Module):
+ def __init__(self, config: InternLM2Config):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+
+ self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config)
+
+ self.feed_forward = InternLM2MLP(config)
+ self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ **kwargs,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*):
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+ query_sequence_length, key_sequence_length)` if default attention is used.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+ "Please make sure use `attention_mask` instead.`"
+ )
+
+ residual = hidden_states
+
+ hidden_states = self.attention_norm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.attention(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ **kwargs,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.ffn_norm(hidden_states)
+ hidden_states = self.feed_forward(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+InternLM2_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`InternLM2Config`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2
+@add_start_docstrings(
+ "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2PreTrainedModel(PreTrainedModel):
+ config_class = InternLM2Config
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["InternLM2DecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+
+InternLM2_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
+ when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaModel
+@add_start_docstrings(
+ "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2Model(InternLM2PreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLM2DecoderLayer`]
+
+ Args:
+ config: InternLM2Config
+ """
+
+ _auto_class = "AutoModel"
+
+ def __init__(self, config: InternLM2Config):
+ """Build the token embedding, `config.num_hidden_layers` decoder layers and the final RMS norm."""
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.config = config
+
+ # The third positional argument of nn.Embedding is `padding_idx`.
+ self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+
+ self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ # Off by default; `forward` only takes the checkpointed path when this is set and the module is training.
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ """Return the token embedding module (`tok_embeddings`)."""
+ return self.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ """Replace the token embedding module (`tok_embeddings`) with `value`."""
+ self.tok_embeddings = value
+
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ # Any flag left as None falls back to the value stored on the config.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # NOTE(review): _import_flash_attn is defined elsewhere in this file; presumably it imports the
+ # flash-attn functions on demand - confirm. It is invoked on every forward call.
+ if self.config.attn_implementation == "flash_attention_2":
+ _import_flash_attn()
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape[:2]
+ elif inputs_embeds is not None:
+ batch_size, seq_length = inputs_embeds.shape[:2]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ # The cached length is read from dim 2 of the first layer's first cached tensor.
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ # Default positions continue after the cached prefix; a single row with a leading axis of size 1.
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.tok_embeddings(input_ids)
+
+ if self.config.attn_implementation == "flash_attention_2":
+ # 2d mask is passed through the layers
+ # The mask is kept only when it actually contains a 0; otherwise the layers receive None.
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+ else:
+ # Eager path: default to an all-ones mask over past + current positions, then expand it.
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ # embed positions
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ # Hidden states are recorded before each layer; the final one is appended after the loop.
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # None for past_key_value
+ # Positional order: (*inputs, output_attentions, use_cache=None).
+ return module(*inputs, output_attentions, None)
+
+ return custom_forward
+
+ # The trailing None is the layer's `past_key_value` argument.
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ None,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ # The cache entry sits after the attention weights when those are returned too.
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ # Tuple form drops the entries that are None.
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+# Modified from transformers.models.llama.modeling_llama.LlamaForCausalLM
+class InternLM2ForCausalLM(InternLM2PreTrainedModel):
+ _auto_class = "AutoModelForCausalLM"
+
+ _tied_weights_keys = ["output.weight"]
+
+ def __init__(self, config):
+ """Wrap an `InternLM2Model` backbone with a bias-free linear head of width `config.vocab_size`."""
+ super().__init__(config)
+ self.model = InternLM2Model(config)
+ self.vocab_size = config.vocab_size
+ # Language-model head: hidden_size -> vocab_size, no bias.
+ self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ """Return the backbone's token embedding module."""
+ return self.model.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ """Replace the backbone's token embedding module with `value`."""
+ self.model.tok_embeddings = value
+
+ def get_output_embeddings(self):
+ """Return the output head (`self.output`)."""
+ return self.output
+
+ def set_output_embeddings(self, new_embeddings):
+ """Replace the output head (`self.output`) with `new_embeddings`."""
+ self.output = new_embeddings
+
+ def set_decoder(self, decoder):
+ """Replace the backbone (`self.model`) with `decoder`."""
+ self.model = decoder
+
+ def get_decoder(self):
+ """Return the backbone (`self.model`)."""
+ return self.model
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, InternLM2ForCausalLM
+
+ >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+
+ # Flags left as None fall back to the config; `use_cache` is passed through for the backbone to resolve.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ logits = self.output(hidden_states)
+ # Logits are cast to float32 before the loss is computed and before they are returned.
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ # Tuple form: (loss, logits, *rest) when a loss was computed, otherwise (logits, *rest).
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values is not None:
+ past_length = past_key_values[0][0].shape[2]
+
+ # Some generation methods already pass only the last input ID
+ if input_ids.shape[1] > past_length:
+ remove_prefix_length = past_length
+ else:
+ # Default to old behavior: keep only final ID
+ remove_prefix_length = input_ids.shape[1] - 1
+
+ input_ids = input_ids[:, remove_prefix_length:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -input_ids.shape[1] :]
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+ def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
+ prompt = ""
+ if meta_instruction:
+ prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
+ else:
+ prompt += ""
+ for record in history:
+ prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
+ prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
+ return tokenizer([prompt], return_tensors="pt")
+
+ @torch.no_grad()
+ def chat(
+ self,
+ tokenizer,
+ query: str,
+ history: List[Tuple[str, str]] = [],
+ streamer: Optional[BaseStreamer] = None,
+ max_new_tokens: int = 1024,
+ do_sample: bool = True,
+ temperature: float = 0.8,
+ top_p: float = 0.8,
+ meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+ "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+ "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.",
+ **kwargs,
+ ):
+ inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
+ inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
+ # also add end-of-assistant token in eos token id to avoid unnecessary generation
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
+ outputs = self.generate(
+ **inputs,
+ streamer=streamer,
+ max_new_tokens=max_new_tokens,
+ do_sample=do_sample,
+ temperature=temperature,
+ top_p=top_p,
+ eos_token_id=eos_token_id,
+ **kwargs,
+ )
+ outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
+ response = tokenizer.decode(outputs, skip_special_tokens=True)
+ response = response.split("<|im_end|>")[0]
+ history = history + [(query, response)]
+ return response, history
+
+ @torch.no_grad()
+ def stream_chat(
+ self,
+ tokenizer,
+ query: str,
+ history: List[Tuple[str, str]] = [],
+ max_new_tokens: int = 1024,
+ do_sample: bool = True,
+ temperature: float = 0.8,
+ top_p: float = 0.8,
+ **kwargs,
+ ):
+ """
+ Return a generator in format: (response, history)
+ Eg.
+ ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
+ ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
+ """
+ if BaseStreamer is None:
+ raise ModuleNotFoundError(
+ "The version of `transformers` is too low. Please make sure "
+ "that you have installed `transformers>=4.28.0`."
+ )
+
+ response_queue = queue.Queue(maxsize=20)
+
+ class ChatStreamer(BaseStreamer):
+ def __init__(self, tokenizer) -> None:
+ super().__init__()
+ self.tokenizer = tokenizer
+ self.queue = response_queue
+ self.query = query
+ self.history = history
+ self.response = ""
+ self.received_inputs = False
+ self.queue.put((self.response, history + [(self.query, self.response)]))
+
+ def put(self, value):
+ if len(value.shape) > 1 and value.shape[0] > 1:
+ raise ValueError("ChatStreamer only supports batch size 1")
+ elif len(value.shape) > 1:
+ value = value[0]
+
+ if not self.received_inputs:
+ # The first received value is input_ids, ignore here
+ self.received_inputs = True
+ return
+
+ token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
+ if token.strip() != "<|im_end|>":
+ self.response = self.response + token
+ history = self.history + [(self.query, self.response)]
+ self.queue.put((self.response, history))
+
+ def end(self):
+ self.queue.put(None)
+
+ def stream_producer():
+ return self.chat(
+ tokenizer=tokenizer,
+ query=query,
+ streamer=ChatStreamer(tokenizer=tokenizer),
+ history=history,
+ max_new_tokens=max_new_tokens,
+ do_sample=do_sample,
+ temperature=temperature,
+ top_p=top_p,
+ **kwargs,
+ )
+
+ def consumer():
+ producer = threading.Thread(target=stream_producer)
+ producer.start()
+ while True:
+ res = response_queue.get()
+ if res is None:
+ return
+ yield res
+
+ return consumer()
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
+@add_start_docstrings(
+ """
+ The InternLM2 Model transformer with a sequence classification head on top (linear layer).
+
+ [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification,
+ as other causal models (e.g. GPT-2) do.
+
+ Since it does classification on the last token, it requires to know the position of the last token. If a
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+ each row of the batch).
+ """,
+ InternLM2_START_DOCSTRING,
+)
+class InternLM2ForSequenceClassification(InternLM2PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = InternLM2Model(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.tok_embeddings
+
+ def set_input_embeddings(self, value):
+ self.model.tok_embeddings = value
+
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to(
+ logits.device
+ )
+ else:
+ sequence_lengths = -1
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
diff --git a/triton_models/tokenizer/placeholder b/triton_models/tokenizer/placeholder
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/triton_models/tokenizer/pytorch_model.bin.index.json b/triton_models/tokenizer/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..7d95cf180df4c423e817c55f30f5ce93ac80e220
--- /dev/null
+++ b/triton_models/tokenizer/pytorch_model.bin.index.json
@@ -0,0 +1,554 @@
+{
+ "metadata": {
+ "total_size": 5251801088
+ },
+ "weight_map": {
+ "model.layers.0.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.0.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.1.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.10.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.10.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.11.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.12.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.13.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.14.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.15.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.16.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.17.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.18.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.19.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.2.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.2.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.20.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.20.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.21.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.22.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.23.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.24.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.25.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.attention_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.26.ffn_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wo.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin",
+ "model.layers.27.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.27.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.28.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.29.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.3.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.3.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.30.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.30.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wo.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.attention_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin",
+ "model.layers.31.ffn_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.layers.4.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.4.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.5.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.6.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.7.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.8.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wo.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.attention_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin",
+ "model.layers.9.ffn_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "model.norm.weight": "pytorch_model-00003-of-00003.bin",
+ "model.tok_embeddings.weight": "pytorch_model-00001-of-00003.bin",
+ "output.weight": "pytorch_model-00003-of-00003.bin"
+ }
+}
diff --git a/triton_models/tokenizer/special_tokens_map.json b/triton_models/tokenizer/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..492d4b2966a1763442d426d880dbc29f94906e4c
--- /dev/null
+++ b/triton_models/tokenizer/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/triton_models/tokenizer/tokenization_internlm.py b/triton_models/tokenizer/tokenization_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9792349c7fed6fc64476eabdd9dad7a84640c3ee
--- /dev/null
+++ b/triton_models/tokenizer/tokenization_internlm.py
@@ -0,0 +1,240 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization classes for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging
+
+# Module-level logger obtained through the `transformers` logging helper.
+logger = logging.get_logger(__name__)
+
+# File name of the SentencePiece model; `InternLMTokenizer.save_vocabulary`
+# appends this value to the target directory when writing the vocabulary.
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+
+# Left empty: no pretrained vocabulary locations are registered in this module.
+PRETRAINED_VOCAB_FILES_MAP = {}
+
+
+class InternLMTokenizer(PreTrainedTokenizer):
+ """
+ Construct an InternLM tokenizer backed by a SentencePiece model.
+
+ Tokenization, id/piece conversion and the BOS/EOS ids are delegated to the
+ `sentencepiece.SentencePieceProcessor` loaded from `vocab_file`.
+
+ Args:
+ vocab_file (`str`):
+ Path to the SentencePiece model file.
+ """
+
+ # Class-level attributes wired to the module constants defined above the class.
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ model_input_names = ["input_ids", "attention_mask"]
+ # presumably lets `transformers` register this class under AutoTokenizer — confirm
+ _auto_class = "AutoTokenizer"
+
+ def __init__(
+ self,
+ vocab_file,
+ unk_token="",
+ bos_token="",
+ eos_token="",
+ pad_token="",
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ add_bos_token=True,
+ add_eos_token=False,
+ decode_with_prefix_space=False,
+ clean_up_tokenization_spaces=False,
+ **kwargs,
+ ):
+ """
+ Load the SentencePiece model and initialise the base tokenizer.
+
+ Args:
+ vocab_file (`str`):
+ Path to the SentencePiece model file; it is loaded immediately.
+ unk_token, bos_token, eos_token, pad_token (`str`):
+ Special-token strings forwarded to `PreTrainedTokenizer.__init__`.
+ sp_model_kwargs (`Dict[str, Any]`, *optional*):
+ Keyword arguments for `SentencePieceProcessor`; `None` is treated as `{}`.
+ add_bos_token (`bool`, defaults to `True`):
+ Whether `build_inputs_with_special_tokens` prepends the BOS id.
+ add_eos_token (`bool`, defaults to `False`):
+ Whether `build_inputs_with_special_tokens` appends the EOS id.
+ decode_with_prefix_space (`bool`, defaults to `False`):
+ Stored on the instance unchanged.
+ clean_up_tokenization_spaces (`bool`, defaults to `False`):
+ Forwarded to `PreTrainedTokenizer.__init__`.
+ **kwargs:
+ Forwarded to `PreTrainedTokenizer.__init__`.
+ """
+ # NOTE(review): the empty-string special-token defaults in the signature look
+ # like values whose angle-bracket markup was stripped — confirm against upstream.
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.decode_with_prefix_space = decode_with_prefix_space
+ # The model is loaded before the base initializer runs — presumably because
+ # the base class may query the vocabulary during its own setup; confirm.
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+ # Lazily filled by the `no_prefix_space_tokens` property.
+ self._no_prefix_space_tokens = None
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ **kwargs,
+ )
+
+ """ Initialization"""
+
+ @property
+ def no_prefix_space_tokens(self):
+ """Ids of vocabulary pieces that do not start with "▁"; built on first access and cached."""
+ if self._no_prefix_space_tokens is None:
+ # Map every id in the SentencePiece vocabulary to its piece so the prefix can be inspected.
+ vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+ self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
+ return self._no_prefix_space_tokens
+
+ @property
+ def vocab_size(self):
+ """Returns the number of pieces reported by the SentencePiece model."""
+ return self.sp_model.get_piece_size()
+
+ @property
+ def bos_token_id(self) -> Optional[int]:
+ """Returns the beginning-of-sequence id reported by the SentencePiece model."""
+ return self.sp_model.bos_id()
+
+ @property
+ def eos_token_id(self) -> Optional[int]:
+ """Returns the end-of-sequence id reported by the SentencePiece model."""
+ return self.sp_model.eos_id()
+
+ def get_vocab(self):
+ """Returns the vocabulary as a `{token: id}` dict, including added tokens."""
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ # Overlay the tokens registered on top of the SentencePiece vocabulary.
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def _tokenize(self, text):
+ """Encodes `text` with the SentencePiece model, requesting string pieces (`out_type=str`)."""
+ return self.sp_model.encode(text, out_type=str)
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) to an id using the SentencePiece vocab."""
+ return self.sp_model.piece_to_id(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) to a token (str) using the SentencePiece vocab."""
+ token = self.sp_model.IdToPiece(index)
+ return token
+
+ def _maybe_add_prefix_space(self, tokens, decoded):
+ """
+ Prepend a space to `decoded` unless the first entry of `tokens` is in `no_prefix_space_tokens`.
+
+ An empty `tokens` leaves `decoded` unchanged.
+ """
+ # NOTE(review): `no_prefix_space_tokens` holds integer ids, while
+ # `convert_tokens_to_string` passes token strings here — the membership test
+ # then looks like it can never match; confirm the intended element type.
+ if tokens and tokens[0] not in self.no_prefix_space_tokens:
+ return " " + decoded
+ else:
+ return decoded
+
+ def convert_tokens_to_string(self, tokens):
+ """
+ Converts a sequence of tokens (strings) into a single string.
+
+ Special tokens are appended verbatim; all other tokens are buffered and decoded
+ through the SentencePiece model. The assembled string goes through
+ `clean_up_tokenization` and `_maybe_add_prefix_space`, and its first character
+ is dropped before returning.
+ """
+ # Ordinary tokens collected since the last special token.
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for token in tokens:
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special:
+ out_string += " "
+ # Flush the buffered ordinary tokens, then emit the special token as-is.
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ # Decode whatever is still buffered after the last special token.
+ out_string += self.sp_model.decode(current_sub_tokens)
+ out_string = self.clean_up_tokenization(out_string)
+ out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+ # NOTE(review): the first character is always removed — presumably the space
+ # added by `_maybe_add_prefix_space`; confirm for inputs where none is added.
+ return out_string[1:]
+
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ """
+ Save the SentencePiece model file to a directory.
+
+ Args:
+ save_directory (`str`):
+ The directory in which to save the vocabulary. It must already exist.
+ filename_prefix (`str`, *optional*):
+ When given, `filename_prefix + "-"` is prepended to the vocabulary file name.
+
+ Returns:
+ `Tuple(str)`: Paths to the files saved, or `None` (after logging an error) when
+ `save_directory` is not a directory.
+ """
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ # NOTE(review): VOCAB_FILES_NAMES["vocab_file"] is "./tokenizer.model", so a
+ # non-empty prefix yields "<prefix>-./tokenizer.model" — confirm this is intended.
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ # Copy the original model file when it exists and is not already the target;
+ # when the original file is missing, write out the in-memory serialized model.
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ """
+ Build model inputs from one or two sequences of ids.
+
+ The result is `token_ids_0` followed by `token_ids_1` (when given), with the BOS id
+ prepended if `add_bos_token` is set and the EOS id appended if `add_eos_token` is set.
+ No special ids are inserted between the two sequences.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: The concatenated list of ids.
+ """
+ if self.add_bos_token:
+ bos_token_ids = [self.bos_token_id]
+ else:
+ bos_token_ids = []
+
+ output = bos_token_ids + token_ids_0
+
+ if token_ids_1 is not None:
+ output = output + token_ids_1
+
+ if self.add_eos_token:
+ output = output + [self.eos_token_id]
+
+ return output
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+ if already_has_special_tokens:
+ # Defer to the base-class implementation for already-formatted input.
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ # NOTE(review): the masks below always mark one leading and one trailing special
+ # token (and two between a pair), regardless of `add_bos_token` / `add_eos_token`;
+ # `build_inputs_with_special_tokens` in this class adds BOS/EOS conditionally and
+ # nothing between sequences — confirm whether the two are meant to agree.
+ if token_ids_1 is None:
+ return [1] + ([0] * len(token_ids_0)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Create a token-type-id mask for one or two sequences. No segment distinction is made here: a list of
+ zeros is returned whose length is that of the sequence(s) with one EOS id appended to each.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of zeros.
+ """
+ # Only used to account for one EOS position per sequence in the length below.
+ eos = [self.eos_token_id]
+
+ if token_ids_1 is None:
+ return len(token_ids_0 + eos) * [0]
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
diff --git a/triton_models/tokenizer/tokenizer.model b/triton_models/tokenizer/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408
--- /dev/null
+++ b/triton_models/tokenizer/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
+size 1477754
diff --git a/triton_models/tokenizer/tokenizer.py b/triton_models/tokenizer/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..db936a5501cb07d33d56083656dbd734ba7431bf
--- /dev/null
+++ b/triton_models/tokenizer/tokenizer.py
@@ -0,0 +1,400 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os
+import os.path as osp
+from collections import deque
+from typing import List, Optional, Sequence, Union
+
+import torch
+
+from lmdeploy.utils import get_logger
+
+# This file will be copied to the triton server, so make sure all
+# imports start from the package root `lmdeploy`.
+
+
+class SentencePieceTokenizer:
+ """Tokenizer backed by a `sentencepiece.SentencePieceProcessor`.
+
+ Args:
+ model_file (str): the path of the tokenizer model
+ """
+
+ def __init__(self, model_file: str):
+ # Imported here rather than at module level, so sentencepiece is only
+ # needed once this class is instantiated.
+ from sentencepiece import SentencePieceProcessor
+ self.model = SentencePieceProcessor(model_file=model_file)
+ # lazily filled by the `prefix_space_tokens` property
+ self._prefix_space_tokens = None
+ # for stop words
+ self._maybe_decode_bytes: bool = None
+ # TODO maybe lack a constant.py
+ # bounded cache of recent (token, indexes) results of
+ # `indexes_containing_token`; at most 10 entries are kept
+ self._indexes_tokens_deque = deque(maxlen=10)
+ # above this many candidate indexes, `indexes_containing_token` falls
+ # back to encoding the token
+ self.max_indexes_num = 5
+ self.logger = get_logger('lmdeploy')
+
+ @property
+ def vocab_size(self):
+ """vocabulary size reported by the sentencepiece model."""
+ return self.model.vocab_size()
+
+ @property
+ def bos_token_id(self):
+ """beginning of the sentence token id."""
+ return self.model.bos_id()
+
+ @property
+ def eos_token_id(self):
+ """end of the sentence token id."""
+ return self.model.eos_id()
+
+ @property
+ def prefix_space_tokens(self):
+ """ids of tokens whose piece starts with '▁' (built once, then cached)."""
+ if self._prefix_space_tokens is None:
+ # look up the piece of every id in the vocabulary
+ vocab = self.model.IdToPiece(list(range(self.vocab_size)))
+ self._prefix_space_tokens = {
+ i
+ for i, tok in enumerate(vocab) if tok.startswith('▁')
+ }
+ return self._prefix_space_tokens
+
+ def _maybe_add_prefix_space(self, tokens, decoded):
+ """maybe add prefix space for incremental decoding.
+
+ Args:
+ tokens: token ids that were decoded
+ decoded (str): the text decoded from ``tokens``
+ Returns:
+ str: ``decoded`` with one leading space added when ``tokens`` is
+ non-empty, ``decoded`` does not already start with a space and
+ the first id is in ``prefix_space_tokens``; otherwise
+ ``decoded`` unchanged
+ """
+ if len(tokens) and not decoded.startswith(' ') and\
+ tokens[0] in self.prefix_space_tokens:
+ return ' ' + decoded
+ else:
+ return decoded
+
+ def indexes_containing_token(self, token: str):
+ """Return all the possible indexes, whose decoding output may contain
+ the input token.
+
+ Args:
+ token (str): the text to look for inside vocabulary pieces
+ Returns:
+ list[int]: ids of every piece containing ``token``; when more than
+ ``max_indexes_num`` pieces match, only the last id produced by
+ encoding ``token`` is returned and a warning is logged
+ """
+ # traversing vocab is time consuming, can not be accelerated with
+ # multi threads (computation) or multi process (can't pickle tokenizer)
+ # so, we maintain latest 10 stop words and return directly if matched
+ for _token, _indexes in self._indexes_tokens_deque:
+ if token == _token:
+ return _indexes
+ if token == ' ': # ' ' is special
+ token = '▁'
+ vocab = self.model.IdToPiece(list(range(self.vocab_size)))
+ indexes = [i for i, voc in enumerate(vocab) if token in voc]
+ if len(indexes) > self.max_indexes_num:
+ indexes = self.encode(token, add_bos=False)[-1:]
+ self.logger.warning(
+ f'There are too many(>{self.max_indexes_num}) possible '
+ f'indexes may decoding {token}, we will use {indexes} only')
+ # NOTE(review): the cache key is the token after the ' ' -> '▁' rewrite,
+ # so a later lookup with ' ' looks like it will miss the cache; confirm.
+ self._indexes_tokens_deque.append((token, indexes))
+ return indexes
+
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
+ """Tokenize a prompt.
+
+ Args:
+ s (str): a prompt
+ add_bos (bool): forwarded to the sentencepiece ``Encode`` call as
+ its ``add_bos`` argument. Default to True
+ **kwargs: extra keyword arguments forwarded to ``Encode``
+ Returns:
+ list[int]: token ids
+ """
+ return self.model.Encode(s, add_bos=add_bos, **kwargs)
+
+ def decode(self, t: Sequence[int], offset: Optional[int] = None):
+ """De-tokenize.
+
+ Args:
+ t (List[int]): a list of token ids; a ``torch.Tensor`` is also
+ accepted and converted to a list first
+ offset (int): for incrementally decoding. Only the ids from
+ ``offset`` onwards are decoded. Default to None, which
+ means not applied.
+ Returns:
+ str: text of decoding tokens
+ """
+ if isinstance(t, torch.Tensor):
+ t = t.tolist()
+ # with offset None this slice keeps the whole sequence
+ t = t[offset:]
+ out_string = self.model.Decode(t)
+ # the prefix-space fix-up is skipped when offset is None or 0
+ if offset:
+ out_string = self._maybe_add_prefix_space(t, out_string)
+ return out_string
+
+ def __call__(self, s: Union[str, Sequence[str]]):
+ """Tokenize prompts.
+
+ Args:
+ s (str): prompts
+ Returns:
+ addict.Addict: an object whose ``input_ids`` field holds the
+ result of encoding ``s`` without bos and eos tokens
+ """
+ import addict
+ add_bos = False
+ add_eos = False
+
+ input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
+ return addict.Addict(input_ids=input_ids)
+
+
+class HuggingFaceTokenizer:
+ """Tokenizer backed by a `transformers.AutoTokenizer`.
+
+ Args:
+ model_dir (str): the directory of the tokenizer model
+ """
+
+ def __init__(self, model_dir: str):
+ from transformers import AutoTokenizer
+ model_file = osp.join(model_dir, 'tokenizer.model')
+ backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
+ model_file_exists = osp.exists(model_file)
+ self.logger = get_logger('lmdeploy')
+ # only tokenizer.model is present: warn before loading
+ if not osp.exists(backend_tokenizer_file) and model_file_exists:
+ self.logger.warning(
+ 'Can not find tokenizer.json. '
+ 'It may take long time to initialize the tokenizer.')
+ # NOTE(review): trust_remote_code=True allows code shipped in model_dir
+ # to run while loading — confirm model_dir is always trusted.
+ self.model = AutoTokenizer.from_pretrained(model_dir,
+ trust_remote_code=True)
+ # lazily filled by the `prefix_space_tokens` property
+ self._prefix_space_tokens = None
+ # save tokenizer.json to reuse
+ if not osp.exists(backend_tokenizer_file) and model_file_exists:
+ if hasattr(self.model, 'backend_tokenizer'):
+ # skip saving when the model directory is not writable
+ if os.access(model_dir, os.W_OK):
+ self.model.backend_tokenizer.save(backend_tokenizer_file)
+
+ # fall back to generation_config.json, or to the `eod_id` attribute,
+ # when the loaded tokenizer does not define an eos token id
+ if self.model.eos_token_id is None:
+ generation_config_file = osp.join(model_dir,
+ 'generation_config.json')
+ if osp.exists(generation_config_file):
+ with open(generation_config_file, 'r') as f:
+ cfg = json.load(f)
+ self.model.eos_token_id = cfg['eos_token_id']
+ elif hasattr(self.model, 'eod_id'): # Qwen remote
+ self.model.eos_token_id = self.model.eod_id
+
+ # for stop words
+ # lazily filled by the `vocab_size_with_added` property
+ self._vocab_size_with_added: int = None
+ # lazily filled by the `maybe_decode_bytes` property
+ self._maybe_decode_bytes: bool = None
+ # TODO maybe lack a constant.py
+ self._indexes_tokens_deque = deque(maxlen=10)
+ self.max_indexes_num = 5
+ self.token2id = {}
+
+ @property
+ def vocab_size(self):
+ """vocabulary size."""
+ return self.model.vocab_size
+
+ @property
+ def vocab_size_with_added(self):
+ """vocabulary size with added vocab."""
+ if self._vocab_size_with_added is not None:
+ return self._vocab_size_with_added
+ self._vocab_size_with_added = len(self.model.get_vocab())
+ return self._vocab_size_with_added
+
+ @property
+ def bos_token_id(self):
+ """begine of the sentence token id."""
+ return self.model.bos_token_id
+
+ @property
+ def eos_token_id(self):
+ """end of the sentence token id."""
+ return self.model.eos_token_id
+
+ @property
+ def prefix_space_tokens(self):
+ """tokens without prefix space."""
+ if self._prefix_space_tokens is None:
+ vocab = self.model.convert_ids_to_tokens(
+ list(range(self.vocab_size)))
+ self._prefix_space_tokens = {
+ i
+ for i, tok in enumerate(vocab)
+ if tok.startswith('▁' if isinstance(tok, str) else b' ')
+ }
+ return self._prefix_space_tokens
+
+ def _maybe_add_prefix_space(self, tokens: List[int], decoded: str):
+ """maybe add prefix space for incremental decoding."""
+ if len(tokens) and not decoded.startswith(' ') and\
+ tokens[0] in self.prefix_space_tokens:
+ return ' ' + decoded
+ else:
+ return decoded
+
+ @property
+ def maybe_decode_bytes(self):
+ """Check if self.model.convert_ids_to_tokens return not a str value."""
+ if self._maybe_decode_bytes is None:
+ self._maybe_decode_bytes = False
+ vocab = self.model.convert_ids_to_tokens(
+ list(range(self.vocab_size)))
+ for tok in vocab:
+ if not isinstance(tok, str):
+ self._maybe_decode_bytes = True
+ break
+ return self._maybe_decode_bytes
+
+ def indexes_containing_token(self, token: str):
+ """Return all the possible indexes, whose decoding output may contain
+ the input token."""
+ # traversing vocab is time consuming, can not be accelerated with
+ # multi threads (computation) or multi process (can't pickle tokenizer)
+ # so, we maintain latest 10 stop words and return directly if matched
+ for _token, _indexes in self._indexes_tokens_deque:
+ if token == _token:
+ return _indexes
+
+ if self.token2id == {}:
+ # decode is slower than convert_ids_to_tokens
+ if self.maybe_decode_bytes:
+ self.token2id = {
+ self.model.decode(i): i
+ for i in range(self.vocab_size)
+ }
+ else:
+ self.token2id = {
+ self.model.convert_ids_to_tokens(i): i
+ for i in range(self.vocab_size)
+ }
+ if token == ' ': # ' ' is special
+ token = '▁'
+ indexes = [i for _token, i in self.token2id.items() if token in _token]
+ if len(indexes) > self.max_indexes_num:
+ indexes = self.encode(token, add_bos=False)[-1:]
+ self.logger.warning(
+ f'There are too many(>{self.max_indexes_num}) possible '
+ f'indexes may decoding {token}, we will use {indexes} only')
+ # there might be token id that exceeds self.vocab_size
+ if len(indexes) == 0:
+ indexes = self.encode(token, False)
+ if len(indexes) != 1:
+ self.logger.warning(
+ f'The token {token}, its length of indexes {indexes} is '
+ 'not 1. Currently, it can not be used as stop words')
+ indexes = []
+ self._indexes_tokens_deque.append((token, indexes))
+ return indexes
+
+ def encode(self, s: str, add_bos: bool = True, **kwargs):
+ """Tokenize a prompt.
+
+ Args:
+ s (str): a prompt
+ Returns:
+ list[int]: token ids
+ """
+ encoded = self.model.encode(s, **kwargs)
+ if not add_bos:
+ # in the middle of a session
+ if len(encoded) and encoded[0] == self.bos_token_id:
+ encoded = encoded[1:]
+ return encoded
+
+ def decode(self, t: Sequence[int], offset: Optional[int] = None):
+ """De-tokenize.
+
+ Args:
+ t (List[int]): a list of token ids
+ offset (int): for incrementally decoding. Default to None, which
+ means not applied.
+ Returns:
+ str: text of decoding tokens
+ """
+ skip_special_tokens = True
+ t = t[offset:]
+ out_string = self.model.decode(t,
+ skip_special_tokens=skip_special_tokens)
+ if offset:
+ out_string = self._maybe_add_prefix_space(t, out_string)
+ return out_string
+
+ def __call__(self, s: Union[str, Sequence[str]]):
+ """Tokenize prompts.
+
+ Args:
+ s (str): prompts
+ Returns:
+ list[int]: token ids
+ """
+ add_special_tokens = False
+ return self.model(s, add_special_tokens=add_special_tokens)
+
+
+class Tokenizer:
+    """Facade that tokenizes prompts and de-tokenizes token ids.
+
+    Every operation is delegated to a `SentencePieceTokenizer` or a
+    `HuggingFaceTokenizer`, chosen in `__init__` from the files found next
+    to the tokenizer model.
+
+    Args:
+        model_file (str): path of a `*.model` tokenizer file, or of the
+            folder that holds the tokenizer files
+    """
+
+    def __init__(self, model_file: str):
+        if model_file.endswith('.model'):
+            folder = osp.dirname(model_file)
+        else:
+            # a folder was given: look for the default model file inside it
+            folder = model_file
+            model_file = osp.join(folder, 'tokenizer.model')
+
+        has_model_file = osp.exists(model_file)
+        has_config = osp.exists(osp.join(folder, 'tokenizer_config.json'))
+        self.logger = get_logger('lmdeploy')
+        # sentencepiece is picked only when the model file exists and there
+        # is no tokenizer_config.json
+        if has_model_file and not has_config:
+            self.model = SentencePieceTokenizer(model_file)
+        else:
+            self.model = HuggingFaceTokenizer(folder)
+
+    @property
+    def vocab_size(self):
+        """Vocabulary size of the wrapped tokenizer."""
+        return self.model.vocab_size
+
+    @property
+    def bos_token_id(self):
+        """Id of the begin-of-sentence token."""
+        return self.model.bos_token_id
+
+    @property
+    def eos_token_id(self):
+        """Id of the end-of-sentence token."""
+        return self.model.eos_token_id
+
+    def encode(self, s: str, add_bos: bool = True, **kwargs):
+        """Turn a prompt into token ids.
+
+        Args:
+            s (str): a prompt
+            add_bos (bool): forwarded positionally to the wrapped
+                tokenizer's `encode`
+            **kwargs: forwarded to the wrapped tokenizer's `encode`
+        Returns:
+            list[int]: token ids
+        """
+        token_ids = self.model.encode(s, add_bos, **kwargs)
+        return token_ids
+
+    def decode(self, t: Sequence[int], offset: Optional[int] = None):
+        """Turn token ids back into text.
+
+        Args:
+            t (List[int]): a list of token ids
+            offset (int): for incrementally decoding. Default to None, which
+                means not applied.
+        Returns:
+            str: text of decoding tokens
+        """
+        text = self.model.decode(t, offset)
+        return text
+
+    def __call__(self, s: Union[str, Sequence[str]]):
+        """Tokenize prompts with the wrapped tokenizer's `__call__`.
+
+        Args:
+            s (str | Sequence[str]): a prompt or a sequence of prompts
+        Returns:
+            the output of the wrapped tokenizer
+        """
+        return self.model(s)
+
+    def indexes_containing_token(self, token):
+        """Return all the possible indexes, whose decoding output may contain
+        the input token.
+
+        A token that encodes to more than one id is rejected: a warning is
+        logged and an empty list is returned.
+        """
+        token_ids = self.encode(token, add_bos=False)
+        if len(token_ids) <= 1:
+            return self.model.indexes_containing_token(token)
+        self.logger.warning(
+            f'The token {token}, its length of indexes {token_ids} is over '
+            'than 1. Currently, it can not be used as stop words')
+        return []
diff --git a/triton_models/tokenizer/tokenizer_config.json b/triton_models/tokenizer/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f133449013be570f08fdf7c70f1a2c8ccb4724da
--- /dev/null
+++ b/triton_models/tokenizer/tokenizer_config.json
@@ -0,0 +1,90 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92538": {
+ "content": "<|plugin|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92539": {
+ "content": "<|interpreter|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92540": {
+ "content": "<|action_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92541": {
+ "content": "<|action_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92542": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "92543": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_internlm.InternLMTokenizer",
+ null
+ ]
+ },
+ "bos_token": "",
+ "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "tokenizer_class": "InternLMTokenizer",
+ "unk_token": ""
+}
diff --git a/triton_models/weights/config.ini b/triton_models/weights/config.ini
new file mode 100644
index 0000000000000000000000000000000000000000..88f3d40970a1e663689736be546f8d3d64bb8734
--- /dev/null
+++ b/triton_models/weights/config.ini
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c8358cd3fffcb86829f6b600bdd0ba77b6147eed572f88700ec4d914db070d6
+size 645
diff --git a/triton_models/weights/layers.0.attention.w_qkv.0.qweight b/triton_models/weights/layers.0.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..4f5435a75963ce7ce17b0536f500c8ebf8ca4220
--- /dev/null
+++ b/triton_models/weights/layers.0.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1763929a6e7bbdafdb81d39ebfa08263351ccea12347aa68b292b1b7c458e45
+size 12582912
diff --git a/triton_models/weights/layers.0.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.0.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..52107ec494683ad0e0403e4189bcceed1ceabdcb
--- /dev/null
+++ b/triton_models/weights/layers.0.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ed40e83191f5304fd2df93ff5b90ae9a165bbe489af8020e06948fbbb289d7d
+size 786432
diff --git a/triton_models/weights/layers.0.attention.wo.0.qweight b/triton_models/weights/layers.0.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6e21231bbe43b92e43a0d2600ed6969f6c00e767
--- /dev/null
+++ b/triton_models/weights/layers.0.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6710235be94402052aaaae809e488f433d75d6d33acf546e2d0bf7aae4d8f0f
+size 8388608
diff --git a/triton_models/weights/layers.0.attention.wo.0.scales_zeros b/triton_models/weights/layers.0.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4961bf6cfbf6ae7592675c56d719924794d8da68
--- /dev/null
+++ b/triton_models/weights/layers.0.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c069c91ef3a796ac2e9e0230319fabb6bc8433c68284c6e5ca71baa477a3438
+size 524288
diff --git a/triton_models/weights/layers.0.attention_norm.weight b/triton_models/weights/layers.0.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..51dd734ab95204a4ce7fd026707a375f1a85219f
--- /dev/null
+++ b/triton_models/weights/layers.0.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dde3cfe82d02d87660f40c667186249cd17a5ee5924ab2a3ea0385919a2d0f3b
+size 8192
diff --git a/triton_models/weights/layers.0.feed_forward.w13.0.qweight b/triton_models/weights/layers.0.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f3167a75e6defd59aa396437f58c797bb5cf1b2c
--- /dev/null
+++ b/triton_models/weights/layers.0.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26bc912102aa2b487baf312f3bfd8f97dc46ba6761c2328bfd3e45581bfbcfd4
+size 58720256
diff --git a/triton_models/weights/layers.0.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.0.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..68343cbdcbc17ec725af43c1a1d53b62bc5c32c0
--- /dev/null
+++ b/triton_models/weights/layers.0.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:309c93937a8778e4e4dce879efd1e0673f4bb7701644628abbaa8420e5b24cf0
+size 3670016
diff --git a/triton_models/weights/layers.0.feed_forward.w2.0.qweight b/triton_models/weights/layers.0.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3e82c77a6ba7b16d19d55f544f872223d33fba6d
--- /dev/null
+++ b/triton_models/weights/layers.0.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d096d08769d4b05f7483b4ed024224e0d4d35772231e757157e69c9c0dc1c6ef
+size 29360128
diff --git a/triton_models/weights/layers.0.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.0.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..fee7031bc4703588c99d993aaf4e1c0f1d080e5b
--- /dev/null
+++ b/triton_models/weights/layers.0.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdb73c0a0f614f1033850266d6ff4311374557a2653e0fa7857f8507ca87058e
+size 1835008
diff --git a/triton_models/weights/layers.0.ffn_norm.weight b/triton_models/weights/layers.0.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e8f321d4e16161bcdf7f2b6979e9f90b8aa04ef3
--- /dev/null
+++ b/triton_models/weights/layers.0.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5b414270e0d50fbec62cdab6ecd217c2f688872d5ed7d9f91bb75dfff46651b
+size 8192
diff --git a/triton_models/weights/layers.0.past_kv_scale.0.weight b/triton_models/weights/layers.0.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e376c6acc6ad65b07267f834beda69a889c5f0b1
--- /dev/null
+++ b/triton_models/weights/layers.0.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25f7250671024d0129c45c3f3d8f57887921d219c280350697d41e9170925c77
+size 16
diff --git a/triton_models/weights/layers.1.attention.w_qkv.0.qweight b/triton_models/weights/layers.1.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bb3ebc7beaa1d925c4a14fbad6d2df2ec6bad94f
--- /dev/null
+++ b/triton_models/weights/layers.1.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a125e82d7ee989858902abca2bec9dc3f4ad74008f5307a1e7a635d148c53f3a
+size 12582912
diff --git a/triton_models/weights/layers.1.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.1.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..bc0ed1f6f8ef00629e07ce4989e2ddde96723c08
--- /dev/null
+++ b/triton_models/weights/layers.1.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f96d91127194d8a8404809f81602727e59903c86473ee27012bb303f83cdf77
+size 786432
diff --git a/triton_models/weights/layers.1.attention.wo.0.qweight b/triton_models/weights/layers.1.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2eaa43207863db980e17ed160bc4613b175baf27
--- /dev/null
+++ b/triton_models/weights/layers.1.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4905342d79812e6bd9d6d993443ee6b30df2f80cef44176d1398dc884c458bad
+size 8388608
diff --git a/triton_models/weights/layers.1.attention.wo.0.scales_zeros b/triton_models/weights/layers.1.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c136a82b25947dc950216cf643734a4a5ee81a36
--- /dev/null
+++ b/triton_models/weights/layers.1.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c7971bdedd76bbe5630fd97b2badbdd26d22055ffe6fe0374fff051af9feb80
+size 524288
diff --git a/triton_models/weights/layers.1.attention_norm.weight b/triton_models/weights/layers.1.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..abe49b3b4fe282cbcf269cc92e4a1b03f8304d1b
--- /dev/null
+++ b/triton_models/weights/layers.1.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d589a6b27b707580d37c4b198dc952071bb1a34967ebd9175f9055ac012bc781
+size 8192
diff --git a/triton_models/weights/layers.1.feed_forward.w13.0.qweight b/triton_models/weights/layers.1.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..7d2bbd8d926a99dd1ba3adf0859660ace736b884
--- /dev/null
+++ b/triton_models/weights/layers.1.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1dd761cf75a1f95c5a55a245fbe1a8bca8967be0d7a03dd12108d0be835d7682
+size 58720256
diff --git a/triton_models/weights/layers.1.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.1.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9fb67e07dca86f3c043855b520b84ed83c9b4930
--- /dev/null
+++ b/triton_models/weights/layers.1.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d4fdfeee03517f7896aadab5adec50c8449a2e1bda2f0cf5b8725b26057d1f6
+size 3670016
diff --git a/triton_models/weights/layers.1.feed_forward.w2.0.qweight b/triton_models/weights/layers.1.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..83348571bf69b92747b68f25d3755c7b2146e4c5
--- /dev/null
+++ b/triton_models/weights/layers.1.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0c42be27fe2e9f48473b5cc4ec63cd06575ade857ea8699b4bd05eb4f801dc6
+size 29360128
diff --git a/triton_models/weights/layers.1.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.1.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7f8d31081aee57241eed23ae114dd5e39f9e6bbf
--- /dev/null
+++ b/triton_models/weights/layers.1.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe915a8697f98fe80270d235325b469219fac1c8a4529052fd15f6b1ee8f13e6
+size 1835008
diff --git a/triton_models/weights/layers.1.ffn_norm.weight b/triton_models/weights/layers.1.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6db47869baaf62ea10c904bb39ca2fd8dcb35aa5
--- /dev/null
+++ b/triton_models/weights/layers.1.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90fa27f32ad04b368d7110fb689b24ea02904efb2f2b7a9f9be876c331fc7212
+size 8192
diff --git a/triton_models/weights/layers.1.past_kv_scale.0.weight b/triton_models/weights/layers.1.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..87ba80c2080cfc64bd645133d99c4fb0f602b920
--- /dev/null
+++ b/triton_models/weights/layers.1.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08456e5241a0fbd14699cb889680261c9e0ca7d30051066d899e99be24e15d52
+size 16
diff --git a/triton_models/weights/layers.10.attention.w_qkv.0.qweight b/triton_models/weights/layers.10.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..35f6c98510eb157f0971d9d241b2ec765cd3c834
--- /dev/null
+++ b/triton_models/weights/layers.10.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4d8d7ae69eea66730a10e906758105f2c99b16d082b9ea84d7e7cd8afcdbd4c
+size 12582912
diff --git a/triton_models/weights/layers.10.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.10.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..77eb52490f504dbd5b089674f267142c27e7acc0
--- /dev/null
+++ b/triton_models/weights/layers.10.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2885240377b91bd85bbe4ee6f67b8ca23233584c35ce71b752f9f3bbb66e266c
+size 786432
diff --git a/triton_models/weights/layers.10.attention.wo.0.qweight b/triton_models/weights/layers.10.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..440d3e309d85cdfb81736fd024a2834f4d0ce308
--- /dev/null
+++ b/triton_models/weights/layers.10.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae5115820467dcb2720eeb7abbdaf3ecd5edb56d9d7453fb0bf4f6b65323445a
+size 8388608
diff --git a/triton_models/weights/layers.10.attention.wo.0.scales_zeros b/triton_models/weights/layers.10.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..18b5ecc65f6f8133a1821de0925d37622a67af48
--- /dev/null
+++ b/triton_models/weights/layers.10.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4438217ed5de15cb91f4e30f0644b08952e981d25015dd4b75c4a0cae83517c2
+size 524288
diff --git a/triton_models/weights/layers.10.attention_norm.weight b/triton_models/weights/layers.10.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..4f0f39a02bb84010dd644e2fc96ef3b46d4c2820
--- /dev/null
+++ b/triton_models/weights/layers.10.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cd2c0d884542c0a881ef8fcfc9fbcc1feb67afbff0a8befc9bb741e2d8ea2af
+size 8192
diff --git a/triton_models/weights/layers.10.feed_forward.w13.0.qweight b/triton_models/weights/layers.10.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bf50b623e7b1f4520d761286edd1db51a109c4c6
--- /dev/null
+++ b/triton_models/weights/layers.10.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a1258ea1e97e4c41db26a363eddedd3bd47c6d49f7bf738703c5746c54f4e37
+size 58720256
diff --git a/triton_models/weights/layers.10.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.10.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ee36f684587a649d68d9579441ca3e90af8d7d6e
--- /dev/null
+++ b/triton_models/weights/layers.10.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48e7492a7d4447980961b5891a0997f2568bdbe10ed15ba0998f8ca1bdaf0a4c
+size 3670016
diff --git a/triton_models/weights/layers.10.feed_forward.w2.0.qweight b/triton_models/weights/layers.10.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b0cce8413321f6074dc61c7a28bc92377f4c7ab2
--- /dev/null
+++ b/triton_models/weights/layers.10.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fb81b3c6a3f7b674506b003621b7e92925754e97d23ecb1209003f2232e33cb
+size 29360128
diff --git a/triton_models/weights/layers.10.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.10.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ce1603f2d10d9ae9ef7251cb66a02c3e0cba6b67
--- /dev/null
+++ b/triton_models/weights/layers.10.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:773b9c8eb4a3818b2667162b3169bd4fe813f2fcba5c708a49b79fa5c5053c61
+size 1835008
diff --git a/triton_models/weights/layers.10.ffn_norm.weight b/triton_models/weights/layers.10.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..bbe9a16316f0db34745e41ef00224f94b9237fee
--- /dev/null
+++ b/triton_models/weights/layers.10.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b576f4d059d0f37a4fd3e626e640dad540ff4758aa449bafe55a78046a01dc9b
+size 8192
diff --git a/triton_models/weights/layers.10.past_kv_scale.0.weight b/triton_models/weights/layers.10.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..da0421db9e924c29c37c13c09376487aaa383c8d
--- /dev/null
+++ b/triton_models/weights/layers.10.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:430d675f2f2e4512591d558ea6f29e42dd38c55ffcd8d21873a12e9ff90e15b2
+size 16
diff --git a/triton_models/weights/layers.11.attention.w_qkv.0.qweight b/triton_models/weights/layers.11.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d5058e0b21a7342d2379f3a9315e85ef9bbe7682
--- /dev/null
+++ b/triton_models/weights/layers.11.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2871ddd112a88bb89a549de3bf1c53af525e962e118eb7ad0feac6a56599a26e
+size 12582912
diff --git a/triton_models/weights/layers.11.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.11.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..92844164ec6f5b42e8222c577ce94bae5314a9c9
--- /dev/null
+++ b/triton_models/weights/layers.11.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de7017bdedc110df3a9f9fab19466968a5488b9ab3ad533f0908f2d368371adb
+size 786432
diff --git a/triton_models/weights/layers.11.attention.wo.0.qweight b/triton_models/weights/layers.11.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c67e6d4b3e11faa456791b77155fef70589e246f
--- /dev/null
+++ b/triton_models/weights/layers.11.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:530e3110fadceb664c29ff9da577cf401128e93ae21601affd1c62137b04db35
+size 8388608
diff --git a/triton_models/weights/layers.11.attention.wo.0.scales_zeros b/triton_models/weights/layers.11.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4e0d310e48ae8ebd9b629872134eb3687a55e341
--- /dev/null
+++ b/triton_models/weights/layers.11.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1725da8fac86700a95c4ee9d40cf9ebf0d1ebabb4b145c2d57c4a31c42299cb8
+size 524288
diff --git a/triton_models/weights/layers.11.attention_norm.weight b/triton_models/weights/layers.11.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..f57dfc1e256d2fca8f1c8d59982ea28fb2f209c8
--- /dev/null
+++ b/triton_models/weights/layers.11.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4cb24612b49347f84741d6daab9a90b828aab924fc9b21fd2d2ca6b67abf8ea8
+size 8192
diff --git a/triton_models/weights/layers.11.feed_forward.w13.0.qweight b/triton_models/weights/layers.11.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..27905dc8bb55b6305cefdf0135d72eda3e7e17d9
--- /dev/null
+++ b/triton_models/weights/layers.11.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0af7f58d1e58e6610b5b56291bf697d79471c1eeaefdff9466fdc87996c3c86
+size 58720256
diff --git a/triton_models/weights/layers.11.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.11.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..474796975c206470856a63e5627806fdd1a9d0e4
--- /dev/null
+++ b/triton_models/weights/layers.11.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46e2d6846839f995e9434c35519a1152c52285d29672febe66e9f07b0e7523e5
+size 3670016
diff --git a/triton_models/weights/layers.11.feed_forward.w2.0.qweight b/triton_models/weights/layers.11.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b8e4a4f967601a2151a7eb5da1c126599eea4743
--- /dev/null
+++ b/triton_models/weights/layers.11.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ae182cb83af72cac11a76113fc5492ae4ccda1cd45df36facac10e65369d22c
+size 29360128
diff --git a/triton_models/weights/layers.11.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.11.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..aac9a3ac0afb93d279461dacd82e1fd80dfb6161
--- /dev/null
+++ b/triton_models/weights/layers.11.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54af6ef8d3b0aaa32183d5fb176a4d2097bd043e44ebea37ba43ac4021e18253
+size 1835008
diff --git a/triton_models/weights/layers.11.ffn_norm.weight b/triton_models/weights/layers.11.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6f958acb3e97bbc263ba99adb14ceb897dc7e573
--- /dev/null
+++ b/triton_models/weights/layers.11.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ae646b4e03481a9e0eccf0a151deeae360012b79d455f413d6b4c8c05ead016
+size 8192
diff --git a/triton_models/weights/layers.11.past_kv_scale.0.weight b/triton_models/weights/layers.11.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3bf7aed58e43958ad08d6b6e8beffe072f7e15e6
--- /dev/null
+++ b/triton_models/weights/layers.11.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:114046d9b18a39823a18019529563163f191e5a74c65e959db74c96b77c9b4b9
+size 16
diff --git a/triton_models/weights/layers.12.attention.w_qkv.0.qweight b/triton_models/weights/layers.12.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b026bcfd8643c18461670a5a2980cf9a8539bb2b
--- /dev/null
+++ b/triton_models/weights/layers.12.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d30b7fa1db362abf3186072da75c305cd7e79f90f4b1eea6095014d9f7989da7
+size 12582912
diff --git a/triton_models/weights/layers.12.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.12.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..011903f321dd322447298b693e1eedb17f35c3ac
--- /dev/null
+++ b/triton_models/weights/layers.12.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:654fe994288ed138b388cb0e14a9c4e7124b601ac4efa404788e3267ed137307
+size 786432
diff --git a/triton_models/weights/layers.12.attention.wo.0.qweight b/triton_models/weights/layers.12.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fd89f748d1ea906c6617d240a4e123d243105b64
--- /dev/null
+++ b/triton_models/weights/layers.12.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:069d9e054d6cd0171b229e37a70b6a2fca364783cc8e80de9f81060931964e0b
+size 8388608
diff --git a/triton_models/weights/layers.12.attention.wo.0.scales_zeros b/triton_models/weights/layers.12.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b46cd92e96aa0e40ba260aea37674bdb9fbf1fd6
--- /dev/null
+++ b/triton_models/weights/layers.12.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:394968e46096fa0f50701fe0d09193561276359f023ea5dbc3a16bb3f1aff8b8
+size 524288
diff --git a/triton_models/weights/layers.12.attention_norm.weight b/triton_models/weights/layers.12.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0020f8c429974d047571347728c95d5259c0da58
--- /dev/null
+++ b/triton_models/weights/layers.12.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:020a5a9ed0a5065303d1079d24ce7252b639f6f76bf49c7b8fb5fac3bc93fc1b
+size 8192
diff --git a/triton_models/weights/layers.12.feed_forward.w13.0.qweight b/triton_models/weights/layers.12.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f5cd9ca940d4417db1082cb6b445b56fc3ed304e
--- /dev/null
+++ b/triton_models/weights/layers.12.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9293f916e4009deb3dd715ac0fea08afe5be75548d2fe2e70a67fd5826664cea
+size 58720256
diff --git a/triton_models/weights/layers.12.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.12.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..be6c9b7b29a56d2d3afaec63b36099fc29d1ba80
--- /dev/null
+++ b/triton_models/weights/layers.12.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89899a4751211dda4328e2380ceec5d62d0d0b13fd164ccb7c9f5e189409a08f
+size 3670016
diff --git a/triton_models/weights/layers.12.feed_forward.w2.0.qweight b/triton_models/weights/layers.12.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..53e4822e263ce179450dcfacefe7dd882447324d
--- /dev/null
+++ b/triton_models/weights/layers.12.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f0f0481d3c7eeecc2717614f38dcd54163c287431e82da95a1e8d5fd182cc27
+size 29360128
diff --git a/triton_models/weights/layers.12.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.12.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..2f8d90a6c38370788887ee529f4ad8c7b4fd6593
--- /dev/null
+++ b/triton_models/weights/layers.12.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:690b11e4c0f825ec39db6b53fc1ccdd51d051c752199195f2cff8079ef3b980d
+size 1835008
diff --git a/triton_models/weights/layers.12.ffn_norm.weight b/triton_models/weights/layers.12.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..35e00aeee302ec1726ef04c71f2a2f429fe0d23e
--- /dev/null
+++ b/triton_models/weights/layers.12.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce6abd982c6b4b398f13a6113cfaefff0fe65190ff1b232c8b9a68acb30fbfdb
+size 8192
diff --git a/triton_models/weights/layers.12.past_kv_scale.0.weight b/triton_models/weights/layers.12.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8fb69a827363200f7cd82be1b4f35bab6e143bb7
--- /dev/null
+++ b/triton_models/weights/layers.12.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3cee21f879722a16a454f6455c8d8c3aec77cbfdba6cbebac9c4762d1d03bb2
+size 16
diff --git a/triton_models/weights/layers.13.attention.w_qkv.0.qweight b/triton_models/weights/layers.13.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..63d098e6067e1aac3d4f6083c34f967abcfb40f4
--- /dev/null
+++ b/triton_models/weights/layers.13.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:983fa35043fba20d8f39610fc859862486472388df708d85176e198b9493f194
+size 12582912
diff --git a/triton_models/weights/layers.13.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.13.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f78fb596aaf17a70c0fc17098a02d2fbd9f8b12e
--- /dev/null
+++ b/triton_models/weights/layers.13.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bcfbdb8a6f2d86500e49d21e3d0cf88dda2e18b505be8459e46962f1a5403902
+size 786432
diff --git a/triton_models/weights/layers.13.attention.wo.0.qweight b/triton_models/weights/layers.13.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d0443fc30519b3ca74b5e3d4e0317af1dbe8b32d
--- /dev/null
+++ b/triton_models/weights/layers.13.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e76d5b55510b3111a4c8068f8bf2abe8372c9868a5346fd03831633817f49a3
+size 8388608
diff --git a/triton_models/weights/layers.13.attention.wo.0.scales_zeros b/triton_models/weights/layers.13.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..6cbcd17aed1ae804e9e87a936274b99c9ad81296
--- /dev/null
+++ b/triton_models/weights/layers.13.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da85282928c5b1723c48e93cdadc416b400deb61bb90f28c4675989ab7d2f4f8
+size 524288
diff --git a/triton_models/weights/layers.13.attention_norm.weight b/triton_models/weights/layers.13.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..58edee2f8e729e06965c92f434900ae4f75e1a49
--- /dev/null
+++ b/triton_models/weights/layers.13.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:592d7039e973372cadcf8b3f717c19ecbcb911e2f40140d617855643bf2bfa3f
+size 8192
diff --git a/triton_models/weights/layers.13.feed_forward.w13.0.qweight b/triton_models/weights/layers.13.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..0f2f191246be551220b2b9df11e88d070f4b63c7
--- /dev/null
+++ b/triton_models/weights/layers.13.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1cbe619508e858a2637045e1e07f9cb0ec4c6020d6041e40bc9558aaa9fd290
+size 58720256
diff --git a/triton_models/weights/layers.13.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.13.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8114a135ab96b7c28393bb44bad7050a71bd712c
--- /dev/null
+++ b/triton_models/weights/layers.13.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c555740ee91741c87411db09bc23b419caa191a4ac0ccf7e34b00fe64e614493
+size 3670016
diff --git a/triton_models/weights/layers.13.feed_forward.w2.0.qweight b/triton_models/weights/layers.13.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..efc53988aa0826924baa6153c20d1fb1abae3183
--- /dev/null
+++ b/triton_models/weights/layers.13.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5434cecf17636b9bbdf1df6ae4b6d1eb6c06a611c93fe0291ad0d3892d850a81
+size 29360128
diff --git a/triton_models/weights/layers.13.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.13.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c927886fb77c90e7e2afb11bb38945c179e779cd
--- /dev/null
+++ b/triton_models/weights/layers.13.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c89194f222aef9d0488e0677d654d9f4cc783cebad2ba76e9013ef99684a1c2c
+size 1835008
diff --git a/triton_models/weights/layers.13.ffn_norm.weight b/triton_models/weights/layers.13.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0044a510f007c3e66e363ee02bbc25f4c26cb6a6
--- /dev/null
+++ b/triton_models/weights/layers.13.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75cc6d0e292ec019791db0f7ef63b0508d8a5d19404fadb09c1b06a8dcae7cdb
+size 8192
diff --git a/triton_models/weights/layers.13.past_kv_scale.0.weight b/triton_models/weights/layers.13.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..313f047a7db61ca9b3fed45b948aad24958ec896
--- /dev/null
+++ b/triton_models/weights/layers.13.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e86a948027461837c94daa03c444ddaa2a484bdadcab47a89f78d0d332ba0370
+size 16
diff --git a/triton_models/weights/layers.14.attention.w_qkv.0.qweight b/triton_models/weights/layers.14.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d34a88071016d52838a914b177b787d6b7f5e989
--- /dev/null
+++ b/triton_models/weights/layers.14.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd65317b8701a195eabe835058a9366309ad055eebd4354fe994187573dcfcb4
+size 12582912
diff --git a/triton_models/weights/layers.14.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.14.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..dbf55a9dd11b2bb29fb5f7a2ec180b89f6372195
--- /dev/null
+++ b/triton_models/weights/layers.14.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a8b7af909bb0ee02940f92c80cde0a7a869e60bd4778c7eb5934ed7134b1e56
+size 786432
diff --git a/triton_models/weights/layers.14.attention.wo.0.qweight b/triton_models/weights/layers.14.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f2e7385fd3b0a6c38260980964dfd035abe25f95
--- /dev/null
+++ b/triton_models/weights/layers.14.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f17aa0c464ae8e87100f9946574744e554c50847775d5e3cc888584c920b51bf
+size 8388608
diff --git a/triton_models/weights/layers.14.attention.wo.0.scales_zeros b/triton_models/weights/layers.14.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cca81645ed7af2fd8f2039c751f0856ab6332929
--- /dev/null
+++ b/triton_models/weights/layers.14.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac63fb5629b386babfc0cf09324e8388735c894def38688f57e5fa413a76a6b6
+size 524288
diff --git a/triton_models/weights/layers.14.attention_norm.weight b/triton_models/weights/layers.14.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a2e5c82b9d622524d9390c76957ed9e8994aa2b8
--- /dev/null
+++ b/triton_models/weights/layers.14.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d54e43cc40808a7a12fb34802e7e3fa239938943e4f247ea54556f65191e0e
+size 8192
diff --git a/triton_models/weights/layers.14.feed_forward.w13.0.qweight b/triton_models/weights/layers.14.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..efb7ccb2234e6b179d310051c53ba547a39f7b6b
--- /dev/null
+++ b/triton_models/weights/layers.14.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f974af156ac932cd0619e0e86095071dccc8cd0608319df5c1042492b2002e9d
+size 58720256
diff --git a/triton_models/weights/layers.14.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.14.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d916976c94c174148b04db334b907ec77c7d638
--- /dev/null
+++ b/triton_models/weights/layers.14.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5be3c8f04a42c5e0c9de9d00508fbb981849cf188dba80cf6127d8f4b4b712d
+size 3670016
diff --git a/triton_models/weights/layers.14.feed_forward.w2.0.qweight b/triton_models/weights/layers.14.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c926dcac71d930076be55189beacbb36cfb1a777
--- /dev/null
+++ b/triton_models/weights/layers.14.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c33e3534172410d4656b1a244becc400d680dc19664a6fe5d2531f0733b24b1
+size 29360128
diff --git a/triton_models/weights/layers.14.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.14.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..78c574771e660fcfc3a237c9d56afe57b62f1ea0
--- /dev/null
+++ b/triton_models/weights/layers.14.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3be2e077ef369c828ac8f31826249f327d120baaaf9d0141f67b9a814f95a57b
+size 1835008
diff --git a/triton_models/weights/layers.14.ffn_norm.weight b/triton_models/weights/layers.14.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3094bf1d424cd5ba8300cb6dddb32e4bc9d78073
--- /dev/null
+++ b/triton_models/weights/layers.14.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdb3dd1a12abaf094e03a1d933aa4ab506d5c4c0cd21cf0802c04f4a0d5a85c7
+size 8192
diff --git a/triton_models/weights/layers.14.past_kv_scale.0.weight b/triton_models/weights/layers.14.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a1ff0007bbe4e1f0abfdccce67158196a9b3ba13
--- /dev/null
+++ b/triton_models/weights/layers.14.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39dfb751ce93881ea2c4e2f68155583024cfcf9e85b5705781348b079cc29b0d
+size 16
diff --git a/triton_models/weights/layers.15.attention.w_qkv.0.qweight b/triton_models/weights/layers.15.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..8d981e2ef18ba6fa67894151d2e5d33aec76e769
--- /dev/null
+++ b/triton_models/weights/layers.15.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f2d6afe6100ef0eb47d5b379ce3faa38ec1063ba36d47d9526647ea7fa4bda2
+size 12582912
diff --git a/triton_models/weights/layers.15.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.15.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..92d62c8db383b4e459224b1370a1d87eaa416096
--- /dev/null
+++ b/triton_models/weights/layers.15.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8abb8c1bad2acba915885821b231c1884cd63fd978d62d23a25775671c97f9b
+size 786432
diff --git a/triton_models/weights/layers.15.attention.wo.0.qweight b/triton_models/weights/layers.15.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..43781b59b7834c4758226fadd3757cd458eb9001
--- /dev/null
+++ b/triton_models/weights/layers.15.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fca2dec7e83b35a6b582edfc05ddf49890b234aeba53a3d88384a436cc96c4c1
+size 8388608
diff --git a/triton_models/weights/layers.15.attention.wo.0.scales_zeros b/triton_models/weights/layers.15.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..51a58827bb1c84c5a11deab1134c99e4cd37f472
--- /dev/null
+++ b/triton_models/weights/layers.15.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83bb55b56df6d0d2c1f6f04d894e5d6e63d476b8fffe1dd0441a892eed850502
+size 524288
diff --git a/triton_models/weights/layers.15.attention_norm.weight b/triton_models/weights/layers.15.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..7e895dc7fffaa82cf585391595f009adf667e4cd
--- /dev/null
+++ b/triton_models/weights/layers.15.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06c4e4b6e08466593216c5fffe5bb16fbe296be7d83b8d67084a728b4f0d26d0
+size 8192
diff --git a/triton_models/weights/layers.15.feed_forward.w13.0.qweight b/triton_models/weights/layers.15.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..8dfc85e4b6b9e369447163acf76550539913fb5a
--- /dev/null
+++ b/triton_models/weights/layers.15.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b271e071ebc5f1e37284433f76d394ee2ba20920d64e64355f6c37672bd68f3
+size 58720256
diff --git a/triton_models/weights/layers.15.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.15.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c0f10138fba546a8c454600fd6a73289e0a7f8fd
--- /dev/null
+++ b/triton_models/weights/layers.15.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b42f1cdd3b5b76e04cd4154950ade000eff8bfc44853c827ff351d00526201bc
+size 3670016
diff --git a/triton_models/weights/layers.15.feed_forward.w2.0.qweight b/triton_models/weights/layers.15.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e0d0b67b1d9d4d9530690ac220e426dedaddb1fc
--- /dev/null
+++ b/triton_models/weights/layers.15.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c44d9731ffc2bbd8a368f60064a8e8e85f50b04677d059c25fce70aae38dc81
+size 29360128
diff --git a/triton_models/weights/layers.15.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.15.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a99be30bc9c12257d3764ef09722a06f15ef0437
--- /dev/null
+++ b/triton_models/weights/layers.15.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:287e909a7bd9bcc0b456c57c361a614c1898383785bccf9f57eee7f91599e3b3
+size 1835008
diff --git a/triton_models/weights/layers.15.ffn_norm.weight b/triton_models/weights/layers.15.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..630c4372de835971e521542c84649a00c3b2e403
--- /dev/null
+++ b/triton_models/weights/layers.15.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8dafc8ea6132b5caec667dde3f6dda741e7ff23e40b8ff5f5ccc59232ca434b
+size 8192
diff --git a/triton_models/weights/layers.15.past_kv_scale.0.weight b/triton_models/weights/layers.15.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a47b7192fa2a190ceb02a526a527aed679e93740
--- /dev/null
+++ b/triton_models/weights/layers.15.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c945e5779fcddbf5dff47a4c3502bce9ba0bace5158abc583e852d1418f9513a
+size 16
diff --git a/triton_models/weights/layers.16.attention.w_qkv.0.qweight b/triton_models/weights/layers.16.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b17d911138bd69b5faa2b303479e7cca9c12b659
--- /dev/null
+++ b/triton_models/weights/layers.16.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf8c2d841b0c3dfd0a4349bb4aa84c0d85141c14277e879c033484e225096715
+size 12582912
diff --git a/triton_models/weights/layers.16.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.16.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..bd4333af13bff4ad87c753e24461be8ab19102ab
--- /dev/null
+++ b/triton_models/weights/layers.16.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a54b05a6ce8083736ca7db382672bb83d215649338920308cf0edd2e4f1ae07
+size 786432
diff --git a/triton_models/weights/layers.16.attention.wo.0.qweight b/triton_models/weights/layers.16.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e09e8104c2418067fc961e4fa84dc074da5eaa81
--- /dev/null
+++ b/triton_models/weights/layers.16.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b8f9b5eb6ea1827048eb48661af27f66fbf5f510055f7dfc813f28f79967c83
+size 8388608
diff --git a/triton_models/weights/layers.16.attention.wo.0.scales_zeros b/triton_models/weights/layers.16.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a056f4943ce26b8bb7e3c8d3d052feb2f324a4d8
--- /dev/null
+++ b/triton_models/weights/layers.16.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3031c7a07ae7554fdc02af0112aaf4f343c164f1da7e65ac0926e0b33ec1daf
+size 524288
diff --git a/triton_models/weights/layers.16.attention_norm.weight b/triton_models/weights/layers.16.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..710904f88b607829b98f69d31a704b5ccb2180d3
--- /dev/null
+++ b/triton_models/weights/layers.16.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0996c709a45131cb25cd72865a06e38920f31941b25f83f2d78ed5751645c284
+size 8192
diff --git a/triton_models/weights/layers.16.feed_forward.w13.0.qweight b/triton_models/weights/layers.16.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..ea56d48779234f87b2b0a859e2cb110d0718e2b9
--- /dev/null
+++ b/triton_models/weights/layers.16.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50fe105dfc87e7a2f06e12b9d1d92899b4b20106d29198eb7f8156c888b57620
+size 58720256
diff --git a/triton_models/weights/layers.16.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.16.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..5773631e90c5be54da0f5ca15e355b6bf855b4e3
--- /dev/null
+++ b/triton_models/weights/layers.16.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8081c981a8cc02210f42ffa6b41e8f8a018cc273f18dd184e7a76ea6a14af908
+size 3670016
diff --git a/triton_models/weights/layers.16.feed_forward.w2.0.qweight b/triton_models/weights/layers.16.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..5a19b7dd919248c1d8f24d12508ffb36be409a0b
--- /dev/null
+++ b/triton_models/weights/layers.16.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b58ad7e7bd4aaf5109590b6f4b500643cea2e5ee7ecf3de2f2bafd931fecbba
+size 29360128
diff --git a/triton_models/weights/layers.16.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.16.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..17e81af1aaa097a81bf4407a23e87dfb0810ba73
--- /dev/null
+++ b/triton_models/weights/layers.16.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05659661021dfb93c23ca810756fba0afa33f7dc7103bb74e79a5b5cee0630c2
+size 1835008
diff --git a/triton_models/weights/layers.16.ffn_norm.weight b/triton_models/weights/layers.16.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..f45d501c72951cd1746375922f7e113162bef097
--- /dev/null
+++ b/triton_models/weights/layers.16.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:990398b91f28bd4d0ea10d21a8f911746291d93d353659c273a0d263f3f8b26f
+size 8192
diff --git a/triton_models/weights/layers.16.past_kv_scale.0.weight b/triton_models/weights/layers.16.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..cc7a02ca2638e540d970eba9c8c2ca40c599f58e
--- /dev/null
+++ b/triton_models/weights/layers.16.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a46e5538c6531808ab35a4aa3f8acc92997393bf5778110738282e7d0b5a6253
+size 16
diff --git a/triton_models/weights/layers.17.attention.w_qkv.0.qweight b/triton_models/weights/layers.17.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b7d289a0a181f768648b3388209609a158c0d194
--- /dev/null
+++ b/triton_models/weights/layers.17.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a874ceb40f2cd87b1fbadffe4f336e766e4632d1486bae80a524aca3884a760
+size 12582912
diff --git a/triton_models/weights/layers.17.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.17.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..02676e7729a5ae2a782c7397622f5661a55ae306
--- /dev/null
+++ b/triton_models/weights/layers.17.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3e383f96fe0c11172a8eb7c833e16437243ddf5083fe742f2f5267c606bf46f
+size 786432
diff --git a/triton_models/weights/layers.17.attention.wo.0.qweight b/triton_models/weights/layers.17.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f5d248ed5bb53bc83690b851c4850179affe3a1e
--- /dev/null
+++ b/triton_models/weights/layers.17.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ba47e294f57c2391d17559990d81c10b3febf1ac79cdaf9646ea4b5b1efe9ae
+size 8388608
diff --git a/triton_models/weights/layers.17.attention.wo.0.scales_zeros b/triton_models/weights/layers.17.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cec2b0826f0458f462a1f155b2420afe3cade230
--- /dev/null
+++ b/triton_models/weights/layers.17.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19655fc3273537cb5a737021f0914fcaba9f520ae85a241b6943a1e375859c5a
+size 524288
diff --git a/triton_models/weights/layers.17.attention_norm.weight b/triton_models/weights/layers.17.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..493203ace8591c626f3ddd92a1d30a132fb91f7c
--- /dev/null
+++ b/triton_models/weights/layers.17.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f60382d336b8fe223742bf477d6e1d6b03a426c1397370821017d77560828a40
+size 8192
diff --git a/triton_models/weights/layers.17.feed_forward.w13.0.qweight b/triton_models/weights/layers.17.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fada103f386b9576504b44aad9effb7227b81161
--- /dev/null
+++ b/triton_models/weights/layers.17.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6347e704f461d7d6ee0ae21b790cdd6180debf826b736f1862a27bc9ced0045
+size 58720256
diff --git a/triton_models/weights/layers.17.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.17.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e34de3f6584cca7245e62f91730286274c18de9f
--- /dev/null
+++ b/triton_models/weights/layers.17.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13d6a83305e5bb3038ce5829693b70573fbcbfd18ef9251f42334a92a864f2f2
+size 3670016
diff --git a/triton_models/weights/layers.17.feed_forward.w2.0.qweight b/triton_models/weights/layers.17.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..62706b91c086f1c95651471ed13767ce01618e08
--- /dev/null
+++ b/triton_models/weights/layers.17.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62bbff754141a2d1cf72617d73f2522333bb2694a88e8a5b37c1aca6b22b17a0
+size 29360128
diff --git a/triton_models/weights/layers.17.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.17.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7d16b3f60264de0aab7805c342d890386aa3c7ec
--- /dev/null
+++ b/triton_models/weights/layers.17.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2aced42506d0f633676edf55b7de564b795eb6de86d8c0f6c0f1d1301233312
+size 1835008
diff --git a/triton_models/weights/layers.17.ffn_norm.weight b/triton_models/weights/layers.17.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..2115ea8bcc2774631a370c71a768d54242473864
--- /dev/null
+++ b/triton_models/weights/layers.17.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7866c4443b210b814e1bcca660a34c2b78f21172253d2c53300be2c3e3d44fc
+size 8192
diff --git a/triton_models/weights/layers.17.past_kv_scale.0.weight b/triton_models/weights/layers.17.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..945eb96703d8de2eef6085a642b1a27de7fb8cba
--- /dev/null
+++ b/triton_models/weights/layers.17.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8029ca34c285ba5e30b011338457cb6e1aa2bde375aa5bddeb10d5f735b827aa
+size 16
diff --git a/triton_models/weights/layers.18.attention.w_qkv.0.qweight b/triton_models/weights/layers.18.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c8f8e2fdabca3f7c34468465c2a769b83df35ce8
--- /dev/null
+++ b/triton_models/weights/layers.18.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:802bfc3126429a1c8f50bb8bc82a62b62b5e4fac66b2e5201d5ca3dadc76b2b0
+size 12582912
diff --git a/triton_models/weights/layers.18.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.18.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..15b491c33507c9aa77edc43db2d844a6f497fca7
--- /dev/null
+++ b/triton_models/weights/layers.18.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5b1e35a7c3f4353a260afd771398ed0e6f3fb0cfe2c9e57c9c6aa837187477b
+size 786432
diff --git a/triton_models/weights/layers.18.attention.wo.0.qweight b/triton_models/weights/layers.18.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fda05fdf95a8e38dbba3ae8e857729fde60e6d1b
--- /dev/null
+++ b/triton_models/weights/layers.18.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d5e9b4b8ac11947e865c95a0ee01bea2b98bb4d8e186bc655980c0819220337
+size 8388608
diff --git a/triton_models/weights/layers.18.attention.wo.0.scales_zeros b/triton_models/weights/layers.18.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..56d79eb2481c7040c86fa26964ede1eeae1395e4
--- /dev/null
+++ b/triton_models/weights/layers.18.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb7cefb270cbf64d8347c25b5d776be71d432c570ac277fc6dcb8160f358040
+size 524288
diff --git a/triton_models/weights/layers.18.attention_norm.weight b/triton_models/weights/layers.18.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3c20c25a40ad141d017b4cce8700f88ca3d8efca
--- /dev/null
+++ b/triton_models/weights/layers.18.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dac1fd7000d40fa00eb19ec7e140c8fd08a7e2fba5ac80c0f15abf00fd9048e
+size 8192
diff --git a/triton_models/weights/layers.18.feed_forward.w13.0.qweight b/triton_models/weights/layers.18.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3c1d6af45afa49731996db41ef7d18503411125c
--- /dev/null
+++ b/triton_models/weights/layers.18.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23dee44b6cb77a166863b69487459d9de5dfd4c3989306919d4c35dc20c884be
+size 58720256
diff --git a/triton_models/weights/layers.18.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.18.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..54489f50388ea9154fce92dbadd4bf6a1a861f86
--- /dev/null
+++ b/triton_models/weights/layers.18.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10a6c1e2ca46dac304c89690e837221b7cd15133dc1e7ccfb18f69187af51208
+size 3670016
diff --git a/triton_models/weights/layers.18.feed_forward.w2.0.qweight b/triton_models/weights/layers.18.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e90ed3787e1ac9da6ffed10588e004c09bf3b9b1
--- /dev/null
+++ b/triton_models/weights/layers.18.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a35d9d5c12d752b160f51f53a49e9a763662605165cb85272e539b60a9f92055
+size 29360128
diff --git a/triton_models/weights/layers.18.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.18.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..17951129ba756efbad134062196862ef2b290c05
--- /dev/null
+++ b/triton_models/weights/layers.18.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:845ca7749cf6829cc274de80528f41dbd289d125720a4f68417677871dd528c9
+size 1835008
diff --git a/triton_models/weights/layers.18.ffn_norm.weight b/triton_models/weights/layers.18.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3fdc07d36718c6a4fb843c7a0e547971f25bbe50
--- /dev/null
+++ b/triton_models/weights/layers.18.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:885808cbeec44e76e545008343da6029dce51d48908c85d61f4e3e5734a316a7
+size 8192
diff --git a/triton_models/weights/layers.18.past_kv_scale.0.weight b/triton_models/weights/layers.18.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..4b8d6bdb257005f9da0843e14b064394e5e12366
--- /dev/null
+++ b/triton_models/weights/layers.18.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da3eda4da09ebaeb73ef447011ce0b9ef2ee982ab26d8d0408ad482f9b2b389e
+size 16
diff --git a/triton_models/weights/layers.19.attention.w_qkv.0.qweight b/triton_models/weights/layers.19.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f58ac78fbf8480c4a875a904f3eca7296b9d1dc7
--- /dev/null
+++ b/triton_models/weights/layers.19.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a697cc9e5c643856df75e5d40a4ddc810ad41c0ab9362ad6c7745862c000ccf
+size 12582912
diff --git a/triton_models/weights/layers.19.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.19.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ff2f26342ca1663ff6c89e5015b02b41e976f9a9
--- /dev/null
+++ b/triton_models/weights/layers.19.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5deb01a923b8c70c8adaa62c3b6128231899cb7c185908822279725696d1c819
+size 786432
diff --git a/triton_models/weights/layers.19.attention.wo.0.qweight b/triton_models/weights/layers.19.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f444fcc2661a285f914957b05cedde19a4954ace
--- /dev/null
+++ b/triton_models/weights/layers.19.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:682754ebee51648ef7b0249fee7289fdf825e61916f97ec62087c8e39e9c14bb
+size 8388608
diff --git a/triton_models/weights/layers.19.attention.wo.0.scales_zeros b/triton_models/weights/layers.19.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..41cb9a3fa2554343948079acebcb10fa2a940517
--- /dev/null
+++ b/triton_models/weights/layers.19.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6d4a938a39924f222f02b460355a83ffb98a00ff19d05048c3bcb82c9e57edc
+size 524288
diff --git a/triton_models/weights/layers.19.attention_norm.weight b/triton_models/weights/layers.19.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..5acd5f2587a22bc1a1e2870e9b4af8ea1eaeb505
--- /dev/null
+++ b/triton_models/weights/layers.19.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63d26f2643a9aceebf2af38dbc611dc36da45a176257e478e62f85ddbc559f55
+size 8192
diff --git a/triton_models/weights/layers.19.feed_forward.w13.0.qweight b/triton_models/weights/layers.19.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..cc8dd8ef920737fc2e432adac1ce42303e7d7111
--- /dev/null
+++ b/triton_models/weights/layers.19.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a54bcfb108f050cf4a7c7cb37114ceb35476b3f8bb6cf6c541e8df014fbf6133
+size 58720256
diff --git a/triton_models/weights/layers.19.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.19.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c378e9b9bed297468e52701cb4eea8586e317e8f
--- /dev/null
+++ b/triton_models/weights/layers.19.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11cb4b7bd0b53f894236952f72793d3d4e647e6d07fc37e1112b0c5ba392176c
+size 3670016
diff --git a/triton_models/weights/layers.19.feed_forward.w2.0.qweight b/triton_models/weights/layers.19.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..131386a17e034a3ba0ce59be9c0351b35dfc20e1
--- /dev/null
+++ b/triton_models/weights/layers.19.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f32b6e7bb6005ba215aa938a0b52300230f7008150b45a11916829314ef3494
+size 29360128
diff --git a/triton_models/weights/layers.19.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.19.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..af5383b2c8c39d1c54f5dea9298ea08f5cbe267b
--- /dev/null
+++ b/triton_models/weights/layers.19.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84f83448a65d6bf12e5484bdf2805b2648a5ee6c0f71f592f1399a71f787a365
+size 1835008
diff --git a/triton_models/weights/layers.19.ffn_norm.weight b/triton_models/weights/layers.19.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6f5513a9af9eec5fbc82dd527339fb220156deb0
--- /dev/null
+++ b/triton_models/weights/layers.19.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7e2f003c72088419d2608b060a98ab42356eeffed53510f1d468f4ccd3f1141
+size 8192
diff --git a/triton_models/weights/layers.19.past_kv_scale.0.weight b/triton_models/weights/layers.19.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..fd5be00138be7b2df59bf0b592a9bef86dc82eb8
--- /dev/null
+++ b/triton_models/weights/layers.19.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c71b33b311eb0e23a8b2494a543ba1181fd72314b49cf78a9749b9cf4a00df4
+size 16
diff --git a/triton_models/weights/layers.2.attention.w_qkv.0.qweight b/triton_models/weights/layers.2.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2d9c45e71e2c0ab82208f4202b06c9b97f6ba148
--- /dev/null
+++ b/triton_models/weights/layers.2.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fa15c6683fb8dd4f6a17b49bb0a989e462a984b2b1a62741c0261b0205e4d3a
+size 12582912
diff --git a/triton_models/weights/layers.2.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.2.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cf230e2e4ec022b7dadc04504edd265c2736423a
--- /dev/null
+++ b/triton_models/weights/layers.2.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d46a56b3063ca3e890569f20f0f9554bd4b8b3dce4dd28c6de2a2c8b018de692
+size 786432
diff --git a/triton_models/weights/layers.2.attention.wo.0.qweight b/triton_models/weights/layers.2.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2ec2d68e756cc1afd558415a1c748d3366f51240
--- /dev/null
+++ b/triton_models/weights/layers.2.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:745bd18832a4be0427eecf06fbd16e5b4d9045d9bae02a538648bf061f1bcd31
+size 8388608
diff --git a/triton_models/weights/layers.2.attention.wo.0.scales_zeros b/triton_models/weights/layers.2.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..d1e959a3fa4ef4072ae44bb537bc108a99c3799e
--- /dev/null
+++ b/triton_models/weights/layers.2.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f165998aa89a2e93b82203e08444995edcdc00ed2dd2b3dc3171ed8c4aef68f
+size 524288
diff --git a/triton_models/weights/layers.2.attention_norm.weight b/triton_models/weights/layers.2.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..775cfb53b3214e57d496df775c7f2e98df37a237
--- /dev/null
+++ b/triton_models/weights/layers.2.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35db76352c3fef9616c14aefa7c0b05850df54a54e3e6c922df8876639c7048e
+size 8192
diff --git a/triton_models/weights/layers.2.feed_forward.w13.0.qweight b/triton_models/weights/layers.2.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..1b19b3f633c84fa1134ae29f0bf9f119d9b25d42
--- /dev/null
+++ b/triton_models/weights/layers.2.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5d14e61c9cc1a1874bbf7c1db7fb04e8b97f8d49e011bf0b5c2003a072083cf
+size 58720256
diff --git a/triton_models/weights/layers.2.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.2.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e293bf94f00d2acb588e4a05e8b36c07adfd4cfe
--- /dev/null
+++ b/triton_models/weights/layers.2.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a79b8fb1590037f3bcbe91f25dbcb82b2b91fe0a109dca31de0493a089fcdd
+size 3670016
diff --git a/triton_models/weights/layers.2.feed_forward.w2.0.qweight b/triton_models/weights/layers.2.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c43fcc94e533822deff81b234c66897d23c2a5aa
--- /dev/null
+++ b/triton_models/weights/layers.2.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cbde66d92d3be35621cdb2171a2b9e5ab5448d229f07d7da65d25553adcce029
+size 29360128
diff --git a/triton_models/weights/layers.2.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.2.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c5beb7d2b7d8320386a5105a4a2618ceec4e4943
--- /dev/null
+++ b/triton_models/weights/layers.2.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41bfc952713a7fd5409f909e9ab107d9ef734e730f7b00d97fc34ef24395e62e
+size 1835008
diff --git a/triton_models/weights/layers.2.ffn_norm.weight b/triton_models/weights/layers.2.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..45e884fea486483f4689411e2b0f5841bb3e6317
--- /dev/null
+++ b/triton_models/weights/layers.2.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f16599930e314f9a8ef2b760cc6773e75961152d32432b5fc3e411955dbdc227
+size 8192
diff --git a/triton_models/weights/layers.2.past_kv_scale.0.weight b/triton_models/weights/layers.2.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..70e74bf48eaad9dd65823e3d66a8d46c4452b13d
--- /dev/null
+++ b/triton_models/weights/layers.2.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7808c14f00dcb7b2b77edadc8852138f46802e013a3025e161a669adde20339
+size 16
diff --git a/triton_models/weights/layers.20.attention.w_qkv.0.qweight b/triton_models/weights/layers.20.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6053a83955560e1c2a84e72515c7672d70304835
--- /dev/null
+++ b/triton_models/weights/layers.20.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45521551eeea8b702589fe7c6b19749333abf647f53f56713807dc38f58041ec
+size 12582912
diff --git a/triton_models/weights/layers.20.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.20.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..0e188dc213c48bf55e4b2001a68e495c895187a7
--- /dev/null
+++ b/triton_models/weights/layers.20.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7d9740714493408c67acb934d26406c11421ab7efdabd743bd990103a90f701
+size 786432
diff --git a/triton_models/weights/layers.20.attention.wo.0.qweight b/triton_models/weights/layers.20.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..730a6aa484d4286f408baf8abf88ea73e0b5aa02
--- /dev/null
+++ b/triton_models/weights/layers.20.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55586decc011d181feef941588d73d75de2ec8040bce7db734699a33a7bd6f42
+size 8388608
diff --git a/triton_models/weights/layers.20.attention.wo.0.scales_zeros b/triton_models/weights/layers.20.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..affb6ab65788c985dc6ccf43d5cb3fcc8f4e91f6
--- /dev/null
+++ b/triton_models/weights/layers.20.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3dff92bdb0d4bd34ecf08c0c024d9aabfeb9dc6407b55b55d25835922bddb9c
+size 524288
diff --git a/triton_models/weights/layers.20.attention_norm.weight b/triton_models/weights/layers.20.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a4b06c9551477c77ebc9de6151cd219a9c13f63c
--- /dev/null
+++ b/triton_models/weights/layers.20.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dfd453a8ca7eaa0368df85c67b0c4520d044c50e21e3e9c642016e56425fe2c
+size 8192
diff --git a/triton_models/weights/layers.20.feed_forward.w13.0.qweight b/triton_models/weights/layers.20.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e0aa342e545feda824e44af8745b7bf6714e3672
--- /dev/null
+++ b/triton_models/weights/layers.20.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a12408ddaac163c3473e187a838044bf3c05b1a72758d6b77338da700a74f845
+size 58720256
diff --git a/triton_models/weights/layers.20.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.20.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..595f2605064e623b1acbbbb39aad1abe47d2b5fe
--- /dev/null
+++ b/triton_models/weights/layers.20.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a20c9c4a6621e851abb268c647e4f9459277dc53bc5f64a0504562c9e7736b61
+size 3670016
diff --git a/triton_models/weights/layers.20.feed_forward.w2.0.qweight b/triton_models/weights/layers.20.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3881b21e76f4c55a6f5a94d56794ece1d12912e8
--- /dev/null
+++ b/triton_models/weights/layers.20.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e13a13177f50e58cd454dfef4083e8b8da065d25bd277aeabcbbd65d9c7ee2db
+size 29360128
diff --git a/triton_models/weights/layers.20.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.20.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f0c038b596c5143988722e1d044fdba36b9f4c53
--- /dev/null
+++ b/triton_models/weights/layers.20.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2bb55062eaf5f412bae85c9ac428ddc2e0e59d0e53ebd21abb1228cf4d1ea3c
+size 1835008
diff --git a/triton_models/weights/layers.20.ffn_norm.weight b/triton_models/weights/layers.20.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3cfe4cc50ce587ea9b564a20130b4fe2225d7d52
--- /dev/null
+++ b/triton_models/weights/layers.20.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37c809eef52d6f683a42650531b04e14b95934556c2f3607466882fff2c7a049
+size 8192
diff --git a/triton_models/weights/layers.20.past_kv_scale.0.weight b/triton_models/weights/layers.20.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3fe9d60389494bd97b6721514bbf76a4a2f4aeea
--- /dev/null
+++ b/triton_models/weights/layers.20.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97801b00a17ab91f1019edf80b667e915c772df1461e322cb8602d8bd831a8b1
+size 16
diff --git a/triton_models/weights/layers.21.attention.w_qkv.0.qweight b/triton_models/weights/layers.21.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..905d5eb82f1967282905cf3974e526f1e48e2b90
--- /dev/null
+++ b/triton_models/weights/layers.21.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2792bae2516c6d5167b1efdd66141ddc18439be883865eee923aa0d64f3501f7
+size 12582912
diff --git a/triton_models/weights/layers.21.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.21.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9a1f6b2beb40845a92a60a5b1ea44afefad5446c
--- /dev/null
+++ b/triton_models/weights/layers.21.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:953b7c49b7ba4bab3b5ab552b697d5be9184144ec4f8f6ea9815a0e12420a4c6
+size 786432
diff --git a/triton_models/weights/layers.21.attention.wo.0.qweight b/triton_models/weights/layers.21.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fbd8d63b76ae1f3a0394dfd4c09e724627ce656a
--- /dev/null
+++ b/triton_models/weights/layers.21.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f719914491c7941474c1b6efa5a79541ade54eff71a6d65a28dcff17baeacd89
+size 8388608
diff --git a/triton_models/weights/layers.21.attention.wo.0.scales_zeros b/triton_models/weights/layers.21.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3199f31825d84cf98169a9ac8361fd01195c513a
--- /dev/null
+++ b/triton_models/weights/layers.21.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21e70d0275306b0d766b533780955602dc9d5163028c509745120b4e9dd070d1
+size 524288
diff --git a/triton_models/weights/layers.21.attention_norm.weight b/triton_models/weights/layers.21.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..ace9b471c09970005b6d8dcb34406ac8671f3340
--- /dev/null
+++ b/triton_models/weights/layers.21.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f5b37279d734e53f01e524b941104c4a2a0794819cb443255e46130190eb060
+size 8192
diff --git a/triton_models/weights/layers.21.feed_forward.w13.0.qweight b/triton_models/weights/layers.21.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..93ad736f2b44139c784864069aece4a59db96543
--- /dev/null
+++ b/triton_models/weights/layers.21.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7298a7ea1a9a2f16bfcca14510dce8da6342ceaccf48354e63945a00c86a8887
+size 58720256
diff --git a/triton_models/weights/layers.21.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.21.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a7e502a74af20d234730806f84f0ee0fbec81a3d
--- /dev/null
+++ b/triton_models/weights/layers.21.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90e896e7361f2fde100ee9cbf4591ba2509c11ad2e06ff9150614c28f39f6cc7
+size 3670016
diff --git a/triton_models/weights/layers.21.feed_forward.w2.0.qweight b/triton_models/weights/layers.21.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e129776d2c3518130aa1688eefa5ce1d57e1f1cb
--- /dev/null
+++ b/triton_models/weights/layers.21.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0415c4da6fb2feb289a75e84a73c525272f0098ee5c14faf5544454178576f62
+size 29360128
diff --git a/triton_models/weights/layers.21.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.21.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..60435a424658f628b48358ed84954acb2782b727
--- /dev/null
+++ b/triton_models/weights/layers.21.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ff5c969303a6b351d8bb80064aad2c92e8c5c32d85bff840317ca0739ced463
+size 1835008
diff --git a/triton_models/weights/layers.21.ffn_norm.weight b/triton_models/weights/layers.21.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6655336998857a70516ff902b71f61175fd1a6c3
--- /dev/null
+++ b/triton_models/weights/layers.21.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8042770bf17c4b7520332fdeeef3decf2eb77871e6d80a2fcfe79e850827faae
+size 8192
diff --git a/triton_models/weights/layers.21.past_kv_scale.0.weight b/triton_models/weights/layers.21.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..68bb063c7fe76ee11dc858fe2552eff20f89fc06
--- /dev/null
+++ b/triton_models/weights/layers.21.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:babef4e3b7889042e89f865f3c8bb53f6191e2c9329e3eb418e0627256b4bbf7
+size 16
diff --git a/triton_models/weights/layers.22.attention.w_qkv.0.qweight b/triton_models/weights/layers.22.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..26e5e328af67eb6995b4eccd4f3f47e2a5572bbb
--- /dev/null
+++ b/triton_models/weights/layers.22.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3845fa57cee6ae1adc7c640c17820f11d196a86138e3ab1b26d1fcdb5a12d480
+size 12582912
diff --git a/triton_models/weights/layers.22.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.22.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..25e896649de6e4eebef3fb52b4695e66834ea627
--- /dev/null
+++ b/triton_models/weights/layers.22.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60a8fb6d26d3741fbf2dbd24d9e96a689ce0d8311349bc7b7d487a94ffae7309
+size 786432
diff --git a/triton_models/weights/layers.22.attention.wo.0.qweight b/triton_models/weights/layers.22.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..30d513ba9872686a172b2e5bb54d7dc19c89b18b
--- /dev/null
+++ b/triton_models/weights/layers.22.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e8c0a44652ccfbbb876d6c56c552653b788b14188b48f41b957d17036111f93
+size 8388608
diff --git a/triton_models/weights/layers.22.attention.wo.0.scales_zeros b/triton_models/weights/layers.22.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..63489132ff37547f3c5a7082e39f7d6e60d99e2f
--- /dev/null
+++ b/triton_models/weights/layers.22.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cf24c066812a6a36df8eec192b40520df7d10573d5a2bfd2327ddaecf6e938a
+size 524288
diff --git a/triton_models/weights/layers.22.attention_norm.weight b/triton_models/weights/layers.22.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..67e9beee3472ac10efd53bef75c3678f86f0287a
--- /dev/null
+++ b/triton_models/weights/layers.22.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87603494aa61475dfc747464841436f303bcf654dc27b1a07564f53558ebc0e8
+size 8192
diff --git a/triton_models/weights/layers.22.feed_forward.w13.0.qweight b/triton_models/weights/layers.22.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a6f81f752873c957d60d333f567fcf45dc101888
--- /dev/null
+++ b/triton_models/weights/layers.22.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37604a1d32f8001155e15ab4e13282b050da543ad0d0a25b759081246fdbdb15
+size 58720256
diff --git a/triton_models/weights/layers.22.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.22.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7fc132bdca2ee4128bec7e863686fdca2f7aebf4
--- /dev/null
+++ b/triton_models/weights/layers.22.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06d1aced0b15076b9f26d4ea4f4f6b732368d7b373e7a588635da39cb9db5f39
+size 3670016
diff --git a/triton_models/weights/layers.22.feed_forward.w2.0.qweight b/triton_models/weights/layers.22.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2679586d03d73f48a045c13e8c8b19ad6eaa9b50
--- /dev/null
+++ b/triton_models/weights/layers.22.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15b2a9ac0ae91a96deefa360ba92e79339705410d925b2356b9815692ea31061
+size 29360128
diff --git a/triton_models/weights/layers.22.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.22.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..7216f3454da54e1117fd4e92befe84b4c8b46a1a
--- /dev/null
+++ b/triton_models/weights/layers.22.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a99b63ab8c94e4d8f81bc8cab1561f47e3c2bac9f6e13f0b23d9438e02d7d1e
+size 1835008
diff --git a/triton_models/weights/layers.22.ffn_norm.weight b/triton_models/weights/layers.22.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..4d71b5ceacf9dcc9afaaf1adf8978c2911ea951f
--- /dev/null
+++ b/triton_models/weights/layers.22.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:309c8793e4e6d01a426ded64878ab5bb81fc897a4369e2e12e180067d9e2f97f
+size 8192
diff --git a/triton_models/weights/layers.22.past_kv_scale.0.weight b/triton_models/weights/layers.22.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..265569647dc54011c0c7aa312cda60679eddf224
--- /dev/null
+++ b/triton_models/weights/layers.22.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a792b8d14741661477851bbe77b6f5dc4fecf7ce07009fb7d6bd25090b2ad2b
+size 16
diff --git a/triton_models/weights/layers.23.attention.w_qkv.0.qweight b/triton_models/weights/layers.23.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3c4b6c3a2d7fa4c456839afe2c5df63b4801cf29
--- /dev/null
+++ b/triton_models/weights/layers.23.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2a664f7c9133d9a3d3f013ae68b7c826124f0ce8ee3e2a8b7a3d412fc4ce18c
+size 12582912
diff --git a/triton_models/weights/layers.23.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.23.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..6980321a22d78892613c341246abfd4fa6a6ec1b
--- /dev/null
+++ b/triton_models/weights/layers.23.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d1caf7d6d040d5052d79ec08aa4282d486d3fd63e54ce73293b62776d97cc01
+size 786432
diff --git a/triton_models/weights/layers.23.attention.wo.0.qweight b/triton_models/weights/layers.23.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a959f9c51c2010dee1865544214aa31aca8e384b
--- /dev/null
+++ b/triton_models/weights/layers.23.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:019ccc843a3257c4a7b36900f96de821382e2847851af142ae89a9238b434b20
+size 8388608
diff --git a/triton_models/weights/layers.23.attention.wo.0.scales_zeros b/triton_models/weights/layers.23.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..63ad5cf1b74567dc10825bf3797cef1aeaf45b20
--- /dev/null
+++ b/triton_models/weights/layers.23.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80a82f597426b697fe58ed646f41dd9a6f4514d8d93e7f2791fac932dac100ca
+size 524288
diff --git a/triton_models/weights/layers.23.attention_norm.weight b/triton_models/weights/layers.23.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..95ac563b56807e330af49708f5e09a5b5d763971
--- /dev/null
+++ b/triton_models/weights/layers.23.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d621b52a30d8a04c1866972255522c844eebd9f0b57ee2b90fd4f8e5e7ba07a
+size 8192
diff --git a/triton_models/weights/layers.23.feed_forward.w13.0.qweight b/triton_models/weights/layers.23.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..070dac5924104453edc840b81f83c3af7c79534c
--- /dev/null
+++ b/triton_models/weights/layers.23.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e95a18e90a00cd47b6fce45cb8c1eeedb6ec2b8fed6f0cd8de85f36cfd5dedee
+size 58720256
diff --git a/triton_models/weights/layers.23.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.23.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..53c5e980f8815c039d907e5466820c61f9d1076c
--- /dev/null
+++ b/triton_models/weights/layers.23.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae6d90f0468717c0bf1b22ab4914319697011c4ee53f13241c0ca1970acc3331
+size 3670016
diff --git a/triton_models/weights/layers.23.feed_forward.w2.0.qweight b/triton_models/weights/layers.23.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..3dbd1908961ec50661072cfe35a0e65123ee0522
--- /dev/null
+++ b/triton_models/weights/layers.23.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1036d81bd9d055c59bed34241ec3328c1035676dbcd78a0186946147c58af98b
+size 29360128
diff --git a/triton_models/weights/layers.23.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.23.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..377898876f13249c94c85b69c632e4edbf89ca0d
--- /dev/null
+++ b/triton_models/weights/layers.23.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f354eef95b3a2007598e99428488351bc81e825cc08c8a22beea2a74432f0e91
+size 1835008
diff --git a/triton_models/weights/layers.23.ffn_norm.weight b/triton_models/weights/layers.23.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6034309e63a873c266790385d8a50379dff8c851
--- /dev/null
+++ b/triton_models/weights/layers.23.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36a712b30e1f4b920e2bf0e553bf62898650a968b94cb544d4c0cb45dd9724ba
+size 8192
diff --git a/triton_models/weights/layers.23.past_kv_scale.0.weight b/triton_models/weights/layers.23.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..2054dd9b5bac4cc5f3947a6a29b0a00ee9c8f9c6
--- /dev/null
+++ b/triton_models/weights/layers.23.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:362bc48a1da392c1d9c1404743b87e700f048e91e2236c0f23136126cbd17a42
+size 16
diff --git a/triton_models/weights/layers.24.attention.w_qkv.0.qweight b/triton_models/weights/layers.24.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..90ca332aa05b52f6a6c1174451a057235aeec1f3
--- /dev/null
+++ b/triton_models/weights/layers.24.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5cb069457b3e48f9401929077bc5a44b988b7741941ed8157cf23fc0af8fa2
+size 12582912
diff --git a/triton_models/weights/layers.24.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.24.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c424c3a6af59cdb2e6cd3d2acdd6fa6b8585e46b
--- /dev/null
+++ b/triton_models/weights/layers.24.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b47c34802342bd2a02dc98d311924169d7abdc703e43279cffdcf1422243038d
+size 786432
diff --git a/triton_models/weights/layers.24.attention.wo.0.qweight b/triton_models/weights/layers.24.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..183cbc95eb079e344c88e1fa4774f568a66dbbd9
--- /dev/null
+++ b/triton_models/weights/layers.24.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6468f6b524dabe33d4487522c605b92a5c91eaaa9d6b39433dd31588bfd09215
+size 8388608
diff --git a/triton_models/weights/layers.24.attention.wo.0.scales_zeros b/triton_models/weights/layers.24.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..c435ad2044cc72cc87bf58ea590aea7b6e463349
--- /dev/null
+++ b/triton_models/weights/layers.24.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59fa63a2023ffc20a936686267ae08fe6c793889ca330e0fb0a44ab2b5fe8041
+size 524288
diff --git a/triton_models/weights/layers.24.attention_norm.weight b/triton_models/weights/layers.24.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..dccff49fb462091aab55a0c4eb163652123ff7d5
--- /dev/null
+++ b/triton_models/weights/layers.24.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d38dd18c9fe84631f30cb2b7cb92efc25473d4ba1c438a7817690ed3bbaabd8
+size 8192
diff --git a/triton_models/weights/layers.24.feed_forward.w13.0.qweight b/triton_models/weights/layers.24.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f0bea0526b3fe332953eeee191fd4d279f3a8286
--- /dev/null
+++ b/triton_models/weights/layers.24.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db478db4b91a673763d0252f233423fa31c7a562f80cbc6c106931886d56e253
+size 58720256
diff --git a/triton_models/weights/layers.24.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.24.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d233c239c539161b7c5f0b5f890f196d9c544c2
--- /dev/null
+++ b/triton_models/weights/layers.24.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5329cd85fc6390d7fc596abdb5907e3c2576c2fb6fc87d7c0dc2dbae326a826
+size 3670016
diff --git a/triton_models/weights/layers.24.feed_forward.w2.0.qweight b/triton_models/weights/layers.24.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d4c99dfed4f5fd009c04c0693ddd1253dadfb80e
--- /dev/null
+++ b/triton_models/weights/layers.24.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78e4b556d2c58615b1f3bcbfe8780a1217bc0420383b55afbf6767315ca09e66
+size 29360128
diff --git a/triton_models/weights/layers.24.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.24.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d61abbf087e7f17d99482529ceb6649e5f98e4b
--- /dev/null
+++ b/triton_models/weights/layers.24.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9861b1f0dcf30259bc7a9d1c02969f271b805981c696d49b1dcdd939a7ff504b
+size 1835008
diff --git a/triton_models/weights/layers.24.ffn_norm.weight b/triton_models/weights/layers.24.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..a5247850bcab46ee044a136c8ca64f1223e6f1a7
--- /dev/null
+++ b/triton_models/weights/layers.24.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f64ff3faab2a3c58cde1f351d57bef281660b552a9dbb9c0aa49bff00dcd6719
+size 8192
diff --git a/triton_models/weights/layers.24.past_kv_scale.0.weight b/triton_models/weights/layers.24.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..3a9a25a5c3ba55692571909bb40b460b6ed82ade
--- /dev/null
+++ b/triton_models/weights/layers.24.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d2ab419befc2e7b0391b3b7e7bfa13bf728db0d6cba53136aedc0802a4fcc8c
+size 16
diff --git a/triton_models/weights/layers.25.attention.w_qkv.0.qweight b/triton_models/weights/layers.25.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..41c3344f95ab3594af8a3648d644979c8b8a3e84
--- /dev/null
+++ b/triton_models/weights/layers.25.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0971d51d3ac5fa3cb80bf7adb2616878c3921d6810a7b8c312f2c5edfc20ba2b
+size 12582912
diff --git a/triton_models/weights/layers.25.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.25.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..380f67b6fde572f2eecd73076b154bb56c631ceb
--- /dev/null
+++ b/triton_models/weights/layers.25.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd9d2322fc1ac860eeeb0ae4f57b15011ca5728cab0c2de14ad0734c813b1070
+size 786432
diff --git a/triton_models/weights/layers.25.attention.wo.0.qweight b/triton_models/weights/layers.25.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..58a080a5403fbc6975a8c92d3d8890d106c41f32
--- /dev/null
+++ b/triton_models/weights/layers.25.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42757d1b84d12da08d617496b557df5dc43260ad03444559342e57effdeff897
+size 8388608
diff --git a/triton_models/weights/layers.25.attention.wo.0.scales_zeros b/triton_models/weights/layers.25.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a623dfbef7759c22ba42888f23b6af5e7c88703c
--- /dev/null
+++ b/triton_models/weights/layers.25.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc49597aa705026d30a172bcee0421ded59135ee57d2d1a38d511274fd00db51
+size 524288
diff --git a/triton_models/weights/layers.25.attention_norm.weight b/triton_models/weights/layers.25.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e330398be316b3c7d2b4e8091847c876352631d0
--- /dev/null
+++ b/triton_models/weights/layers.25.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f978aa26bb24bbd527a1e949719d548e1c7bf7d30f04b02f0f28d1343053132
+size 8192
diff --git a/triton_models/weights/layers.25.feed_forward.w13.0.qweight b/triton_models/weights/layers.25.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..941b657818aee3d6c553e08ef74566cd98e55321
--- /dev/null
+++ b/triton_models/weights/layers.25.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:063a4b6c0bb854f67986762bafa9651778da009fd725fe723fa47306a99a845f
+size 58720256
diff --git a/triton_models/weights/layers.25.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.25.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4df2b6e64935f05f8ec6ea3db6b9723c6ca0a7bd
--- /dev/null
+++ b/triton_models/weights/layers.25.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4a77dbd2274b6de3cfb89254d1cb2c0af54b304bb9134a280cbe9b620a361a9
+size 3670016
diff --git a/triton_models/weights/layers.25.feed_forward.w2.0.qweight b/triton_models/weights/layers.25.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a2a36f211eb8cebc2e1ce26bbd4bcd9a806cee31
--- /dev/null
+++ b/triton_models/weights/layers.25.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1626e0d17ba4f05b0f1e65537f46ada22bef2d00deb136c30dd6bb481b617d58
+size 29360128
diff --git a/triton_models/weights/layers.25.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.25.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..09e7a5b567087d78bfcd3614b11b21106f5f8f59
--- /dev/null
+++ b/triton_models/weights/layers.25.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d9b0e50a31c6c29d57500a64edf731ea04db50967219bfdcb0853730c574333
+size 1835008
diff --git a/triton_models/weights/layers.25.ffn_norm.weight b/triton_models/weights/layers.25.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..026c4beed926345148e983d57a1eb89a25c4fd1c
--- /dev/null
+++ b/triton_models/weights/layers.25.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0eea4a26418b7a503c71abf443da9d784c2adca6551e4f1b998f94d6145d696
+size 8192
diff --git a/triton_models/weights/layers.25.past_kv_scale.0.weight b/triton_models/weights/layers.25.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..67871afaf8d1df47fbde1f4a65674ded07d4a864
--- /dev/null
+++ b/triton_models/weights/layers.25.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cad249894548c60911d6d65a7d5846938c1e479698b4466d4cc6e03d2444922
+size 16
diff --git a/triton_models/weights/layers.26.attention.w_qkv.0.qweight b/triton_models/weights/layers.26.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..8e3258b77728a5579d15c2a374b61be41a2afa09
--- /dev/null
+++ b/triton_models/weights/layers.26.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3b88ded4b32bf8ff5ab7fa3616ab98f1bfea6fd86f37b729ad69ffe89d33e97
+size 12582912
diff --git a/triton_models/weights/layers.26.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.26.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..cb16882090f73a8651b55899be0c7b66b7d89aef
--- /dev/null
+++ b/triton_models/weights/layers.26.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1303373a67371e1e2f3ed25bc8cd8e559b9503bc5b4fdc37bfaf758cd26acfb3
+size 786432
diff --git a/triton_models/weights/layers.26.attention.wo.0.qweight b/triton_models/weights/layers.26.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f65b33bea38f966cd6cd26980998df21898fad28
--- /dev/null
+++ b/triton_models/weights/layers.26.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da07e11c5ce840df7eaa7de1ddff66356a2995b93b6d1cdefe1d96f6d4eb62a6
+size 8388608
diff --git a/triton_models/weights/layers.26.attention.wo.0.scales_zeros b/triton_models/weights/layers.26.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e34f9fbc1e33e117eb223353e64a0d03c3a1ce09
--- /dev/null
+++ b/triton_models/weights/layers.26.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec446a339a8b88e9d35b0feb0dc82c82f64420cc45aa67b0730bc6fdfeb33b24
+size 524288
diff --git a/triton_models/weights/layers.26.attention_norm.weight b/triton_models/weights/layers.26.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..bd89d7d2bb2a10e4537def6bc6550ddf681db645
--- /dev/null
+++ b/triton_models/weights/layers.26.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:452e37de79706d39a7fddbbd901e8353363bb41bb1178eebb42b0a9aad1998fc
+size 8192
diff --git a/triton_models/weights/layers.26.feed_forward.w13.0.qweight b/triton_models/weights/layers.26.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..ef1f200bdb37b79404804e211dddd09441a90cfb
--- /dev/null
+++ b/triton_models/weights/layers.26.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fac2317afed02f28c9f68eae5e04821f1fea2d7553bd4ce30b68b9a7e896be65
+size 58720256
diff --git a/triton_models/weights/layers.26.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.26.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3613b7754b7de11bd7146b2f99bbb2aabad43346
--- /dev/null
+++ b/triton_models/weights/layers.26.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e892079f260d62e05e5169a508c1b50c3beffc1e568e189b358850a9596863ac
+size 3670016
diff --git a/triton_models/weights/layers.26.feed_forward.w2.0.qweight b/triton_models/weights/layers.26.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..42508b0d05c03cfe54875df80e5848f92e3a2148
--- /dev/null
+++ b/triton_models/weights/layers.26.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b2ab3bee38aee899c1454a69dc424ae61b6d14d67438c307369be02f6460085
+size 29360128
diff --git a/triton_models/weights/layers.26.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.26.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..6078af07ebbfebda87b1016fd58cdcffbb0b4c73
--- /dev/null
+++ b/triton_models/weights/layers.26.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:552933cb4c5ad88c47fcfc8c8982e8a9d6c2bcf4975d0a1ff17f85a0de9a72a0
+size 1835008
diff --git a/triton_models/weights/layers.26.ffn_norm.weight b/triton_models/weights/layers.26.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..659727ca29164c591b4db04c441375c79e981fce
--- /dev/null
+++ b/triton_models/weights/layers.26.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a474d6dce328dea51c94d84fde68d4472d68dbbf19ce347181b5956b98d41847
+size 8192
diff --git a/triton_models/weights/layers.26.past_kv_scale.0.weight b/triton_models/weights/layers.26.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..db316b10f011519fdc39c70e40706bb6499001f4
--- /dev/null
+++ b/triton_models/weights/layers.26.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d995b27407d7307c6a5b4a4fa7f6247eac5d8c1cc62c066c9bd4395d0455a939
+size 16
diff --git a/triton_models/weights/layers.27.attention.w_qkv.0.qweight b/triton_models/weights/layers.27.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2b398a0b63fe43f5bd6467e9001673b60b3d8b76
--- /dev/null
+++ b/triton_models/weights/layers.27.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb11cc9d2229d99f45200d53d2430007eca65a120d988a8ace070a0e3754128
+size 12582912
diff --git a/triton_models/weights/layers.27.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.27.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..36269d2bb210deac5bfb20fc68c3a3c0ba2430d9
--- /dev/null
+++ b/triton_models/weights/layers.27.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b885790c722268908e56129344337198b0c0e4b3bf5e21a7f091d0846a5d30
+size 786432
diff --git a/triton_models/weights/layers.27.attention.wo.0.qweight b/triton_models/weights/layers.27.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..75c54cf768728053f1051c6d1260296c943bc2cd
--- /dev/null
+++ b/triton_models/weights/layers.27.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d46493db19a5dc9a8d01151f769f22f10733969cad257ff2372fe9ef169efdc7
+size 8388608
diff --git a/triton_models/weights/layers.27.attention.wo.0.scales_zeros b/triton_models/weights/layers.27.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..91523912e1e6240ee472d551a8422724c7f9396f
--- /dev/null
+++ b/triton_models/weights/layers.27.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f80605e605d11e0f5a9e470c80c72859f9651f99f3db043b9eab3989fffd647
+size 524288
diff --git a/triton_models/weights/layers.27.attention_norm.weight b/triton_models/weights/layers.27.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..92e464dfb802dd2cde189e137b6e908acaec5c38
--- /dev/null
+++ b/triton_models/weights/layers.27.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b48e7db8fe774bd46f4eecc92ef7f6bde3cb8e3ba66836e6cae00572ea0e14e
+size 8192
diff --git a/triton_models/weights/layers.27.feed_forward.w13.0.qweight b/triton_models/weights/layers.27.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e7392da13e07a3f00396eb1965e2c22daece98a8
--- /dev/null
+++ b/triton_models/weights/layers.27.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a634ce6c3f2743a5e0fa245a0adf32df70a41dc7c969d40b1a3197f0436cdf5
+size 58720256
diff --git a/triton_models/weights/layers.27.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.27.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4fadfc7e45425848c37d17c3f39ffbbb822a8c78
--- /dev/null
+++ b/triton_models/weights/layers.27.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc138f3c7e31e1be2b6e2a57d7d5a2ffab4fa52343122dd272e41ac4bfd9096e
+size 3670016
diff --git a/triton_models/weights/layers.27.feed_forward.w2.0.qweight b/triton_models/weights/layers.27.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..aae88c0abda360c16b47ef75abda1c4077edf25e
--- /dev/null
+++ b/triton_models/weights/layers.27.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9052da467e48c0c4138fd3769e456cb753464bb30a03a4942846a5b3877131f
+size 29360128
diff --git a/triton_models/weights/layers.27.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.27.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3b2fa2b516a8c83d6eed1702e517e005ac19f281
--- /dev/null
+++ b/triton_models/weights/layers.27.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e1f67441bf5d4f5ca51f1f289e07a3c59907d324265741f76ad966bf1755749
+size 1835008
diff --git a/triton_models/weights/layers.27.ffn_norm.weight b/triton_models/weights/layers.27.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..c15c40329868b970cca611aff6e2bbe13d48abf0
--- /dev/null
+++ b/triton_models/weights/layers.27.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fda3309eb353c9341280ab8f2a516011494cba8b769560e91cd0c9d27fc6561
+size 8192
diff --git a/triton_models/weights/layers.27.past_kv_scale.0.weight b/triton_models/weights/layers.27.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..d8710f2aebc08c7c65db4a66ef9daeba362df5ce
--- /dev/null
+++ b/triton_models/weights/layers.27.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2367dba495b15a673a5e8f907f19e98254caa8845195d88897b3ecc36d7c794
+size 16
diff --git a/triton_models/weights/layers.28.attention.w_qkv.0.qweight b/triton_models/weights/layers.28.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..11c1eafa7f15149287cd144977ef8e5a42645397
--- /dev/null
+++ b/triton_models/weights/layers.28.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1f9e7857882c7a56236572f8a03d72222b257c8d9ed6e2efa1d66c6b5e21fb1
+size 12582912
diff --git a/triton_models/weights/layers.28.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.28.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f725cdf5914a0af48485baa5a948fb90c3030913
--- /dev/null
+++ b/triton_models/weights/layers.28.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da00a72b006477cacf5f86157b6206faefb0b9a1945fed4e5f2a2f9fc9846f55
+size 786432
diff --git a/triton_models/weights/layers.28.attention.wo.0.qweight b/triton_models/weights/layers.28.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..294eeaef86a93508f7f8b171fb8a303bcfb5602c
--- /dev/null
+++ b/triton_models/weights/layers.28.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:626eff3b0dc5215c6954f774fc8116aa989824ab9c971a3782d8bce5ad31d0a8
+size 8388608
diff --git a/triton_models/weights/layers.28.attention.wo.0.scales_zeros b/triton_models/weights/layers.28.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..90a1002de820fee0fabb5d5081cde6d434fa08dc
--- /dev/null
+++ b/triton_models/weights/layers.28.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5faf82a3313ab0b53237e677fa72b3b44137a47ab5f26d401a3bf43f5beb1bd8
+size 524288
diff --git a/triton_models/weights/layers.28.attention_norm.weight b/triton_models/weights/layers.28.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..1ec94894ca9c51e452e351065e83a91a22a1d264
--- /dev/null
+++ b/triton_models/weights/layers.28.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac4a8732ba2c28970db1dc7e821bd6c8b0e4de12f8de1b6bc6692840154562a4
+size 8192
diff --git a/triton_models/weights/layers.28.feed_forward.w13.0.qweight b/triton_models/weights/layers.28.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2ad5905fe8ebd68dafedb5c0bbe70d34f3f8c71d
--- /dev/null
+++ b/triton_models/weights/layers.28.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f375cdf0cd1a60d7c9d00319853242606c44be5322598f91dbff37284f0ab67
+size 58720256
diff --git a/triton_models/weights/layers.28.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.28.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f8676ba3b145e257dc1c75c1f9d9dd86413bc37d
--- /dev/null
+++ b/triton_models/weights/layers.28.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f57f5b0745ad5281aa67d83c0da6f1ebc7539dff487ae1345761bf995aedb1c
+size 3670016
diff --git a/triton_models/weights/layers.28.feed_forward.w2.0.qweight b/triton_models/weights/layers.28.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e3532b664b06cd727ceb44f27462084bddb160c3
--- /dev/null
+++ b/triton_models/weights/layers.28.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:393b972c36770d253df01db59d0c889a018a26ec7a18cf1e69617828344e2ed4
+size 29360128
diff --git a/triton_models/weights/layers.28.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.28.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9cba65bef1506cf3787aac95439d21334e5424fa
--- /dev/null
+++ b/triton_models/weights/layers.28.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f4650f45c05fbd9d52eade717d47d32b1127ad57db10133ba490f5af3843551
+size 1835008
diff --git a/triton_models/weights/layers.28.ffn_norm.weight b/triton_models/weights/layers.28.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0a50537a8d1863c6ea2bf1177d91c15f67d42dec
--- /dev/null
+++ b/triton_models/weights/layers.28.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26ab58696d625c79d618dd907bbeefb29dcb441a358411ed99c0f88e8649e74b
+size 8192
diff --git a/triton_models/weights/layers.28.past_kv_scale.0.weight b/triton_models/weights/layers.28.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..335aa2710f889028753142ad7c1c770b5aaece8c
--- /dev/null
+++ b/triton_models/weights/layers.28.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be67c63310802e47b331969149928657a52d9caadc4dcd0599f0ed63fa8fe4c3
+size 16
diff --git a/triton_models/weights/layers.29.attention.w_qkv.0.qweight b/triton_models/weights/layers.29.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f7fb2a0c283d5309b0acac81e3f78bf535e119e0
--- /dev/null
+++ b/triton_models/weights/layers.29.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:964846927bb91f85e501fe1626e8958dba12656845d1c2963d6f0d31ba0e6fe9
+size 12582912
diff --git a/triton_models/weights/layers.29.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.29.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..e4616ace3831b1353261ce821a222788574a6a7e
--- /dev/null
+++ b/triton_models/weights/layers.29.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59389b1002ea4286ef68d6a28a48de0070a8fe63bb33881a4ea5b4d4824b586a
+size 786432
diff --git a/triton_models/weights/layers.29.attention.wo.0.qweight b/triton_models/weights/layers.29.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c339b504ad1ca7893a586fe0fbab27e0414733d4
--- /dev/null
+++ b/triton_models/weights/layers.29.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a9f306da7ef17418be8aa9f47f97e653aeab2c155aaf1f32ea93c6e3e424c19
+size 8388608
diff --git a/triton_models/weights/layers.29.attention.wo.0.scales_zeros b/triton_models/weights/layers.29.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..880d7d9c3c95158609d1215b2f6bba14a3a6c655
--- /dev/null
+++ b/triton_models/weights/layers.29.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1370f068209c9ab1f42b6657508b06a3511d1d2d8d2c5b5988f4d58591d40279
+size 524288
diff --git a/triton_models/weights/layers.29.attention_norm.weight b/triton_models/weights/layers.29.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..dc3408e864d2f349f03d2ea9f976241c0dd4ae19
--- /dev/null
+++ b/triton_models/weights/layers.29.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0136d8df649cc27c395128240a43f899929866414704347f851202cc638b9ec0
+size 8192
diff --git a/triton_models/weights/layers.29.feed_forward.w13.0.qweight b/triton_models/weights/layers.29.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..12bd5dfc4141909486de6f81eb5de2cd0541f243
--- /dev/null
+++ b/triton_models/weights/layers.29.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90f34915975f77f41c0057ec1ddc7e83098a74c6efe44d5cfcbd6252f7483773
+size 58720256
diff --git a/triton_models/weights/layers.29.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.29.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..92ba76313e8ccbbbbf563a230bc24e60c122fbbb
--- /dev/null
+++ b/triton_models/weights/layers.29.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56814e27f2fc6ea900d3623c77d1df558ea69fe154c99fe57fd45b6567a62186
+size 3670016
diff --git a/triton_models/weights/layers.29.feed_forward.w2.0.qweight b/triton_models/weights/layers.29.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..850b76dcf051ec7876aa7626f2aee3c02df70a73
--- /dev/null
+++ b/triton_models/weights/layers.29.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95e520a4a76d63d5f4cfad6bb9577ab1343c24d563ee6491b0120e8b8f605a24
+size 29360128
diff --git a/triton_models/weights/layers.29.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.29.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8d8434eea29d62735d93ec7d3ed91e73a56773a5
--- /dev/null
+++ b/triton_models/weights/layers.29.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a45ecef0ec7bb53ccdd1499338dfc1590c5b4d4e64ca01119d8e2eac40c5249
+size 1835008
diff --git a/triton_models/weights/layers.29.ffn_norm.weight b/triton_models/weights/layers.29.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..cf3ccd85ec2a836282f95d8ffa96f001a6c78bfb
--- /dev/null
+++ b/triton_models/weights/layers.29.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80000d50b78aad7b0076bc159838fbc0e679d1b07aa00f374142e40c5fcbba01
+size 8192
diff --git a/triton_models/weights/layers.29.past_kv_scale.0.weight b/triton_models/weights/layers.29.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..135fea41df0db406183c0c705ee1bf4e15b3d938
--- /dev/null
+++ b/triton_models/weights/layers.29.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2334dc6b4e2acee8b2c60625419023d8b5cb9692341970a8cb0cb0950658940d
+size 16
diff --git a/triton_models/weights/layers.3.attention.w_qkv.0.qweight b/triton_models/weights/layers.3.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..741f2dbe9906898116ac1c0bcf6b6f1305ac0c7d
--- /dev/null
+++ b/triton_models/weights/layers.3.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b023e843f1b897e2768f8aa9d1f18e1a2fcb8a17ee904981117c3822cafda263
+size 12582912
diff --git a/triton_models/weights/layers.3.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.3.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..58882890a176f4e5d124ddfbdce381fc920d5b9d
--- /dev/null
+++ b/triton_models/weights/layers.3.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02c5a27de7ab84dc800a722021cefc12233818ba708f7ef20abed96d1efa3b29
+size 786432
diff --git a/triton_models/weights/layers.3.attention.wo.0.qweight b/triton_models/weights/layers.3.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..28835af03e975d2a253d1b43e9094dcef5665859
--- /dev/null
+++ b/triton_models/weights/layers.3.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:961c0e6293f13ca0eb880f274fcf96b1394f554b645856d99f898ae03ba05ab1
+size 8388608
diff --git a/triton_models/weights/layers.3.attention.wo.0.scales_zeros b/triton_models/weights/layers.3.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4941d02a83a0dab878ad6795511df8e08e216ce0
--- /dev/null
+++ b/triton_models/weights/layers.3.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6a94458f402b8342d3936d5c436bcc1125e642d5216c1cf70ad7850d134dbdf
+size 524288
diff --git a/triton_models/weights/layers.3.attention_norm.weight b/triton_models/weights/layers.3.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..fee571b50c58b11c6d17e7daaf1a1796af101e8a
--- /dev/null
+++ b/triton_models/weights/layers.3.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e702523cc2696abf9ea5f86ca0c3b8110cbc92f9074f3573cd0935519da7f326
+size 8192
diff --git a/triton_models/weights/layers.3.feed_forward.w13.0.qweight b/triton_models/weights/layers.3.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6576fcc897f882a63b4376d2366b8a16b75529b2
--- /dev/null
+++ b/triton_models/weights/layers.3.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec59414d327ec0ca8adf200f8593102b1cbef09d5a97e88f7e6f3d1d941e32d7
+size 58720256
diff --git a/triton_models/weights/layers.3.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.3.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..39bfc8b9158d17ace10985a0aefa5ed9b27c830f
--- /dev/null
+++ b/triton_models/weights/layers.3.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:592014759039919238673a2d601e2d397b3eb60f2b684d06201310dc35e6f870
+size 3670016
diff --git a/triton_models/weights/layers.3.feed_forward.w2.0.qweight b/triton_models/weights/layers.3.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..a2dc182c2e093651d77ac65087453506558cc6df
--- /dev/null
+++ b/triton_models/weights/layers.3.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c67555a8eae4e6cc55420ec37ea21933418f802190fc809bb33855011f8ec82a
+size 29360128
diff --git a/triton_models/weights/layers.3.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.3.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b12f9eae6cb382f2ef562f1e7dad7d8f2c7f4f48
--- /dev/null
+++ b/triton_models/weights/layers.3.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b8d6409835e70b1c0fdf81979b61995fb90f43381277f9e457070df5a91229c
+size 1835008
diff --git a/triton_models/weights/layers.3.ffn_norm.weight b/triton_models/weights/layers.3.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..1ac16014018db6a631b37da0836ea438c9d2fdaa
--- /dev/null
+++ b/triton_models/weights/layers.3.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b485c2892ea53a76f21e84c2ed42436b05a41f5dab146fab77f25d2b506ae53
+size 8192
diff --git a/triton_models/weights/layers.3.past_kv_scale.0.weight b/triton_models/weights/layers.3.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..86f8adc521ad298ee51185ebf02afa53325facc9
--- /dev/null
+++ b/triton_models/weights/layers.3.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76bf77db19b1d0234ee2da545c98ee3d5921030e6deaa8b2742d4e9d400d7207
+size 16
diff --git a/triton_models/weights/layers.30.attention.w_qkv.0.qweight b/triton_models/weights/layers.30.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..087b322573894903eb8e5cf81dc0e4962ccbb4bb
--- /dev/null
+++ b/triton_models/weights/layers.30.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b085323586c5f61228e43ec3cf935799c983d169abd417a55a6c3f82cd255a1
+size 12582912
diff --git a/triton_models/weights/layers.30.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.30.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..de17498ac115e410694314f9e590322ecc3140ef
--- /dev/null
+++ b/triton_models/weights/layers.30.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:602a6e94ab5a7bda70167414ea1e71c46be0e7b46a69689d093f991dc6930079
+size 786432
diff --git a/triton_models/weights/layers.30.attention.wo.0.qweight b/triton_models/weights/layers.30.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..e9eddf6db391e55430e3ca4f04fc6966cdb3bc10
--- /dev/null
+++ b/triton_models/weights/layers.30.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5864869bc2f57778cafb236ed45dbcacce36836e1c8b3dd94fd1375829174baa
+size 8388608
diff --git a/triton_models/weights/layers.30.attention.wo.0.scales_zeros b/triton_models/weights/layers.30.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f810acf8fcee1cdadd5b34adde32f9c37b177343
--- /dev/null
+++ b/triton_models/weights/layers.30.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c899fc162f4dbec0809e3059f9ed0ba9d3004a75d31841ade9aaf16df93493e
+size 524288
diff --git a/triton_models/weights/layers.30.attention_norm.weight b/triton_models/weights/layers.30.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..ad23a4893d3cffe2d398058b89dc78f528c91053
--- /dev/null
+++ b/triton_models/weights/layers.30.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:683f799d6ecb59ef5b47ee78d4d1653b6a49da4dc6c6865734f2832457ad888e
+size 8192
diff --git a/triton_models/weights/layers.30.feed_forward.w13.0.qweight b/triton_models/weights/layers.30.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..b61119e589e6b7759f74e927ba8c5a5286eb965f
--- /dev/null
+++ b/triton_models/weights/layers.30.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb97c170f0415eeb563dfaab343a6b7c736fb302b605cf65ac29e190d485f03a
+size 58720256
diff --git a/triton_models/weights/layers.30.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.30.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3f892216a36905289e63b4b93c0eaf050e7acc02
--- /dev/null
+++ b/triton_models/weights/layers.30.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:debf89602b57cf687b1f434d484beefd647c3ea0e8305484658248c8238a347f
+size 3670016
diff --git a/triton_models/weights/layers.30.feed_forward.w2.0.qweight b/triton_models/weights/layers.30.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..d0743b7b13a262d47d3c95ff5f00bcf70dca3937
--- /dev/null
+++ b/triton_models/weights/layers.30.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00143d530f528cfdded636568772b1ac564990d10d52c943463e8198b0f45b22
+size 29360128
diff --git a/triton_models/weights/layers.30.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.30.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..649ffe4f3c74051e77a62d2bd111b1c8956635a4
--- /dev/null
+++ b/triton_models/weights/layers.30.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6881934dda1754f8b7bdb5619bed9e9ec7cd819080a5080d36c545274e7563bd
+size 1835008
diff --git a/triton_models/weights/layers.30.ffn_norm.weight b/triton_models/weights/layers.30.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..19611f78c82d05c2fa778fc4099462db96768018
--- /dev/null
+++ b/triton_models/weights/layers.30.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c07830c7b5e53981d0d97e28af650885ba42b1395e88e2a8b553c080258be805
+size 8192
diff --git a/triton_models/weights/layers.30.past_kv_scale.0.weight b/triton_models/weights/layers.30.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..ebf0f2ce5ad46a9897b292cf74ea4074253d9e00
--- /dev/null
+++ b/triton_models/weights/layers.30.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7a7079eaefe501289467f67ff3ec35deb358c17022eff2a2d77c011d87a7485
+size 16
diff --git a/triton_models/weights/layers.31.attention.w_qkv.0.qweight b/triton_models/weights/layers.31.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..33f1f7e919ab93f0f093697cc6564c8041cf7c9a
--- /dev/null
+++ b/triton_models/weights/layers.31.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42e8c9373e34e9f38c5aa5b7f9e7282f283dd138fa488699361a998289d4f0b8
+size 12582912
diff --git a/triton_models/weights/layers.31.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.31.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..51b423248b2e8762a232cb9f6524cc2d2882e6a1
--- /dev/null
+++ b/triton_models/weights/layers.31.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e74870d817de1f15c0b372de19d9049754192d574290aa47cc2da4114e02fbe3
+size 786432
diff --git a/triton_models/weights/layers.31.attention.wo.0.qweight b/triton_models/weights/layers.31.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..7976fa7add831d946d9634761ff8db4d07f69a6b
--- /dev/null
+++ b/triton_models/weights/layers.31.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:882c11872607c376a08d0e7ab4025ebae8050ca0a958b4678fa7c5f5fe34af8c
+size 8388608
diff --git a/triton_models/weights/layers.31.attention.wo.0.scales_zeros b/triton_models/weights/layers.31.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..87b74517a018f5d65e974fc575140a80f0cf2f63
--- /dev/null
+++ b/triton_models/weights/layers.31.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:780d8a3fc0d41d7e42ab7524e0e8eb3a5044627584cb749954a08d74e8889cc2
+size 524288
diff --git a/triton_models/weights/layers.31.attention_norm.weight b/triton_models/weights/layers.31.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..9e1759f5a7b8ce3bcbdf54ac4a167aa2a3836eeb
--- /dev/null
+++ b/triton_models/weights/layers.31.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13b79fca3496315c35d45be930b96ac34c0616ae9bb69018d41d4fe7d77fa1c3
+size 8192
diff --git a/triton_models/weights/layers.31.feed_forward.w13.0.qweight b/triton_models/weights/layers.31.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..fa724a72baf441d9817165d242ae54e77b819e7d
--- /dev/null
+++ b/triton_models/weights/layers.31.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d6490623b97868d9d81417ecbbc40bbcf24f872882ca23b74a76f6f384082cd
+size 58720256
diff --git a/triton_models/weights/layers.31.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.31.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..4e046750532412be4588ab28e7285c8f68bccf2f
--- /dev/null
+++ b/triton_models/weights/layers.31.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b03dd848d3c92adda40904bb369f812d1a2de1d72e53600bdf89cf3002aa5e4
+size 3670016
diff --git a/triton_models/weights/layers.31.feed_forward.w2.0.qweight b/triton_models/weights/layers.31.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..7954c17e1c4aac980fc31bc92786998b66007879
--- /dev/null
+++ b/triton_models/weights/layers.31.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f491d3ff06bae3646c8cabbf8c8b6e14963e909e5a3f2cadd84931bb1acc076
+size 29360128
diff --git a/triton_models/weights/layers.31.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.31.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..1f95fe4038958211cbda9224b4161cae99e0c2e5
--- /dev/null
+++ b/triton_models/weights/layers.31.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7895c436da989422f207c0631685485aada8b0cf45d0db3bbf0cb18b8573d8f4
+size 1835008
diff --git a/triton_models/weights/layers.31.ffn_norm.weight b/triton_models/weights/layers.31.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..491eadebff5c76dbdda444c927fd0bb153d54dbd
--- /dev/null
+++ b/triton_models/weights/layers.31.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b010068e8df791fcfd32ddefe46198f72adc5cb104f59512820541ed232ed52
+size 8192
diff --git a/triton_models/weights/layers.31.past_kv_scale.0.weight b/triton_models/weights/layers.31.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..9ed6ce58e195ff81f658649f8fbf99311dad0183
--- /dev/null
+++ b/triton_models/weights/layers.31.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcd30ad8a1a6ae548b3b6cdbe2b3693c1d260fcf73e63e4cb201f4ff3a9216e8
+size 16
diff --git a/triton_models/weights/layers.4.attention.w_qkv.0.qweight b/triton_models/weights/layers.4.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9efa7ae8526ee807be03ca3903436c1c4e096b2a
--- /dev/null
+++ b/triton_models/weights/layers.4.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd04897e691fff067678bfb5826f8c0dae0914c4a822266312a9fd08f9c8dfb9
+size 12582912
diff --git a/triton_models/weights/layers.4.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.4.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b717a0bccf881f43c4dd4849aa9abac991f829b7
--- /dev/null
+++ b/triton_models/weights/layers.4.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a4e0a9b4313f6f28361952f5e1c00250e0bc8d8e348238f634679cc9983d4b0
+size 786432
diff --git a/triton_models/weights/layers.4.attention.wo.0.qweight b/triton_models/weights/layers.4.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bbc885705f67c282413e4e10b430177fa24c64d1
--- /dev/null
+++ b/triton_models/weights/layers.4.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83ef42f037338f04aa63a71554b631e20e2cc1f4c44d0498061891de5d46dfec
+size 8388608
diff --git a/triton_models/weights/layers.4.attention.wo.0.scales_zeros b/triton_models/weights/layers.4.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..0dea56a4d1087a93efcf6c1d4c45d4eddcffd41d
--- /dev/null
+++ b/triton_models/weights/layers.4.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92669ba1e130035258630c4bb58a6ae23088baa4c818edb89d18126368fdd2b1
+size 524288
diff --git a/triton_models/weights/layers.4.attention_norm.weight b/triton_models/weights/layers.4.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..85901d7d4381bcdd1d25c69d8652668e9e82e4d7
--- /dev/null
+++ b/triton_models/weights/layers.4.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4392ba124c790351e1e804e3f6954b04df59cabe55918fb2ab208b9fcb1a25d4
+size 8192
diff --git a/triton_models/weights/layers.4.feed_forward.w13.0.qweight b/triton_models/weights/layers.4.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2eecef389220ebcbbb1b399d81d28d5c7123895d
--- /dev/null
+++ b/triton_models/weights/layers.4.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efab7d32785919b64059b2e20f610eae03ee8a2ba95bcd5c2d786e3074f66875
+size 58720256
diff --git a/triton_models/weights/layers.4.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.4.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..203aad693c83911b91ea533a372c2414914f0c33
--- /dev/null
+++ b/triton_models/weights/layers.4.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:624fd673a1cb8d5eed0814f7d0ebcfa6de1f0933f2c808a43fe9915863d06992
+size 3670016
diff --git a/triton_models/weights/layers.4.feed_forward.w2.0.qweight b/triton_models/weights/layers.4.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..22624a1646b9f3bc812053a3e4eccd3aa066e8cc
--- /dev/null
+++ b/triton_models/weights/layers.4.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2a9bc1f9a857eb51f12e913af082a9d065232ad278a46bf3312fee70b57c929
+size 29360128
diff --git a/triton_models/weights/layers.4.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.4.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..ba1d032b1632c72d516bf607d69ef9d858ec3f69
--- /dev/null
+++ b/triton_models/weights/layers.4.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f5a160ff8d293e97b6037541c207caf6ea4b15e625bd94dba7be81f1aa3052f
+size 1835008
diff --git a/triton_models/weights/layers.4.ffn_norm.weight b/triton_models/weights/layers.4.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..10fdc6cff9055cfb29be992fd58fec67e3a1e156
--- /dev/null
+++ b/triton_models/weights/layers.4.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7584bdc2460f81e60ad3db90f314b1c3c0bb458b724ad5a8ef2f6b87991871f
+size 8192
diff --git a/triton_models/weights/layers.4.past_kv_scale.0.weight b/triton_models/weights/layers.4.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8ab0548585972c0f9a19539e4f0246ed192f0042
--- /dev/null
+++ b/triton_models/weights/layers.4.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:734c894776290dd532cb25f542e38b56c9151c45fb751e1d58f5aba3c1cf86ce
+size 16
diff --git a/triton_models/weights/layers.5.attention.w_qkv.0.qweight b/triton_models/weights/layers.5.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..04ab0a16f4f6b5b500d30b4b27152a073d6efffb
--- /dev/null
+++ b/triton_models/weights/layers.5.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76f7240f7f94715ffc2e22da1e1986a7738b3a81d2803a89fa8d467ab37d52f3
+size 12582912
diff --git a/triton_models/weights/layers.5.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.5.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..35b017f6b8442ef2ed28b4f1d7f2aab7e6c8f3d4
--- /dev/null
+++ b/triton_models/weights/layers.5.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f30a98755d5e88115a8343930c20bbfd34ef8095694f4c0709b299e0ee587b25
+size 786432
diff --git a/triton_models/weights/layers.5.attention.wo.0.qweight b/triton_models/weights/layers.5.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..4b270cc9d0768c5834bf5dee3db2ae53b9d1a2db
--- /dev/null
+++ b/triton_models/weights/layers.5.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c2c8b87162bc3f8d4c6044cbbba5bff1a0b4d484418966d683cd8edd5ffe289
+size 8388608
diff --git a/triton_models/weights/layers.5.attention.wo.0.scales_zeros b/triton_models/weights/layers.5.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..2170f6316f894a43c57df7c6f3b6435d6d290e59
--- /dev/null
+++ b/triton_models/weights/layers.5.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8a0bc293e079e00c8fb29ea166613fb81fc7a51dfae01bda404298bd3541858
+size 524288
diff --git a/triton_models/weights/layers.5.attention_norm.weight b/triton_models/weights/layers.5.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..e56c76ec2f895f4ab09e315bcb026a0cd110898e
--- /dev/null
+++ b/triton_models/weights/layers.5.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e322bf9e96c707a007b6cf18e95291034a7b4acc28cc9c868ba72a2067f42a4a
+size 8192
diff --git a/triton_models/weights/layers.5.feed_forward.w13.0.qweight b/triton_models/weights/layers.5.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c0603e429404aebb532d112009658a498d6a25d2
--- /dev/null
+++ b/triton_models/weights/layers.5.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b111a37c3e4700a7ac8bcc755e22baf0cdd205a4f64cce28587b12e6bf542fa5
+size 58720256
diff --git a/triton_models/weights/layers.5.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.5.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..54720e241e1c6574c937ac39760a84933da14ee8
--- /dev/null
+++ b/triton_models/weights/layers.5.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ccbdd88d473982cb63c5daa191f2956e0826feff876c6303ad46054ce474a9f3
+size 3670016
diff --git a/triton_models/weights/layers.5.feed_forward.w2.0.qweight b/triton_models/weights/layers.5.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f69f281b519e24e86576e49e914a3f29b9833837
--- /dev/null
+++ b/triton_models/weights/layers.5.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d055b75469902bb480fb2470766fc359100caf6f512e030d846c895cb23501e
+size 29360128
diff --git a/triton_models/weights/layers.5.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.5.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..39d27ba627be29fdb76869d39b5a02b38030a6a9
--- /dev/null
+++ b/triton_models/weights/layers.5.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf2b8068885689ca049003d3dff4bc8e68b47ddb9be7d7fdd56b39582b7fd61e
+size 1835008
diff --git a/triton_models/weights/layers.5.ffn_norm.weight b/triton_models/weights/layers.5.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8f90bb2bd06c0ff2405bb8ca61c65441dc384653
--- /dev/null
+++ b/triton_models/weights/layers.5.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c886bfe39172273f70831164b7b87f48054c0da65cd1724be839673c817009b9
+size 8192
diff --git a/triton_models/weights/layers.5.past_kv_scale.0.weight b/triton_models/weights/layers.5.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0032439aec9359a437391315477b7201d232b7ba
--- /dev/null
+++ b/triton_models/weights/layers.5.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b479855806803e6c485764401a2ed76b362ac09f2606a6d58fbba9b134ee186
+size 16
diff --git a/triton_models/weights/layers.6.attention.w_qkv.0.qweight b/triton_models/weights/layers.6.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..08c09cae235117db0cf2be801f075c4236bd6ba2
--- /dev/null
+++ b/triton_models/weights/layers.6.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebf9ddd2465c02a1a37bafe82e009127d6cbbcf0bec3b323eece36934bb6eeff
+size 12582912
diff --git a/triton_models/weights/layers.6.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.6.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..78b67e25716cf86de09b47dc537db6ec420fd21a
--- /dev/null
+++ b/triton_models/weights/layers.6.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b39acb9cc4de067c3ef5b0128c253ad0b646756445766d91f2421ca30ab6e272
+size 786432
diff --git a/triton_models/weights/layers.6.attention.wo.0.qweight b/triton_models/weights/layers.6.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..2d2cd5ddae6f67b08f6610fd6bfd8fe17ff43ad7
--- /dev/null
+++ b/triton_models/weights/layers.6.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81ad5a0787961305a05ec9b7c0fb89cc2aa70589a36efea39557a8ff33be93c9
+size 8388608
diff --git a/triton_models/weights/layers.6.attention.wo.0.scales_zeros b/triton_models/weights/layers.6.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..380b6dedbd40afe6240e0271cfd0000ef9f17b01
--- /dev/null
+++ b/triton_models/weights/layers.6.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edadc4493b3568ab5ebe758a1aedc2ef5fefcd688f5a78eb1866379967ca1cd6
+size 524288
diff --git a/triton_models/weights/layers.6.attention_norm.weight b/triton_models/weights/layers.6.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..68cf1e82a5f3d60ef2c37bde39437efe411c0263
--- /dev/null
+++ b/triton_models/weights/layers.6.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5dcd4367593812ecec39d8b1ff7cd21912c1283686db24be488384fd2453162c
+size 8192
diff --git a/triton_models/weights/layers.6.feed_forward.w13.0.qweight b/triton_models/weights/layers.6.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..f66c0c431c68905f3cc431d2b266b628bcc1f9b1
--- /dev/null
+++ b/triton_models/weights/layers.6.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3cc20446684f9b809fd52c40bda9d32c115789c650575c0e54f5ab030b7ceed
+size 58720256
diff --git a/triton_models/weights/layers.6.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.6.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..d158d234d215899f80ded95207cff364e20e0c1d
--- /dev/null
+++ b/triton_models/weights/layers.6.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f01f13b1cd0cd8080d7c4906d71e44200b8053aa605a37069f1a9e1034a81f93
+size 3670016
diff --git a/triton_models/weights/layers.6.feed_forward.w2.0.qweight b/triton_models/weights/layers.6.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..0bee7d213091341bc193cd21b808a3776987b7dd
--- /dev/null
+++ b/triton_models/weights/layers.6.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95865a00e74b9d37ba9c21241922979b4f26eb06b78b84b25be12bcfba617657
+size 29360128
diff --git a/triton_models/weights/layers.6.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.6.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..80f3f7257450ba5de9d4dabaa61b516c7c807046
--- /dev/null
+++ b/triton_models/weights/layers.6.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0dcaefa2acb86a25aedc25d60558af179bbf8968f1fd023b20343dad73b0184
+size 1835008
diff --git a/triton_models/weights/layers.6.ffn_norm.weight b/triton_models/weights/layers.6.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..b56799656e38d049d14d02b2d7e4ab1e470bac6d
--- /dev/null
+++ b/triton_models/weights/layers.6.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e458ef7058c9d7734737447072dc2908dea9ebf64a2ebcef932e4d6832057f5b
+size 8192
diff --git a/triton_models/weights/layers.6.past_kv_scale.0.weight b/triton_models/weights/layers.6.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..70c460d32701c69c43ce43977e55d4c5e407b1c8
--- /dev/null
+++ b/triton_models/weights/layers.6.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa3e886e06b35057d676139206ed116fafd8c8dd29244eff07cf1221837e8807
+size 16
diff --git a/triton_models/weights/layers.7.attention.w_qkv.0.qweight b/triton_models/weights/layers.7.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..4bd1b6da8292c5b10b20dbee8e2ee7e95a46637d
--- /dev/null
+++ b/triton_models/weights/layers.7.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0c4ca025a4e163c0dc2da98d463549125001a9cc93654f37907cce2a9882d52
+size 12582912
diff --git a/triton_models/weights/layers.7.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.7.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..8846088f9a04128c3626ebdde6d6747d1d663587
--- /dev/null
+++ b/triton_models/weights/layers.7.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c086c5de28164657905ed6eaed423d6244ae0368c6180aa26fc0a6eb89724a83
+size 786432
diff --git a/triton_models/weights/layers.7.attention.wo.0.qweight b/triton_models/weights/layers.7.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c4891059c086711d0200456b57dc31f93418ba81
--- /dev/null
+++ b/triton_models/weights/layers.7.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efcb8926a09d3f78acbff4e19e2e5bafad04172d17321a6af2b4fe7974c40fe1
+size 8388608
diff --git a/triton_models/weights/layers.7.attention.wo.0.scales_zeros b/triton_models/weights/layers.7.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..a08abb8652ecda43c661807290bbefa793fb0160
--- /dev/null
+++ b/triton_models/weights/layers.7.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c0cdf8402670c6998b317082c140f0eb51c4bb0b41ca4e6386c6f1648f56a76
+size 524288
diff --git a/triton_models/weights/layers.7.attention_norm.weight b/triton_models/weights/layers.7.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..37c18cd18f7054a248d6352d4d5a25ac9a4175e5
--- /dev/null
+++ b/triton_models/weights/layers.7.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28cf5e25d536f7d9180c2eb1d7dcfd7d4bb749816849f75c5e09f0210cdbc417
+size 8192
diff --git a/triton_models/weights/layers.7.feed_forward.w13.0.qweight b/triton_models/weights/layers.7.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9b50669a9dc81bf91e567a299ee57d333907a007
--- /dev/null
+++ b/triton_models/weights/layers.7.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0581fd7f812265f9b47b8eab7621664a046c4c6f98279676df767aaf339eee7
+size 58720256
diff --git a/triton_models/weights/layers.7.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.7.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..93d6f40d2e5bcd8b2a2da3d12418121279963070
--- /dev/null
+++ b/triton_models/weights/layers.7.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f86e5d5f8bd7d8eded5bf5a5cbefc9b1b3242cdb2b486f6b1b0289d75f4df828
+size 3670016
diff --git a/triton_models/weights/layers.7.feed_forward.w2.0.qweight b/triton_models/weights/layers.7.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9d07164c18362f5b0879cc88dbb43ef395f284f2
--- /dev/null
+++ b/triton_models/weights/layers.7.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b02b881d979d0fb77a4d705ed4bc68ca58e7cfa84a504d90b9e816ddd99a6b0
+size 29360128
diff --git a/triton_models/weights/layers.7.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.7.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b95f34d475e6c10781aca4639fbcadc9e706fc5a
--- /dev/null
+++ b/triton_models/weights/layers.7.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0c7e60168198f2ac9347ac8eb4fc59ea42fe0380e24550cd4fa2e989a2d90b4
+size 1835008
diff --git a/triton_models/weights/layers.7.ffn_norm.weight b/triton_models/weights/layers.7.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..7669f396fbea22312892ecc7e69f5847e3e3d0f7
--- /dev/null
+++ b/triton_models/weights/layers.7.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bce0233aef9e8401ea7eaddce5b44f2a28b6fd1018023ec3f2cae495f4d205b6
+size 8192
diff --git a/triton_models/weights/layers.7.past_kv_scale.0.weight b/triton_models/weights/layers.7.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..d2b299db6620c0abf87b67b228dd03b696854499
--- /dev/null
+++ b/triton_models/weights/layers.7.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae08ed15fa296e998f7e93b866fb5536103b357ca8fd0e8ee44423c4fe3ea4d3
+size 16
diff --git a/triton_models/weights/layers.8.attention.w_qkv.0.qweight b/triton_models/weights/layers.8.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..9a071d9e1c24a362c04a0f4335000d1eeeadbfea
--- /dev/null
+++ b/triton_models/weights/layers.8.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:312a5231076c36e023c30c18761d4793c7aaf2d1658f740a4ed6fe3ab9fb9532
+size 12582912
diff --git a/triton_models/weights/layers.8.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.8.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..b756258fc2694a8580c1d6d55d73c1aae4f88737
--- /dev/null
+++ b/triton_models/weights/layers.8.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:045eb164e9d18487951013b4a69dab786f034139e232a0c079e6c6de0b84d445
+size 786432
diff --git a/triton_models/weights/layers.8.attention.wo.0.qweight b/triton_models/weights/layers.8.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..79dcacb0bc5ed37629a105bb0afdc20c383e1736
--- /dev/null
+++ b/triton_models/weights/layers.8.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:917ac6b4102a88cb5fe47a13834f30fb45329e8234e6bf4a6d5def09acfca138
+size 8388608
diff --git a/triton_models/weights/layers.8.attention.wo.0.scales_zeros b/triton_models/weights/layers.8.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3f21f5d05d73002cb0251350fce183ec3b6f82cc
--- /dev/null
+++ b/triton_models/weights/layers.8.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:075ca25071e36779993618787bcad51f47a6210b5c7efb13836b9f0c39113c7b
+size 524288
diff --git a/triton_models/weights/layers.8.attention_norm.weight b/triton_models/weights/layers.8.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..6441edc914d86ab07b46c530e63df5e212499fbf
--- /dev/null
+++ b/triton_models/weights/layers.8.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7645c5cc08248a97031708e37a8869793e72e86be7d529ee2d38214aa125f326
+size 8192
diff --git a/triton_models/weights/layers.8.feed_forward.w13.0.qweight b/triton_models/weights/layers.8.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..6b623d7f4ebef4670369d48905c1f66aa9b3fd94
--- /dev/null
+++ b/triton_models/weights/layers.8.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0a76bb17ba96c365a1bf660f901c21c3fc1d15165b0532e97c7ad86158513f0
+size 58720256
diff --git a/triton_models/weights/layers.8.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.8.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..f7b56f5fefdb81227823903289604a2f9e33cbf6
--- /dev/null
+++ b/triton_models/weights/layers.8.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f6cc9bf35da7c08e89248a2d1151ca84f97e0d44fda2f474fbe090fa2b71bc6
+size 3670016
diff --git a/triton_models/weights/layers.8.feed_forward.w2.0.qweight b/triton_models/weights/layers.8.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..c837700cdf510ee1df94f861174695bb0e1ccfc8
--- /dev/null
+++ b/triton_models/weights/layers.8.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67d6a461146ce6fca245beab647f837c7718f50c1ae6d48f852becd4b88ecd68
+size 29360128
diff --git a/triton_models/weights/layers.8.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.8.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..63ba13362b7c68d37224b01f241452a27cf8717a
--- /dev/null
+++ b/triton_models/weights/layers.8.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22f763f7c06275a5821c55ab0428986c7982da93d02ec561c4c1cf0bc83cb82a
+size 1835008
diff --git a/triton_models/weights/layers.8.ffn_norm.weight b/triton_models/weights/layers.8.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..c4ec482ee099d1dd8d7b2633b38f9546642f8c04
--- /dev/null
+++ b/triton_models/weights/layers.8.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97f607d08fdcc7d4a7048194e994afa25c34242bddec4d56534a779484534dec
+size 8192
diff --git a/triton_models/weights/layers.8.past_kv_scale.0.weight b/triton_models/weights/layers.8.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..dae30d205782945d230c044159736e88b8c261e0
--- /dev/null
+++ b/triton_models/weights/layers.8.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55e7e6e9663622f872cb332c414eac32a102e97ffdf3f5a2b6afa6f8371e1a5f
+size 16
diff --git a/triton_models/weights/layers.9.attention.w_qkv.0.qweight b/triton_models/weights/layers.9.attention.w_qkv.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..502cfce88cfb73bd839f1fb667fba672259c4294
--- /dev/null
+++ b/triton_models/weights/layers.9.attention.w_qkv.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ad1c9bfda707333f5860de8512ec7db789721d5f17e96ec0c1f79f98533c42c
+size 12582912
diff --git a/triton_models/weights/layers.9.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.9.attention.w_qkv.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..47605d66d4acddffb2885150c9d68d184f94a9c6
--- /dev/null
+++ b/triton_models/weights/layers.9.attention.w_qkv.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b5179dc3fba3abadb58abf409bfef33b382dc7373a002c3c43da9785c86f614
+size 786432
diff --git a/triton_models/weights/layers.9.attention.wo.0.qweight b/triton_models/weights/layers.9.attention.wo.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..0c3613bd080dd0fe0abbe07c8a567bf85e48e33d
--- /dev/null
+++ b/triton_models/weights/layers.9.attention.wo.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:535eb0ed2a008590448c38ddcfcf990219dd0c1752e28d11fe3310cdf4039d57
+size 8388608
diff --git a/triton_models/weights/layers.9.attention.wo.0.scales_zeros b/triton_models/weights/layers.9.attention.wo.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..bc68d0462949d41fb22495d6fc4d8a2c6c21b6a6
--- /dev/null
+++ b/triton_models/weights/layers.9.attention.wo.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee2d02d009e36ca78d86a48ea408c2017c21903b64400397a77f437f495d936c
+size 524288
diff --git a/triton_models/weights/layers.9.attention_norm.weight b/triton_models/weights/layers.9.attention_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..8493ee9741dd897107d9fe3cea7c2d01fdd4dee5
--- /dev/null
+++ b/triton_models/weights/layers.9.attention_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcacb811b4cf62144e1ac2d3eadbafab30083e3420c46a92df1ab21840b29fe5
+size 8192
diff --git a/triton_models/weights/layers.9.feed_forward.w13.0.qweight b/triton_models/weights/layers.9.feed_forward.w13.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..bcb62122ef3b2bf1d13099eb7e64cd4f6266f02c
--- /dev/null
+++ b/triton_models/weights/layers.9.feed_forward.w13.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aca67258bcd3c39f17fb15a14b72cfe8ca597aeb30e0f4f298efa5eb093abcf3
+size 58720256
diff --git a/triton_models/weights/layers.9.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.9.feed_forward.w13.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..3e0e6af0add56eeb2e1cf7bc0142e52be7a5ae29
--- /dev/null
+++ b/triton_models/weights/layers.9.feed_forward.w13.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4b60ceaccc0af57c36de7cd69acf05d8c307f2d6d27a7e765e0f132ae95d17a
+size 3670016
diff --git a/triton_models/weights/layers.9.feed_forward.w2.0.qweight b/triton_models/weights/layers.9.feed_forward.w2.0.qweight
new file mode 100644
index 0000000000000000000000000000000000000000..399c1fc8d6cc43a27e802ca067c88fc4f9a3bc73
--- /dev/null
+++ b/triton_models/weights/layers.9.feed_forward.w2.0.qweight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e07e422f44ddda11dc7404b257cacd675b2b7f44491941e6754155df3a31d2e
+size 29360128
diff --git a/triton_models/weights/layers.9.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.9.feed_forward.w2.0.scales_zeros
new file mode 100644
index 0000000000000000000000000000000000000000..9509fd872d04e11bf53f07f99129e785b2056187
--- /dev/null
+++ b/triton_models/weights/layers.9.feed_forward.w2.0.scales_zeros
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cc346804097116087236c77f2e2c018922efba4f2e32d8a71ddf8b026c9d34d
+size 1835008
diff --git a/triton_models/weights/layers.9.ffn_norm.weight b/triton_models/weights/layers.9.ffn_norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..185031880012c613c2cf8937d4aa159e1c93a4c0
--- /dev/null
+++ b/triton_models/weights/layers.9.ffn_norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98119ccde8c54eacba56311e43a7c74e62e30e0d7302b011202dea6a6348ba66
+size 8192
diff --git a/triton_models/weights/layers.9.past_kv_scale.0.weight b/triton_models/weights/layers.9.past_kv_scale.0.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0ec9f90c9c5be11398b7b1bdba1df5b0975ab0d4
--- /dev/null
+++ b/triton_models/weights/layers.9.past_kv_scale.0.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62cf0a7960b56038dd17b81e2a1c38a016c2b78bd7272299dee18ae8e53e5c92
+size 16
diff --git a/triton_models/weights/norm.weight b/triton_models/weights/norm.weight
new file mode 100644
index 0000000000000000000000000000000000000000..906361178f72cf7bd1f01447accc35bf0e1b633a
--- /dev/null
+++ b/triton_models/weights/norm.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efcd3fb0c1c5225c17e0eeb5b46068bb7311f716a4908d5a39d79b37985b58e7
+size 8192
diff --git a/triton_models/weights/output.weight b/triton_models/weights/output.weight
new file mode 100644
index 0000000000000000000000000000000000000000..04e8f86f0b46051b3db62d5eefcbebda87641472
--- /dev/null
+++ b/triton_models/weights/output.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b0ed41b4df8f91647fc8bdd2aa61f55c39e09b6e063c8bd509b591797293919
+size 758120448
diff --git a/triton_models/weights/tok_embeddings.weight b/triton_models/weights/tok_embeddings.weight
new file mode 100644
index 0000000000000000000000000000000000000000..0b3edbd16fbb690f7c781043ea905fd4380e5f04
--- /dev/null
+++ b/triton_models/weights/tok_embeddings.weight
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8986115ad7e59813a41c88c0d601235fa36138d6c15e5657a050cf4ec40fb037
+size 758120448