diff --git a/model_repository/postprocessing/1/__pycache__/model.cpython-310.pyc b/model_repository/postprocessing/1/__pycache__/model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa0ac1382a7864add3a9bb04e6b328fa6995f67d
Binary files /dev/null and b/model_repository/postprocessing/1/__pycache__/model.cpython-310.pyc differ
diff --git a/model_repository/postprocessing/1/model.py b/model_repository/postprocessing/1/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..20de97595195da5dedc044a31c6086c1f49892da
--- /dev/null
+++ b/model_repository/postprocessing/1/model.py
@@ -0,0 +1,129 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+from pathlib import Path
+
+import numpy as np
+import triton_python_backend_utils as pb_utils
+
+# This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served
+# by triton inference server, it has to be converted first by running
+# `python lmdeploy/serve/turbomind/deploy.py`. Then
+# `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py`
+from .tokenizer.tokenizer import Tokenizer
+
+
+class TritonPythonModel:
+    """Your Python model must use the same class name.
+
+    Every Python model that is created must have "TritonPythonModel" as the
+    class name.
+    """
+
+    def initialize(self, args):
+        """`initialize` is called only once when the model is being loaded.
+        Implementing the `initialize` function is optional. This function
+        allows the model to initialize any state associated with this model.
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance device
+            ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+        # Parse model configs
+        self.model_config = model_config = json.loads(args['model_config'])
+
+        # Parse model output configs
+        output_config = pb_utils.get_output_config_by_name(
+            model_config, 'OUTPUT')
+
+        # Convert Triton types to numpy types
+        self.output_dtype = pb_utils.triton_string_to_numpy(
+            output_config['data_type'])
+
+        cur_folder = Path(__file__).parent
+
+        self.tokenizer = Tokenizer(
+            osp.join(
+                cur_folder, self.model_config['parameters']['tokenizer_path']
+                ['string_value']))
+
+    def execute(self, requests):
+        """`execute` must be implemented in every Python model. `execute`
+        function receives a list of pb_utils.InferenceRequest as the only
+        argument. This function is called when an inference is requested
+        for this model. Depending on the batching configuration (e.g. Dynamic
+        Batching) used, `requests` may contain multiple requests. Every
+        Python model must create one pb_utils.InferenceResponse for every
+        pb_utils.InferenceRequest in `requests`. If there is an error, you can
+        set the error argument when creating a pb_utils.InferenceResponse.
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list must
+          be the same as `requests`
+        """
+
+        responses = []
+
+        # Every Python backend must iterate over every one of the requests
+        # and create a pb_utils.InferenceResponse for each of them.
+        for idx, request in enumerate(requests):
+            # Get input tensors
+            tokens_batch = pb_utils.get_input_tensor_by_name(
+                request, 'TOKENS_BATCH').as_numpy()
+            sequence_length = pb_utils.get_input_tensor_by_name(
+                request, 'sequence_length').as_numpy()
+
+            # Postprocessing output data.
+            outputs = self._postprocessing(tokens_batch.tolist(),
+                                           sequence_length)
+
+            # Create output tensors. You need pb_utils.Tensor
+            # objects to create pb_utils.InferenceResponse.
+            output_tensor = pb_utils.Tensor(
+                'OUTPUT',
+                np.array(outputs).astype(self.output_dtype))
+
+            # Create InferenceResponse. You can set an error here in case
+            # there was a problem with handling this inference request.
+            # Below is an example of how you can set errors in inference
+            # response:
+            #
+            # pb_utils.InferenceResponse(
+            #    output_tensors=..., TritonError("An error occurred"))
+            inference_response = pb_utils.InferenceResponse(
+                output_tensors=[output_tensor])
+            responses.append(inference_response)
+
+        # You should return a list of pb_utils.InferenceResponse. Length
+        # of this list must match the length of `requests` list.
+        return responses
+
+    def finalize(self):
+        """`finalize` is called only once when the model is being unloaded.
+
+        Implementing `finalize` function is optional. This function allows the
+        model to perform any necessary clean ups before exit.
+        """
+        print('Cleaning up...')
+
+    def _postprocessing(self, tokens_batch, sequence_length):
+        """decode token ids into texts."""
+        outputs = []
+        for beam_tokens, beam_len in zip(tokens_batch, sequence_length):
+            for tokens, _len in zip(beam_tokens, beam_len):
+                output = self.tokenizer.decode(tokens, _len)
+                output = output.encode('utf8')
+                outputs.append(output)
+        return outputs
diff --git a/model_repository/postprocessing/1/tokenizer/config.json b/model_repository/postprocessing/1/tokenizer/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..84235b8a1a9618cc0ac265caf61ea4088780e3b1
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/config.json
@@ -0,0 +1,37 @@
+{
+  "_name_or_path": "/root/psy/internlm2-7b/work_dirs/internlm2_chat_7b_qlora_oasst1_512_e3_copy/hf_2/merge",
+  "architectures": [
+    "InternLM2ForCausalLM"
+  ],
+  "attn_implementation": "eager",
+  "auto_map": {
+    "AutoConfig": "configuration_internlm.InternLMConfig",
+    "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
+    "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
+  },
+  "bias": false,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "fp16": true,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "internlm",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pad_token_id": 2,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 2.0,
+    "type": "dynamic"
+  },
+  "rope_theta": 1000000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.37.2",
+  "use_cache": false,
+  "vocab_size": 92544
+}
diff --git a/model_repository/postprocessing/1/tokenizer/configuration_internlm.py b/model_repository/postprocessing/1/tokenizer/configuration_internlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d013582feaa1f9970a4256c4a0f77000fa645de
--- /dev/null
+++ b/model_repository/postprocessing/1/tokenizer/configuration_internlm.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright (c) InternLM. All rights reserved.
+# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" InternLM model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class InternLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate + an InternLM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the InternLM-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`InternLMModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + + ```python + >>> from transformers import InternLMModel, InternLMConfig + + >>> # Initializing a InternLM internlm-7b style configuration + >>> configuration = InternLMConfig() + + >>> # Initializing a model from the internlm-7b style configuration + >>> model = InternLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "internlm" + _auto_class = "AutoConfig" + + def __init__( # pylint: disable=W0102 + self, + vocab_size=103168, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + bias=True, + rope_theta=10000, + rope_scaling=None, + attn_implementation="eager", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.bias = bias + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + + self.attn_implementation = attn_implementation + if self.attn_implementation is None: + self.attn_implementation = "eager" + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}") diff --git a/model_repository/postprocessing/1/tokenizer/generation_config.json b/model_repository/postprocessing/1/tokenizer/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5efeadd3bf2caa4462a3be79d580690f410668 --- /dev/null +++ b/model_repository/postprocessing/1/tokenizer/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2, + "transformers_version": "4.37.2" +} diff --git a/model_repository/postprocessing/1/tokenizer/modeling_internlm2.py b/model_repository/postprocessing/1/tokenizer/modeling_internlm2.py new file mode 100644 index 0000000000000000000000000000000000000000..39d6f71d2933385988ec05f845d3f6386c97f74b --- /dev/null +++ b/model_repository/postprocessing/1/tokenizer/modeling_internlm2.py @@ -0,0 +1,1385 @@ +# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on transformers/src/transformers/models/llama/modeling_llama.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch InternLM2 model.""" +import math +import queue +import threading +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from einops import rearrange +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) + +try: + from transformers.generation.streamers import BaseStreamer +except: # noqa # pylint: disable=bare-except + BaseStreamer = None + +from .configuration_internlm import InternLMConfig as InternLM2Config + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "InternLM2Config" + +flash_attn_func, flash_attn_varlen_func = None, None +pad_input, index_first_axis, unpad_input = None, None, None +def _import_flash_attn(): + global flash_attn_func, flash_attn_varlen_func + global pad_input, index_first_axis, unpad_input + try: + from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func + from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input + flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func + pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input + except ImportError: + raise ImportError("flash_attn is not installed.") + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2 +class InternLM2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + InternLM2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2 +class InternLM2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with Dynamic NTK scaling. + Credits to the Reddit users /u/bloc97 and /u/emozilla. + """ + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors.""" + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class InternLM2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x)) + + return down_proj + + +# Copied from 
transformers.model.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Modified from transformers.model.llama.modeling_llama.LlamaAttention +class InternLM2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: InternLM2Config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.wqkv = nn.Linear( + self.hidden_size, + (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, + bias=config.bias, + ) + + self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = InternLM2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "dynamic": + self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + elif scaling_type == "linear": + self.rotary_emb = InternLM2LinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + else: + raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.") + return self.rotary_emb + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
" + "Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + "b q (h gs d) -> b q h gs d", + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d") + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Modified from transformers.model.llama.modeling_llama.InternLM2FlashAttention2 +class InternLM2FlashAttention2(InternLM2Attention): + """ + InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # InternLM2FlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + "b q (h gs d) -> b q h gs d", + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d") + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len + ) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) + """ + # Contains at least one padding token in the sequence + causal = self.is_causal and query_length != 1 + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q.to(torch.int64), + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + +INTERNLM2_ATTENTION_CLASSES = { + "eager": InternLM2Attention, + "flash_attention_2": InternLM2FlashAttention2, +} + +# Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer +class InternLM2DecoderLayer(nn.Module): + def __init__(self, config: InternLM2Config): + super().__init__() + self.hidden_size = config.hidden_size + + self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config) + + self.feed_forward = InternLM2MLP(config) + self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + + hidden_states = self.attention_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.ffn_norm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +InternLM2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`InternLM2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2 +@add_start_docstrings( + "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.", + InternLM2_START_DOCSTRING, +) +class InternLM2PreTrainedModel(PreTrainedModel): + config_class = InternLM2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["InternLM2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +InternLM2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or + when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. 
+ + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Modified from transformers.model.llama.modeling_llama.LlamaModel +@add_start_docstrings( + "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.", + InternLM2_START_DOCSTRING, +) +class InternLM2Model(InternLM2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`InternLM2DecoderLayer`] + + Args: + config: InternLM2Config + """ + + _auto_class = "AutoModel" + + def __init__(self, config: InternLM2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + + self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + + self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.tok_embeddings + + def set_input_embeddings(self, value): + self.tok_embeddings = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.attn_implementation == "flash_attention_2": + _import_flash_attn() + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, 
dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.tok_embeddings(input_ids) + + if self.config.attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +# Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM +class InternLM2ForCausalLM(InternLM2PreTrainedModel): + _auto_class = "AutoModelForCausalLM" + + _tied_weights_keys = ["output.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = InternLM2Model(config) + self.vocab_size = config.vocab_size + self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + def get_output_embeddings(self): + return self.output + + def set_output_embeddings(self, new_embeddings): + self.output = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + 
@add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, InternLM2ForCausalLM + + >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.output(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + past_length = 
past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""): + prompt = "" + if meta_instruction: + prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""" + else: + prompt += "" + for record in history: + prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n""" + prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""" + return tokenizer([prompt], return_tensors="pt") + + @torch.no_grad() + def chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + streamer: Optional[BaseStreamer] = None, + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n" + "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). 
It is designed to be helpful, honest, and harmless.\n" + "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.", + **kwargs, + ): + inputs = self.build_inputs(tokenizer, query, history, meta_instruction) + inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)} + # also add end-of-assistant token in eos token id to avoid unnecessary generation + eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]] + outputs = self.generate( + **inputs, + streamer=streamer, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + eos_token_id=eos_token_id, + **kwargs, + ) + outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :] + response = tokenizer.decode(outputs, skip_special_tokens=True) + response = response.split("<|im_end|>")[0] + history = history + [(query, response)] + return response, history + + @torch.no_grad() + def stream_chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + **kwargs, + ): + """ + Return a generator in format: (response, history) + Eg. + ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) + ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) + """ + if BaseStreamer is None: + raise ModuleNotFoundError( + "The version of `transformers` is too low. Please make sure " + "that you have installed `transformers>=4.28.0`." + ) + + response_queue = queue.Queue(maxsize=20) + + class ChatStreamer(BaseStreamer): + def __init__(self, tokenizer) -> None: + super().__init__() + self.tokenizer = tokenizer + self.queue = response_queue + self.query = query + self.history = history + self.response = "" + self.received_inputs = False + self.queue.put((self.response, history + [(self.query, self.response)])) + + def put(self, value): + if len(value.shape) > 1 and value.shape[0] > 1: + raise ValueError("ChatStreamer only supports batch size 1") + elif len(value.shape) > 1: + value = value[0] + + if not self.received_inputs: + # The first received value is input_ids, ignore here + self.received_inputs = True + return + + token = self.tokenizer.decode([value[-1]], skip_special_tokens=True) + if token.strip() != "<|im_end|>": + self.response = self.response + token + history = self.history + [(self.query, self.response)] + self.queue.put((self.response, history)) + + def end(self): + self.queue.put(None) + + def stream_producer(): + return self.chat( + tokenizer=tokenizer, + query=query, + streamer=ChatStreamer(tokenizer=tokenizer), + history=history, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + **kwargs, + ) + + def consumer(): + producer = threading.Thread(target=stream_producer) + producer.start() + while True: + res = response_queue.get() + if res is None: + return + yield res + + return consumer() + + +# Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2 +@add_start_docstrings( + """ + The InternLM2 Model transformer with a sequence classification head on top (linear layer). + + [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, + as other causal models (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. 
If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + InternLM2_START_DOCSTRING, +) +class InternLM2ForSequenceClassification(InternLM2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = InternLM2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
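
+
+        Example (an illustrative sketch mirroring the generation example earlier in this file; `PATH_TO_CONVERTED_WEIGHTS`
+        and `PATH_TO_CONVERTED_TOKENIZER` are placeholders, a two-label head is assumed, and the import assumes
+        `modeling_internlm2.py` is importable from the working directory):
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer
+        >>> from modeling_internlm2 import InternLM2ForSequenceClassification
+
+        >>> model = InternLM2ForSequenceClassification.from_pretrained(PATH_TO_CONVERTED_WEIGHTS, num_labels=2)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+        >>> inputs = tokenizer("I enjoyed this film.", return_tensors="pt")
+        >>> outputs = model(**inputs, labels=torch.tensor([1]))
+        >>> outputs.loss          # cross-entropy, because num_labels > 1 and the labels are integers
+        >>> outputs.logits.shape  # (batch_size, num_labels), pooled at the last non-padding token
+        ```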
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to( + logits.device + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/model_repository/postprocessing/1/tokenizer/placeholder b/model_repository/postprocessing/1/tokenizer/placeholder new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model_repository/postprocessing/1/tokenizer/pytorch_model.bin.index.json b/model_repository/postprocessing/1/tokenizer/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..7d95cf180df4c423e817c55f30f5ce93ac80e220 --- /dev/null +++ b/model_repository/postprocessing/1/tokenizer/pytorch_model.bin.index.json @@ -0,0 +1,554 @@ +{ + "metadata": { + "total_size": 5251801088 + }, + "weight_map": { + "model.layers.0.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + 
"model.layers.0.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.10.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + 
"model.layers.11.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.qweight": 
"pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.qzeros": 
"pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + 
"model.layers.18.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.2.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.20.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + 
"model.layers.20.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.scales": "pytorch_model-00002-of-00003.bin", 
+ "model.layers.23.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.qweight": 
"pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.27.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.qweight": 
"pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.3.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.30.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wo.qzeros": 
"pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.4.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.scales": 
"pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + 
"model.layers.7.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.norm.weight": "pytorch_model-00003-of-00003.bin", + "model.tok_embeddings.weight": 
"pytorch_model-00001-of-00003.bin", + "output.weight": "pytorch_model-00003-of-00003.bin" + } +} diff --git a/model_repository/postprocessing/1/tokenizer/special_tokens_map.json b/model_repository/postprocessing/1/tokenizer/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..492d4b2966a1763442d426d880dbc29f94906e4c --- /dev/null +++ b/model_repository/postprocessing/1/tokenizer/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/model_repository/postprocessing/1/tokenizer/tokenization_internlm.py b/model_repository/postprocessing/1/tokenizer/tokenization_internlm.py new file mode 100644 index 0000000000000000000000000000000000000000..9792349c7fed6fc64476eabdd9dad7a84640c3ee --- /dev/null +++ b/model_repository/postprocessing/1/tokenizer/tokenization_internlm.py @@ -0,0 +1,240 @@ +# coding=utf-8 +# Copyright (c) InternLM. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for IntermLM.""" +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"} + +PRETRAINED_VOCAB_FILES_MAP = {} + + +class InternLMTokenizer(PreTrainedTokenizer): + """ + Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding. + + Args: + vocab_file (`str`): + Path to the vocabulary file. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + model_input_names = ["input_ids", "attention_mask"] + _auto_class = "AutoTokenizer" + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + decode_with_prefix_space=False, + clean_up_tokenization_spaces=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.decode_with_prefix_space = decode_with_prefix_space + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + self._no_prefix_space_tokens = None + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + """ Initialization""" + + @property + def no_prefix_space_tokens(self): + if self._no_prefix_space_tokens is None: + vocab = self.convert_ids_to_tokens(list(range(self.vocab_size))) + self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")} + return self._no_prefix_space_tokens + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + @property + def bos_token_id(self) -> Optional[int]: + return self.sp_model.bos_id() + + @property + def eos_token_id(self) -> Optional[int]: + return self.sp_model.eos_id() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def _maybe_add_prefix_space(self, tokens, decoded): + if tokens and tokens[0] not in self.no_prefix_space_tokens: + return " " + decoded + else: + return decoded + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + out_string = self.clean_up_tokenization(out_string) + out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string) + return out_string[1:] + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is not None: + output = output + token_ids_1 + + if self.add_eos_token: + output = output + [self.eos_token_id] + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make + use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+ """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] diff --git a/model_repository/postprocessing/1/tokenizer/tokenizer.model b/model_repository/postprocessing/1/tokenizer/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408 --- /dev/null +++ b/model_repository/postprocessing/1/tokenizer/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b +size 1477754 diff --git a/model_repository/postprocessing/1/tokenizer/tokenizer.py b/model_repository/postprocessing/1/tokenizer/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..db936a5501cb07d33d56083656dbd734ba7431bf --- /dev/null +++ b/model_repository/postprocessing/1/tokenizer/tokenizer.py @@ -0,0 +1,400 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os +import os.path as osp +from collections import deque +from typing import List, Optional, Sequence, Union + +import torch + +from lmdeploy.utils import get_logger + +# this file will be copied to triton server, make sure all +# importing are starting from the package root lmdeploy + + +class SentencePieceTokenizer: + """Tokenizer of sentencepiece. + + Args: + model_file (str): the path of the tokenizer model + """ + + def __init__(self, model_file: str): + from sentencepiece import SentencePieceProcessor + self.model = SentencePieceProcessor(model_file=model_file) + self._prefix_space_tokens = None + # for stop words + self._maybe_decode_bytes: bool = None + # TODO maybe lack a constant.py + self._indexes_tokens_deque = deque(maxlen=10) + self.max_indexes_num = 5 + self.logger = get_logger('lmdeploy') + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size() + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_id() + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_id() + + @property + def prefix_space_tokens(self): + """tokens without prefix space.""" + if self._prefix_space_tokens is None: + vocab = self.model.IdToPiece(list(range(self.vocab_size))) + self._prefix_space_tokens = { + i + for i, tok in enumerate(vocab) if tok.startswith('▁') + } + return self._prefix_space_tokens + + def _maybe_add_prefix_space(self, tokens, decoded): + """maybe add prefix space for incremental decoding.""" + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + def indexes_containing_token(self, token: str): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + # traversing vocab is time consuming, can not be accelerated with + # multi threads (computation) or multi process (can't pickle tokenizer) + # so, we maintain latest 10 stop words and return directly if matched + for _token, _indexes in self._indexes_tokens_deque: + if token == _token: + return _indexes + if token == ' ': # ' ' is special + token = '▁' + vocab = self.model.IdToPiece(list(range(self.vocab_size))) + indexes = [i for i, voc in enumerate(vocab) if token in voc] + if len(indexes) > self.max_indexes_num: + indexes = self.encode(token, add_bos=False)[-1:] + self.logger.warning( + f'There are too many(>{self.max_indexes_num}) possible ' + f'indexes 
may decoding {token}, we will use {indexes} only') + self._indexes_tokens_deque.append((token, indexes)) + return indexes + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + return self.model.Encode(s, add_bos=add_bos, **kwargs) + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + if isinstance(t, torch.Tensor): + t = t.tolist() + t = t[offset:] + out_string = self.model.Decode(t) + if offset: + out_string = self._maybe_add_prefix_space(t, out_string) + return out_string + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + import addict + add_bos = False + add_eos = False + + input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos) + return addict.Addict(input_ids=input_ids) + + +class HuggingFaceTokenizer: + """Tokenizer of sentencepiece. + + Args: + model_dir (str): the directory of the tokenizer model + """ + + def __init__(self, model_dir: str): + from transformers import AutoTokenizer + model_file = osp.join(model_dir, 'tokenizer.model') + backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json') + model_file_exists = osp.exists(model_file) + self.logger = get_logger('lmdeploy') + if not osp.exists(backend_tokenizer_file) and model_file_exists: + self.logger.warning( + 'Can not find tokenizer.json. ' + 'It may take long time to initialize the tokenizer.') + self.model = AutoTokenizer.from_pretrained(model_dir, + trust_remote_code=True) + self._prefix_space_tokens = None + # save tokenizer.json to reuse + if not osp.exists(backend_tokenizer_file) and model_file_exists: + if hasattr(self.model, 'backend_tokenizer'): + if os.access(model_dir, os.W_OK): + self.model.backend_tokenizer.save(backend_tokenizer_file) + + if self.model.eos_token_id is None: + generation_config_file = osp.join(model_dir, + 'generation_config.json') + if osp.exists(generation_config_file): + with open(generation_config_file, 'r') as f: + cfg = json.load(f) + self.model.eos_token_id = cfg['eos_token_id'] + elif hasattr(self.model, 'eod_id'): # Qwen remote + self.model.eos_token_id = self.model.eod_id + + # for stop words + self._vocab_size_with_added: int = None + self._maybe_decode_bytes: bool = None + # TODO maybe lack a constant.py + self._indexes_tokens_deque = deque(maxlen=10) + self.max_indexes_num = 5 + self.token2id = {} + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size + + @property + def vocab_size_with_added(self): + """vocabulary size with added vocab.""" + if self._vocab_size_with_added is not None: + return self._vocab_size_with_added + self._vocab_size_with_added = len(self.model.get_vocab()) + return self._vocab_size_with_added + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_token_id + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_token_id + + @property + def prefix_space_tokens(self): + """tokens without prefix space.""" + if self._prefix_space_tokens is None: + vocab = self.model.convert_ids_to_tokens( + list(range(self.vocab_size))) + self._prefix_space_tokens = { + i + for i, tok in enumerate(vocab) + if 
tok.startswith('▁' if isinstance(tok, str) else b' ') + } + return self._prefix_space_tokens + + def _maybe_add_prefix_space(self, tokens: List[int], decoded: str): + """maybe add prefix space for incremental decoding.""" + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + @property + def maybe_decode_bytes(self): + """Check if self.model.convert_ids_to_tokens return not a str value.""" + if self._maybe_decode_bytes is None: + self._maybe_decode_bytes = False + vocab = self.model.convert_ids_to_tokens( + list(range(self.vocab_size))) + for tok in vocab: + if not isinstance(tok, str): + self._maybe_decode_bytes = True + break + return self._maybe_decode_bytes + + def indexes_containing_token(self, token: str): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + # traversing vocab is time consuming, can not be accelerated with + # multi threads (computation) or multi process (can't pickle tokenizer) + # so, we maintain latest 10 stop words and return directly if matched + for _token, _indexes in self._indexes_tokens_deque: + if token == _token: + return _indexes + + if self.token2id == {}: + # decode is slower than convert_ids_to_tokens + if self.maybe_decode_bytes: + self.token2id = { + self.model.decode(i): i + for i in range(self.vocab_size) + } + else: + self.token2id = { + self.model.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } + if token == ' ': # ' ' is special + token = '▁' + indexes = [i for _token, i in self.token2id.items() if token in _token] + if len(indexes) > self.max_indexes_num: + indexes = self.encode(token, add_bos=False)[-1:] + self.logger.warning( + f'There are too many(>{self.max_indexes_num}) possible ' + f'indexes may decoding {token}, we will use {indexes} only') + # there might be token id that exceeds self.vocab_size + if len(indexes) == 0: + indexes = self.encode(token, False) + if len(indexes) != 1: + self.logger.warning( + f'The token {token}, its length of indexes {indexes} is ' + 'not 1. Currently, it can not be used as stop words') + indexes = [] + self._indexes_tokens_deque.append((token, indexes)) + return indexes + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + encoded = self.model.encode(s, **kwargs) + if not add_bos: + # in the middle of a session + if len(encoded) and encoded[0] == self.bos_token_id: + encoded = encoded[1:] + return encoded + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + skip_special_tokens = True + t = t[offset:] + out_string = self.model.decode(t, + skip_special_tokens=skip_special_tokens) + if offset: + out_string = self._maybe_add_prefix_space(t, out_string) + return out_string + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + add_special_tokens = False + return self.model(s, add_special_tokens=add_special_tokens) + + +class Tokenizer: + """Tokenize prompts or de-tokenize tokens into texts. 
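Both tokenizer wrappers decode incrementally: `decode(t, offset)` detokenizes only the new ids and, when the first new piece opens a word, re-attaches the leading space that a bare decode would drop. A minimal sketch of that pattern with a toy piece table (hypothetical ids and pieces, standing in for a real sentencepiece model):

```python
# Toy piece table standing in for a sentencepiece model.
pieces = {10: "▁Hello", 11: ",", 12: "▁world", 13: "▁again", 14: "!"}
starts_word = {i for i, p in pieces.items() if p.startswith("▁")}

def toy_decode(ids):
    return "".join(pieces[i] for i in ids).replace("▁", " ").lstrip()

full = [10, 11, 12, 13, 14]
offset = 3                      # ids[:3] were already decoded in an earlier step
new = full[offset:]
out = toy_decode(new)           # "again!" -- the word boundary was stripped
if new and new[0] in starts_word and not out.startswith(" "):
    out = " " + out             # re-attach it, as `_maybe_add_prefix_space` does
assert out == " again!"
```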
+ + Args: + model_file (str): the path of the tokenizer model + """ + + def __init__(self, model_file: str): + if model_file.endswith('.model'): + model_folder = osp.split(model_file)[0] + else: + model_folder = model_file + model_file = osp.join(model_folder, 'tokenizer.model') + tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json') + + model_file_exists = osp.exists(model_file) + config_exists = osp.exists(tokenizer_config_file) + use_hf_model = config_exists or not model_file_exists + self.logger = get_logger('lmdeploy') + if not use_hf_model: + self.model = SentencePieceTokenizer(model_file) + else: + self.model = HuggingFaceTokenizer(model_folder) + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_token_id + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_token_id + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + return self.model.encode(s, add_bos, **kwargs) + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + return self.model.decode(t, offset) + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + return self.model(s) + + def indexes_containing_token(self, token): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + encoded = self.encode(token, add_bos=False) + if len(encoded) > 1: + self.logger.warning( + f'The token {token}, its length of indexes {encoded} is over ' + 'than 1. 
Currently, it can not be used as stop words') + return [] + return self.model.indexes_containing_token(token) diff --git a/model_repository/postprocessing/1/tokenizer/tokenizer_config.json b/model_repository/postprocessing/1/tokenizer/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f133449013be570f08fdf7c70f1a2c8ccb4724da --- /dev/null +++ b/model_repository/postprocessing/1/tokenizer/tokenizer_config.json @@ -0,0 +1,90 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92538": { + "content": "<|plugin|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92539": { + "content": "<|interpreter|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92540": { + "content": "<|action_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92541": { + "content": "<|action_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92542": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92543": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "auto_map": { + "AutoTokenizer": [ + "tokenization_internlm.InternLMTokenizer", + null + ] + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "tokenizer_class": "InternLMTokenizer", + "unk_token": "" +} diff --git a/model_repository/postprocessing/config.pbtxt b/model_repository/postprocessing/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..a4c3fd1041dcd03dc5c18b3fc28533cb82ac5653 --- /dev/null +++ b/model_repository/postprocessing/config.pbtxt @@ -0,0 +1,36 @@ +name: "postprocessing" +backend: "python" +max_batch_size: 1 +input [ + { + name: "TOKENS_BATCH" + data_type: TYPE_UINT32 + dims: [ -1, -1 ] + }, + { + name: "sequence_length" + data_type: TYPE_UINT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_STRING + dims: [ -1, -1 ] + } +] + +instance_group [ + { + count: 16 + kind: KIND_CPU + } +] + +parameters { + key: "tokenizer_path" + value: { + string_value: "tokenizer/tokenizer.model" + } +} diff --git a/model_repository/preprocessing/1/__pycache__/model.cpython-310.pyc b/model_repository/preprocessing/1/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..447bea773ddcc3daff21ef636ce8437c6632fed8 Binary files /dev/null and b/model_repository/preprocessing/1/__pycache__/model.cpython-310.pyc differ diff --git a/model_repository/preprocessing/1/model.py 
b/model_repository/preprocessing/1/model.py new file mode 100644 index 0000000000000000000000000000000000000000..7e659fbae01737bd0a83980faf0e1eff9e607c3f --- /dev/null +++ b/model_repository/preprocessing/1/model.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +from pathlib import Path + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.nn.utils.rnn import pad_sequence + +# This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served +# by triton inference server, it has to be converted first by running +# `python lmdeploy/serve/turbomind/deploy.py`. Then +# `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py` +from .tokenizer.tokenizer import Tokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. + + Every Python model that is created must have "TritonPythonModel" as the + class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device + ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + self.model_config = model_config = json.loads(args['model_config']) + + # Parse model output configs and convert Triton types to numpy types + input_names = ['INPUT_ID', 'REQUEST_INPUT_LEN'] + for input_name in input_names: + setattr( + self, + input_name.lower() + '_dtype', + pb_utils.triton_string_to_numpy( + pb_utils.get_output_config_by_name( + model_config, input_name)['data_type'])) + + cur_folder = Path(__file__).parent + self.tokenizer = Tokenizer( + osp.join( + cur_folder, self.model_config['parameters']['tokenizer_path'] + ['string_value'])) + self.start_id = self.tokenizer.bos_token_id + self.end_id = self.tokenizer.eos_token_id + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for idx, request in enumerate(requests): + # Get input tensors + query = pb_utils.get_input_tensor_by_name(request, + 'QUERY').as_numpy() + + # Preprocessing input data. + input_id, request_input_len = self._create_request(query) + + # Create output tensors. 
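The `_create_request` helper further down tokenizes each prompt and right-pads the ragged id lists into one batch with `pad_sequence`. A small, self-contained sketch of that padding step (toy ids; `end_id = 2` only mirrors the eos/pad id in this repo's config.json):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

end_id = 2  # eos/pad id assumed from this repo's config.json
start_ids = [torch.IntTensor([1, 5, 9, 4]), torch.IntTensor([1, 7])]

start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])  # [[4], [2]]
batch = pad_sequence(start_ids, batch_first=True, padding_value=end_id)
# batch == tensor([[1, 5, 9, 4],
#                  [1, 7, 2, 2]], dtype=torch.int32)
```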
You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + input_id_tensor = pb_utils.Tensor( + 'INPUT_ID', + np.array(input_id).astype(self.input_id_dtype)) + request_input_len_tensor = pb_utils.Tensor( + 'REQUEST_INPUT_LEN', + np.array(request_input_len).astype( + self.request_input_len_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[input_id_tensor, request_input_len_tensor]) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + + Implementing `finalize` function is optional. This function allows the + model to perform any necessary clean ups before exit. + """ + print('Cleaning up...') + + def _create_request(self, query): + """Tokenize prompts and return the token ids and their length. + + Args: + query (List[str]): a list of prompt + Returns: + tuple: token ids and their length + """ + start_ids = [] + for s in query: + _s = s[0].decode() + if _s == '': + start_id = [self.start_id + ] if self.start_id is not None else [-1] + elif _s == '': + start_id = [self.end_id] if self.end_id is not None else [-1] + else: + start_id = self.tokenizer.encode(_s) + start_ids.append(torch.IntTensor(start_id)) + + start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids]) + start_ids = pad_sequence(start_ids, + batch_first=True, + padding_value=self.end_id) + return start_ids, start_lengths diff --git a/model_repository/preprocessing/1/tokenizer/config.json b/model_repository/preprocessing/1/tokenizer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..84235b8a1a9618cc0ac265caf61ea4088780e3b1 --- /dev/null +++ b/model_repository/preprocessing/1/tokenizer/config.json @@ -0,0 +1,37 @@ +{ + "_name_or_path": "/root/psy/internlm2-7b/work_dirs/internlm2_chat_7b_qlora_oasst1_512_e3_copy/hf_2/merge", + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "eager", + "auto_map": { + "AutoConfig": "configuration_internlm.InternLMConfig", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM" + }, + "bias": false, + "bos_token_id": 1, + "eos_token_id": 2, + "fp16": true, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "internlm", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 2, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.37.2", + "use_cache": false, + "vocab_size": 92544 +} diff --git a/model_repository/preprocessing/1/tokenizer/configuration_internlm.py b/model_repository/preprocessing/1/tokenizer/configuration_internlm.py new file mode 100644 index 0000000000000000000000000000000000000000..4d013582feaa1f9970a4256c4a0f77000fa645de --- /dev/null +++ 
b/model_repository/preprocessing/1/tokenizer/configuration_internlm.py @@ -0,0 +1,164 @@ +# coding=utf-8 +# Copyright (c) InternLM. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" InternLM model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class InternLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate + an InternLM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the InternLM-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`InternLMModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + + ```python + >>> from transformers import InternLMModel, InternLMConfig + + >>> # Initializing a InternLM internlm-7b style configuration + >>> configuration = InternLMConfig() + + >>> # Initializing a model from the internlm-7b style configuration + >>> model = InternLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "internlm" + _auto_class = "AutoConfig" + + def __init__( # pylint: disable=W0102 + self, + vocab_size=103168, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + bias=True, + rope_theta=10000, + rope_scaling=None, + attn_implementation="eager", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.bias = bias + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + + self.attn_implementation = attn_implementation + if self.attn_implementation is None: + self.attn_implementation = "eager" + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
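For reference, a `rope_scaling` value that satisfies the validation below; the shape matches the `{"type": "dynamic", "factor": 2.0}` entry in this repo's config.json. The asserts merely restate the checks for illustration and are not part of the model code:

```python
rope_scaling = {"type": "dynamic", "factor": 2.0}

# Restating the constraints enforced by `_rope_scaling_validation`:
assert isinstance(rope_scaling, dict) and len(rope_scaling) == 2
assert rope_scaling["type"] in ("linear", "dynamic")
assert isinstance(rope_scaling["factor"], float) and rope_scaling["factor"] >= 1.0
```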
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}") diff --git a/model_repository/preprocessing/1/tokenizer/generation_config.json b/model_repository/preprocessing/1/tokenizer/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5efeadd3bf2caa4462a3be79d580690f410668 --- /dev/null +++ b/model_repository/preprocessing/1/tokenizer/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2, + "transformers_version": "4.37.2" +} diff --git a/model_repository/preprocessing/1/tokenizer/modeling_internlm2.py b/model_repository/preprocessing/1/tokenizer/modeling_internlm2.py new file mode 100644 index 0000000000000000000000000000000000000000..39d6f71d2933385988ec05f845d3f6386c97f74b --- /dev/null +++ b/model_repository/preprocessing/1/tokenizer/modeling_internlm2.py @@ -0,0 +1,1385 @@ +# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on transformers/src/transformers/models/llama/modeling_llama.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch InternLM2 model.""" +import math +import queue +import threading +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from einops import rearrange +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) + +try: + from transformers.generation.streamers import BaseStreamer +except: # noqa # pylint: disable=bare-except + BaseStreamer = None + +from .configuration_internlm import InternLMConfig as InternLM2Config + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "InternLM2Config" + +flash_attn_func, flash_attn_varlen_func = None, None +pad_input, index_first_axis, unpad_input = None, None, None +def _import_flash_attn(): + global flash_attn_func, flash_attn_varlen_func + global pad_input, index_first_axis, unpad_input + try: + from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func + from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input + flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func + pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input + except ImportError: + raise ImportError("flash_attn is not installed.") + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2 +class InternLM2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + InternLM2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2 +class InternLM2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with Dynamic NTK scaling. + Credits to the Reddit users /u/bloc97 and /u/emozilla. + """ + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors.""" + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class InternLM2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x)) + + return down_proj + + +# Copied from 
transformers.model.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Modified from transformers.model.llama.modeling_llama.LlamaAttention +class InternLM2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: InternLM2Config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.wqkv = nn.Linear( + self.hidden_size, + (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, + bias=config.bias, + ) + + self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = InternLM2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "dynamic": + self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + elif scaling_type == "linear": + self.rotary_emb = InternLM2LinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + else: + raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.") + return self.rotary_emb + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
" + "Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + "b q (h gs d) -> b q h gs d", + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d") + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Modified from transformers.model.llama.modeling_llama.InternLM2FlashAttention2 +class InternLM2FlashAttention2(InternLM2Attention): + """ + InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # InternLM2FlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + "b q (h gs d) -> b q h gs d", + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d") + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len + ) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) + """ + # Contains at least one padding token in the sequence + causal = self.is_causal and query_length != 1 + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q.to(torch.int64), + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + +INTERNLM2_ATTENTION_CLASSES = { + "eager": InternLM2Attention, + "flash_attention_2": InternLM2FlashAttention2, +} + +# Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer +class InternLM2DecoderLayer(nn.Module): + def __init__(self, config: InternLM2Config): + super().__init__() + self.hidden_size = config.hidden_size + + self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config) + + self.feed_forward = InternLM2MLP(config) + self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + + hidden_states = self.attention_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.ffn_norm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +InternLM2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
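The decoder layer above follows the pre-norm residual pattern: normalize, apply the sub-layer, then add the residual, once for attention and once for the feed-forward block. A structural sketch with stand-in modules (LayerNorm and a plain Linear are placeholders for the RMSNorm layers and the gated MLP, not the real sub-layers):

```python
import torch
import torch.nn as nn

hidden = torch.randn(1, 4, 8)                                  # (batch, seq, hidden), toy sizes
attention_norm, ffn_norm = nn.LayerNorm(8), nn.LayerNorm(8)   # stand-ins for RMSNorm
attention = lambda x: x                                        # stand-in for self-attention
feed_forward = nn.Linear(8, 8)                                 # stand-in for the SwiGLU MLP

hidden = hidden + attention(attention_norm(hidden))   # residual around normalized attention
hidden = hidden + feed_forward(ffn_norm(hidden))      # residual around normalized feed-forward
```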
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`InternLM2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2 +@add_start_docstrings( + "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.", + InternLM2_START_DOCSTRING, +) +class InternLM2PreTrainedModel(PreTrainedModel): + config_class = InternLM2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["InternLM2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +InternLM2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or + when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. 
+ + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Modified from transformers.model.llama.modeling_llama.LlamaModel +@add_start_docstrings( + "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.", + InternLM2_START_DOCSTRING, +) +class InternLM2Model(InternLM2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`InternLM2DecoderLayer`] + + Args: + config: InternLM2Config + """ + + _auto_class = "AutoModel" + + def __init__(self, config: InternLM2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + + self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + + self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.tok_embeddings + + def set_input_embeddings(self, value): + self.tok_embeddings = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.attn_implementation == "flash_attention_2": + _import_flash_attn() + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, 
dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.tok_embeddings(input_ids) + + if self.config.attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +# Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM +class InternLM2ForCausalLM(InternLM2PreTrainedModel): + _auto_class = "AutoModelForCausalLM" + + _tied_weights_keys = ["output.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = InternLM2Model(config) + self.vocab_size = config.vocab_size + self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + def get_output_embeddings(self): + return self.output + + def set_output_embeddings(self, new_embeddings): + self.output = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + 
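As an illustrative aside (not part of this file's diff): the `use_cache` / `past_key_values` machinery documented above can be driven manually for incremental greedy decoding. Below is a minimal sketch, assuming a converted InternLM2 checkpoint at a placeholder path and `trust_remote_code=True` so the custom classes defined in this file are loaded; the path, prompt, and step count are placeholders, not values from this repository.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/internlm2"  # hypothetical local checkpoint directory
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).eval()

input_ids = tokenizer("Hello", return_tensors="pt").input_ids
past_key_values = None
with torch.no_grad():
    for _ in range(8):
        out = model(input_ids=input_ids,
                    past_key_values=past_key_values,
                    use_cache=True)
        # Greedy choice over the logits of the last position only.
        next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
        past_key_values = out.past_key_values  # cache grows by one position per step
        input_ids = next_token                 # only the new token is fed back
```

Feeding back only the newly generated token works because the model rebuilds `position_ids` from the cached length (`past_key_values_length`), which is also what `prepare_inputs_for_generation` below relies on when `generate` trims the prompt to the last input ID.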
@add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, InternLM2ForCausalLM + + >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.output(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + past_length = 
past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""): + prompt = "" + if meta_instruction: + prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""" + else: + prompt += "" + for record in history: + prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n""" + prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""" + return tokenizer([prompt], return_tensors="pt") + + @torch.no_grad() + def chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + streamer: Optional[BaseStreamer] = None, + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n" + "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). 
It is designed to be helpful, honest, and harmless.\n" + "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.", + **kwargs, + ): + inputs = self.build_inputs(tokenizer, query, history, meta_instruction) + inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)} + # also add end-of-assistant token in eos token id to avoid unnecessary generation + eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]] + outputs = self.generate( + **inputs, + streamer=streamer, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + eos_token_id=eos_token_id, + **kwargs, + ) + outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :] + response = tokenizer.decode(outputs, skip_special_tokens=True) + response = response.split("<|im_end|>")[0] + history = history + [(query, response)] + return response, history + + @torch.no_grad() + def stream_chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + **kwargs, + ): + """ + Return a generator in format: (response, history) + Eg. + ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) + ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) + """ + if BaseStreamer is None: + raise ModuleNotFoundError( + "The version of `transformers` is too low. Please make sure " + "that you have installed `transformers>=4.28.0`." + ) + + response_queue = queue.Queue(maxsize=20) + + class ChatStreamer(BaseStreamer): + def __init__(self, tokenizer) -> None: + super().__init__() + self.tokenizer = tokenizer + self.queue = response_queue + self.query = query + self.history = history + self.response = "" + self.received_inputs = False + self.queue.put((self.response, history + [(self.query, self.response)])) + + def put(self, value): + if len(value.shape) > 1 and value.shape[0] > 1: + raise ValueError("ChatStreamer only supports batch size 1") + elif len(value.shape) > 1: + value = value[0] + + if not self.received_inputs: + # The first received value is input_ids, ignore here + self.received_inputs = True + return + + token = self.tokenizer.decode([value[-1]], skip_special_tokens=True) + if token.strip() != "<|im_end|>": + self.response = self.response + token + history = self.history + [(self.query, self.response)] + self.queue.put((self.response, history)) + + def end(self): + self.queue.put(None) + + def stream_producer(): + return self.chat( + tokenizer=tokenizer, + query=query, + streamer=ChatStreamer(tokenizer=tokenizer), + history=history, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + **kwargs, + ) + + def consumer(): + producer = threading.Thread(target=stream_producer) + producer.start() + while True: + res = response_queue.get() + if res is None: + return + yield res + + return consumer() + + +# Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2 +@add_start_docstrings( + """ + The InternLM2 Model transformer with a sequence classification head on top (linear layer). + + [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, + as other causal models (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. 
If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + InternLM2_START_DOCSTRING, +) +class InternLM2ForSequenceClassification(InternLM2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = InternLM2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to( + logits.device + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/model_repository/preprocessing/1/tokenizer/placeholder b/model_repository/preprocessing/1/tokenizer/placeholder new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model_repository/preprocessing/1/tokenizer/pytorch_model.bin.index.json b/model_repository/preprocessing/1/tokenizer/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..7d95cf180df4c423e817c55f30f5ce93ac80e220 --- /dev/null +++ b/model_repository/preprocessing/1/tokenizer/pytorch_model.bin.index.json @@ -0,0 +1,554 @@ +{ + "metadata": { + "total_size": 5251801088 + }, + "weight_map": { + "model.layers.0.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + 
"model.layers.0.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.10.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + 
"model.layers.11.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.qweight": 
"pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.qzeros": 
"pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + 
"model.layers.18.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.2.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.20.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + 
"model.layers.20.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.scales": "pytorch_model-00002-of-00003.bin", 
+ "model.layers.23.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.qweight": 
"pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.27.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.qweight": 
"pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.3.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.30.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wo.qzeros": 
"pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.4.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.scales": 
"pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + 
"model.layers.7.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.norm.weight": "pytorch_model-00003-of-00003.bin", + "model.tok_embeddings.weight": 
"pytorch_model-00001-of-00003.bin", + "output.weight": "pytorch_model-00003-of-00003.bin" + } +} diff --git a/model_repository/preprocessing/1/tokenizer/special_tokens_map.json b/model_repository/preprocessing/1/tokenizer/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..492d4b2966a1763442d426d880dbc29f94906e4c --- /dev/null +++ b/model_repository/preprocessing/1/tokenizer/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/model_repository/preprocessing/1/tokenizer/tokenization_internlm.py b/model_repository/preprocessing/1/tokenizer/tokenization_internlm.py new file mode 100644 index 0000000000000000000000000000000000000000..9792349c7fed6fc64476eabdd9dad7a84640c3ee --- /dev/null +++ b/model_repository/preprocessing/1/tokenizer/tokenization_internlm.py @@ -0,0 +1,240 @@ +# coding=utf-8 +# Copyright (c) InternLM. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for IntermLM.""" +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"} + +PRETRAINED_VOCAB_FILES_MAP = {} + + +class InternLMTokenizer(PreTrainedTokenizer): + """ + Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding. + + Args: + vocab_file (`str`): + Path to the vocabulary file. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + model_input_names = ["input_ids", "attention_mask"] + _auto_class = "AutoTokenizer" + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + decode_with_prefix_space=False, + clean_up_tokenization_spaces=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.decode_with_prefix_space = decode_with_prefix_space + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + self._no_prefix_space_tokens = None + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + """ Initialization""" + + @property + def no_prefix_space_tokens(self): + if self._no_prefix_space_tokens is None: + vocab = self.convert_ids_to_tokens(list(range(self.vocab_size))) + self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")} + return self._no_prefix_space_tokens + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + @property + def bos_token_id(self) -> Optional[int]: + return self.sp_model.bos_id() + + @property + def eos_token_id(self) -> Optional[int]: + return self.sp_model.eos_id() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def _maybe_add_prefix_space(self, tokens, decoded): + if tokens and tokens[0] not in self.no_prefix_space_tokens: + return " " + decoded + else: + return decoded + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + out_string = self.clean_up_tokenization(out_string) + out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string) + return out_string[1:] + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is not None: + output = output + token_ids_1 + + if self.add_eos_token: + output = output + [self.eos_token_id] + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make + use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+ """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] diff --git a/model_repository/preprocessing/1/tokenizer/tokenizer.model b/model_repository/preprocessing/1/tokenizer/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408 --- /dev/null +++ b/model_repository/preprocessing/1/tokenizer/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b +size 1477754 diff --git a/model_repository/preprocessing/1/tokenizer/tokenizer.py b/model_repository/preprocessing/1/tokenizer/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..db936a5501cb07d33d56083656dbd734ba7431bf --- /dev/null +++ b/model_repository/preprocessing/1/tokenizer/tokenizer.py @@ -0,0 +1,400 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os +import os.path as osp +from collections import deque +from typing import List, Optional, Sequence, Union + +import torch + +from lmdeploy.utils import get_logger + +# this file will be copied to triton server, make sure all +# importing are starting from the package root lmdeploy + + +class SentencePieceTokenizer: + """Tokenizer of sentencepiece. + + Args: + model_file (str): the path of the tokenizer model + """ + + def __init__(self, model_file: str): + from sentencepiece import SentencePieceProcessor + self.model = SentencePieceProcessor(model_file=model_file) + self._prefix_space_tokens = None + # for stop words + self._maybe_decode_bytes: bool = None + # TODO maybe lack a constant.py + self._indexes_tokens_deque = deque(maxlen=10) + self.max_indexes_num = 5 + self.logger = get_logger('lmdeploy') + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size() + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_id() + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_id() + + @property + def prefix_space_tokens(self): + """tokens without prefix space.""" + if self._prefix_space_tokens is None: + vocab = self.model.IdToPiece(list(range(self.vocab_size))) + self._prefix_space_tokens = { + i + for i, tok in enumerate(vocab) if tok.startswith('▁') + } + return self._prefix_space_tokens + + def _maybe_add_prefix_space(self, tokens, decoded): + """maybe add prefix space for incremental decoding.""" + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + def indexes_containing_token(self, token: str): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + # traversing vocab is time consuming, can not be accelerated with + # multi threads (computation) or multi process (can't pickle tokenizer) + # so, we maintain latest 10 stop words and return directly if matched + for _token, _indexes in self._indexes_tokens_deque: + if token == _token: + return _indexes + if token == ' ': # ' ' is special + token = '▁' + vocab = self.model.IdToPiece(list(range(self.vocab_size))) + indexes = [i for i, voc in enumerate(vocab) if token in voc] + if len(indexes) > self.max_indexes_num: + indexes = self.encode(token, add_bos=False)[-1:] + self.logger.warning( + f'There are too many(>{self.max_indexes_num}) possible ' + f'indexes may 
decoding {token}, we will use {indexes} only') + self._indexes_tokens_deque.append((token, indexes)) + return indexes + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + return self.model.Encode(s, add_bos=add_bos, **kwargs) + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + if isinstance(t, torch.Tensor): + t = t.tolist() + t = t[offset:] + out_string = self.model.Decode(t) + if offset: + out_string = self._maybe_add_prefix_space(t, out_string) + return out_string + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + import addict + add_bos = False + add_eos = False + + input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos) + return addict.Addict(input_ids=input_ids) + + +class HuggingFaceTokenizer: + """Tokenizer of sentencepiece. + + Args: + model_dir (str): the directory of the tokenizer model + """ + + def __init__(self, model_dir: str): + from transformers import AutoTokenizer + model_file = osp.join(model_dir, 'tokenizer.model') + backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json') + model_file_exists = osp.exists(model_file) + self.logger = get_logger('lmdeploy') + if not osp.exists(backend_tokenizer_file) and model_file_exists: + self.logger.warning( + 'Can not find tokenizer.json. ' + 'It may take long time to initialize the tokenizer.') + self.model = AutoTokenizer.from_pretrained(model_dir, + trust_remote_code=True) + self._prefix_space_tokens = None + # save tokenizer.json to reuse + if not osp.exists(backend_tokenizer_file) and model_file_exists: + if hasattr(self.model, 'backend_tokenizer'): + if os.access(model_dir, os.W_OK): + self.model.backend_tokenizer.save(backend_tokenizer_file) + + if self.model.eos_token_id is None: + generation_config_file = osp.join(model_dir, + 'generation_config.json') + if osp.exists(generation_config_file): + with open(generation_config_file, 'r') as f: + cfg = json.load(f) + self.model.eos_token_id = cfg['eos_token_id'] + elif hasattr(self.model, 'eod_id'): # Qwen remote + self.model.eos_token_id = self.model.eod_id + + # for stop words + self._vocab_size_with_added: int = None + self._maybe_decode_bytes: bool = None + # TODO maybe lack a constant.py + self._indexes_tokens_deque = deque(maxlen=10) + self.max_indexes_num = 5 + self.token2id = {} + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size + + @property + def vocab_size_with_added(self): + """vocabulary size with added vocab.""" + if self._vocab_size_with_added is not None: + return self._vocab_size_with_added + self._vocab_size_with_added = len(self.model.get_vocab()) + return self._vocab_size_with_added + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_token_id + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_token_id + + @property + def prefix_space_tokens(self): + """tokens without prefix space.""" + if self._prefix_space_tokens is None: + vocab = self.model.convert_ids_to_tokens( + list(range(self.vocab_size))) + self._prefix_space_tokens = { + i + for i, tok in enumerate(vocab) + if 
tok.startswith('▁' if isinstance(tok, str) else b' ') + } + return self._prefix_space_tokens + + def _maybe_add_prefix_space(self, tokens: List[int], decoded: str): + """maybe add prefix space for incremental decoding.""" + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + @property + def maybe_decode_bytes(self): + """Check if self.model.convert_ids_to_tokens return not a str value.""" + if self._maybe_decode_bytes is None: + self._maybe_decode_bytes = False + vocab = self.model.convert_ids_to_tokens( + list(range(self.vocab_size))) + for tok in vocab: + if not isinstance(tok, str): + self._maybe_decode_bytes = True + break + return self._maybe_decode_bytes + + def indexes_containing_token(self, token: str): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + # traversing vocab is time consuming, can not be accelerated with + # multi threads (computation) or multi process (can't pickle tokenizer) + # so, we maintain latest 10 stop words and return directly if matched + for _token, _indexes in self._indexes_tokens_deque: + if token == _token: + return _indexes + + if self.token2id == {}: + # decode is slower than convert_ids_to_tokens + if self.maybe_decode_bytes: + self.token2id = { + self.model.decode(i): i + for i in range(self.vocab_size) + } + else: + self.token2id = { + self.model.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } + if token == ' ': # ' ' is special + token = '▁' + indexes = [i for _token, i in self.token2id.items() if token in _token] + if len(indexes) > self.max_indexes_num: + indexes = self.encode(token, add_bos=False)[-1:] + self.logger.warning( + f'There are too many(>{self.max_indexes_num}) possible ' + f'indexes may decoding {token}, we will use {indexes} only') + # there might be token id that exceeds self.vocab_size + if len(indexes) == 0: + indexes = self.encode(token, False) + if len(indexes) != 1: + self.logger.warning( + f'The token {token}, its length of indexes {indexes} is ' + 'not 1. Currently, it can not be used as stop words') + indexes = [] + self._indexes_tokens_deque.append((token, indexes)) + return indexes + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + encoded = self.model.encode(s, **kwargs) + if not add_bos: + # in the middle of a session + if len(encoded) and encoded[0] == self.bos_token_id: + encoded = encoded[1:] + return encoded + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + skip_special_tokens = True + t = t[offset:] + out_string = self.model.decode(t, + skip_special_tokens=skip_special_tokens) + if offset: + out_string = self._maybe_add_prefix_space(t, out_string) + return out_string + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + add_special_tokens = False + return self.model(s, add_special_tokens=add_special_tokens) + + +class Tokenizer: + """Tokenize prompts or de-tokenize tokens into texts. 
+ + Args: + model_file (str): the path of the tokenizer model + """ + + def __init__(self, model_file: str): + if model_file.endswith('.model'): + model_folder = osp.split(model_file)[0] + else: + model_folder = model_file + model_file = osp.join(model_folder, 'tokenizer.model') + tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json') + + model_file_exists = osp.exists(model_file) + config_exists = osp.exists(tokenizer_config_file) + use_hf_model = config_exists or not model_file_exists + self.logger = get_logger('lmdeploy') + if not use_hf_model: + self.model = SentencePieceTokenizer(model_file) + else: + self.model = HuggingFaceTokenizer(model_folder) + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_token_id + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_token_id + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + return self.model.encode(s, add_bos, **kwargs) + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + return self.model.decode(t, offset) + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + return self.model(s) + + def indexes_containing_token(self, token): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + encoded = self.encode(token, add_bos=False) + if len(encoded) > 1: + self.logger.warning( + f'The token {token}, its length of indexes {encoded} is over ' + 'than 1. 
Currently, it can not be used as stop words') + return [] + return self.model.indexes_containing_token(token) diff --git a/model_repository/preprocessing/1/tokenizer/tokenizer_config.json b/model_repository/preprocessing/1/tokenizer/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f133449013be570f08fdf7c70f1a2c8ccb4724da --- /dev/null +++ b/model_repository/preprocessing/1/tokenizer/tokenizer_config.json @@ -0,0 +1,90 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92538": { + "content": "<|plugin|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92539": { + "content": "<|interpreter|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92540": { + "content": "<|action_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92541": { + "content": "<|action_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92542": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92543": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "auto_map": { + "AutoTokenizer": [ + "tokenization_internlm.InternLMTokenizer", + null + ] + }, + "bos_token": "<s>", + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "</s>", + "tokenizer_class": "InternLMTokenizer", + "unk_token": "<unk>" +} diff --git a/model_repository/preprocessing/config.pbtxt b/model_repository/preprocessing/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..997ba399ba04f1f521bdbf088815d1dd3c26f696 --- /dev/null +++ b/model_repository/preprocessing/config.pbtxt @@ -0,0 +1,37 @@ +name: "preprocessing" +backend: "python" +max_batch_size: 1 + +input [ + { + name: "QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + } +] +output [ + { + name: "INPUT_ID" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "REQUEST_INPUT_LEN" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 4 + kind: KIND_CPU + } +] + +parameters { + key: "tokenizer_path" + value: { + string_value: "tokenizer/tokenizer.model" + } +} diff --git a/model_repository/turbomind/1/placeholder b/model_repository/turbomind/1/placeholder new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model_repository/turbomind/1/weights/config.ini b/model_repository/turbomind/1/weights/config.ini new file mode 100644 index 0000000000000000000000000000000000000000..88f3d40970a1e663689736be546f8d3d64bb8734 --- /dev/null
+++ b/model_repository/turbomind/1/weights/config.ini @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c8358cd3fffcb86829f6b600bdd0ba77b6147eed572f88700ec4d914db070d6 +size 645 diff --git a/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..4f5435a75963ce7ce17b0536f500c8ebf8ca4220 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1763929a6e7bbdafdb81d39ebfa08263351ccea12347aa68b292b1b7c458e45 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..52107ec494683ad0e0403e4189bcceed1ceabdcb --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed40e83191f5304fd2df93ff5b90ae9a165bbe489af8020e06948fbbb289d7d +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.0.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.0.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6e21231bbe43b92e43a0d2600ed6969f6c00e767 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6710235be94402052aaaae809e488f433d75d6d33acf546e2d0bf7aae4d8f0f +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.0.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.0.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4961bf6cfbf6ae7592675c56d719924794d8da68 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c069c91ef3a796ac2e9e0230319fabb6bc8433c68284c6e5ca71baa477a3438 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.0.attention_norm.weight b/model_repository/turbomind/1/weights/layers.0.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..51dd734ab95204a4ce7fd026707a375f1a85219f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dde3cfe82d02d87660f40c667186249cd17a5ee5924ab2a3ea0385919a2d0f3b +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f3167a75e6defd59aa396437f58c797bb5cf1b2c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26bc912102aa2b487baf312f3bfd8f97dc46ba6761c2328bfd3e45581bfbcfd4 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..68343cbdcbc17ec725af43c1a1d53b62bc5c32c0 
--- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309c93937a8778e4e4dce879efd1e0673f4bb7701644628abbaa8420e5b24cf0 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3e82c77a6ba7b16d19d55f544f872223d33fba6d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d096d08769d4b05f7483b4ed024224e0d4d35772231e757157e69c9c0dc1c6ef +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..fee7031bc4703588c99d993aaf4e1c0f1d080e5b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb73c0a0f614f1033850266d6ff4311374557a2653e0fa7857f8507ca87058e +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.0.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.0.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..e8f321d4e16161bcdf7f2b6979e9f90b8aa04ef3 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b414270e0d50fbec62cdab6ecd217c2f688872d5ed7d9f91bb75dfff46651b +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.0.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.0.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..e376c6acc6ad65b07267f834beda69a889c5f0b1 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.0.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f7250671024d0129c45c3f3d8f57887921d219c280350697d41e9170925c77 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bb3ebc7beaa1d925c4a14fbad6d2df2ec6bad94f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a125e82d7ee989858902abca2bec9dc3f4ad74008f5307a1e7a635d148c53f3a +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..bc0ed1f6f8ef00629e07ce4989e2ddde96723c08 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f96d91127194d8a8404809f81602727e59903c86473ee27012bb303f83cdf77 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.1.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.1.attention.wo.0.qweight new file mode 100644 index 
0000000000000000000000000000000000000000..2eaa43207863db980e17ed160bc4613b175baf27 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4905342d79812e6bd9d6d993443ee6b30df2f80cef44176d1398dc884c458bad +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.1.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.1.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c136a82b25947dc950216cf643734a4a5ee81a36 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c7971bdedd76bbe5630fd97b2badbdd26d22055ffe6fe0374fff051af9feb80 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.1.attention_norm.weight b/model_repository/turbomind/1/weights/layers.1.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..abe49b3b4fe282cbcf269cc92e4a1b03f8304d1b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d589a6b27b707580d37c4b198dc952071bb1a34967ebd9175f9055ac012bc781 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..7d2bbd8d926a99dd1ba3adf0859660ace736b884 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dd761cf75a1f95c5a55a245fbe1a8bca8967be0d7a03dd12108d0be835d7682 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..9fb67e07dca86f3c043855b520b84ed83c9b4930 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4fdfeee03517f7896aadab5adec50c8449a2e1bda2f0cf5b8725b26057d1f6 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..83348571bf69b92747b68f25d3755c7b2146e4c5 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0c42be27fe2e9f48473b5cc4ec63cd06575ade857ea8699b4bd05eb4f801dc6 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7f8d31081aee57241eed23ae114dd5e39f9e6bbf --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe915a8697f98fe80270d235325b469219fac1c8a4529052fd15f6b1ee8f13e6 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.1.ffn_norm.weight 
b/model_repository/turbomind/1/weights/layers.1.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6db47869baaf62ea10c904bb39ca2fd8dcb35aa5 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90fa27f32ad04b368d7110fb689b24ea02904efb2f2b7a9f9be876c331fc7212 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.1.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.1.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..87ba80c2080cfc64bd645133d99c4fb0f602b920 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.1.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08456e5241a0fbd14699cb889680261c9e0ca7d30051066d899e99be24e15d52 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..35f6c98510eb157f0971d9d241b2ec765cd3c834 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4d8d7ae69eea66730a10e906758105f2c99b16d082b9ea84d7e7cd8afcdbd4c +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..77eb52490f504dbd5b089674f267142c27e7acc0 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2885240377b91bd85bbe4ee6f67b8ca23233584c35ce71b752f9f3bbb66e266c +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.10.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.10.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..440d3e309d85cdfb81736fd024a2834f4d0ce308 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae5115820467dcb2720eeb7abbdaf3ecd5edb56d9d7453fb0bf4f6b65323445a +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.10.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.10.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..18b5ecc65f6f8133a1821de0925d37622a67af48 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4438217ed5de15cb91f4e30f0644b08952e981d25015dd4b75c4a0cae83517c2 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.10.attention_norm.weight b/model_repository/turbomind/1/weights/layers.10.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..4f0f39a02bb84010dd644e2fc96ef3b46d4c2820 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd2c0d884542c0a881ef8fcfc9fbcc1feb67afbff0a8befc9bb741e2d8ea2af +size 8192 diff --git 
a/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bf50b623e7b1f4520d761286edd1db51a109c4c6 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1258ea1e97e4c41db26a363eddedd3bd47c6d49f7bf738703c5746c54f4e37 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..ee36f684587a649d68d9579441ca3e90af8d7d6e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e7492a7d4447980961b5891a0997f2568bdbe10ed15ba0998f8ca1bdaf0a4c +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b0cce8413321f6074dc61c7a28bc92377f4c7ab2 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fb81b3c6a3f7b674506b003621b7e92925754e97d23ecb1209003f2232e33cb +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..ce1603f2d10d9ae9ef7251cb66a02c3e0cba6b67 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773b9c8eb4a3818b2667162b3169bd4fe813f2fcba5c708a49b79fa5c5053c61 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.10.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.10.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..bbe9a16316f0db34745e41ef00224f94b9237fee --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b576f4d059d0f37a4fd3e626e640dad540ff4758aa449bafe55a78046a01dc9b +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.10.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.10.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..da0421db9e924c29c37c13c09376487aaa383c8d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.10.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430d675f2f2e4512591d558ea6f29e42dd38c55ffcd8d21873a12e9ff90e15b2 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d5058e0b21a7342d2379f3a9315e85ef9bbe7682 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2871ddd112a88bb89a549de3bf1c53af525e962e118eb7ad0feac6a56599a26e +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..92844164ec6f5b42e8222c577ce94bae5314a9c9 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.11.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7017bdedc110df3a9f9fab19466968a5488b9ab3ad533f0908f2d368371adb +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.11.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.11.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c67e6d4b3e11faa456791b77155fef70589e246f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.11.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:530e3110fadceb664c29ff9da577cf401128e93ae21601affd1c62137b04db35 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.11.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.11.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4e0d310e48ae8ebd9b629872134eb3687a55e341 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.11.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1725da8fac86700a95c4ee9d40cf9ebf0d1ebabb4b145c2d57c4a31c42299cb8 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.11.attention_norm.weight b/model_repository/turbomind/1/weights/layers.11.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..f57dfc1e256d2fca8f1c8d59982ea28fb2f209c8 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.11.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cb24612b49347f84741d6daab9a90b828aab924fc9b21fd2d2ca6b67abf8ea8 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..27905dc8bb55b6305cefdf0135d72eda3e7e17d9 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0af7f58d1e58e6610b5b56291bf697d79471c1eeaefdff9466fdc87996c3c86 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..474796975c206470856a63e5627806fdd1a9d0e4 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.11.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46e2d6846839f995e9434c35519a1152c52285d29672febe66e9f07b0e7523e5 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b8e4a4f967601a2151a7eb5da1c126599eea4743 --- /dev/null +++ 
b/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ae182cb83af72cac11a76113fc5492ae4ccda1cd45df36facac10e65369d22c +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..aac9a3ac0afb93d279461dacd82e1fd80dfb6161 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.11.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54af6ef8d3b0aaa32183d5fb176a4d2097bd043e44ebea37ba43ac4021e18253 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.11.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.11.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6f958acb3e97bbc263ba99adb14ceb897dc7e573 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.11.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ae646b4e03481a9e0eccf0a151deeae360012b79d455f413d6b4c8c05ead016 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.11.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.11.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..3bf7aed58e43958ad08d6b6e8beffe072f7e15e6 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.11.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:114046d9b18a39823a18019529563163f191e5a74c65e959db74c96b77c9b4b9 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b026bcfd8643c18461670a5a2980cf9a8539bb2b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d30b7fa1db362abf3186072da75c305cd7e79f90f4b1eea6095014d9f7989da7 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..011903f321dd322447298b693e1eedb17f35c3ac --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:654fe994288ed138b388cb0e14a9c4e7124b601ac4efa404788e3267ed137307 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.12.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.12.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fd89f748d1ea906c6617d240a4e123d243105b64 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069d9e054d6cd0171b229e37a70b6a2fca364783cc8e80de9f81060931964e0b +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.12.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.12.attention.wo.0.scales_zeros new file mode 100644 index 
0000000000000000000000000000000000000000..b46cd92e96aa0e40ba260aea37674bdb9fbf1fd6 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:394968e46096fa0f50701fe0d09193561276359f023ea5dbc3a16bb3f1aff8b8 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.12.attention_norm.weight b/model_repository/turbomind/1/weights/layers.12.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..0020f8c429974d047571347728c95d5259c0da58 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:020a5a9ed0a5065303d1079d24ce7252b639f6f76bf49c7b8fb5fac3bc93fc1b +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f5cd9ca940d4417db1082cb6b445b56fc3ed304e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9293f916e4009deb3dd715ac0fea08afe5be75548d2fe2e70a67fd5826664cea +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..be6c9b7b29a56d2d3afaec63b36099fc29d1ba80 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89899a4751211dda4328e2380ceec5d62d0d0b13fd164ccb7c9f5e189409a08f +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..53e4822e263ce179450dcfacefe7dd882447324d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0f0481d3c7eeecc2717614f38dcd54163c287431e82da95a1e8d5fd182cc27 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..2f8d90a6c38370788887ee529f4ad8c7b4fd6593 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690b11e4c0f825ec39db6b53fc1ccdd51d051c752199195f2cff8079ef3b980d +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.12.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.12.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..35e00aeee302ec1726ef04c71f2a2f429fe0d23e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6abd982c6b4b398f13a6113cfaefff0fe65190ff1b232c8b9a68acb30fbfdb +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.12.past_kv_scale.0.weight 
b/model_repository/turbomind/1/weights/layers.12.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..8fb69a827363200f7cd82be1b4f35bab6e143bb7 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.12.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3cee21f879722a16a454f6455c8d8c3aec77cbfdba6cbebac9c4762d1d03bb2 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..63d098e6067e1aac3d4f6083c34f967abcfb40f4 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:983fa35043fba20d8f39610fc859862486472388df708d85176e198b9493f194 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f78fb596aaf17a70c0fc17098a02d2fbd9f8b12e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcfbdb8a6f2d86500e49d21e3d0cf88dda2e18b505be8459e46962f1a5403902 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.13.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.13.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d0443fc30519b3ca74b5e3d4e0317af1dbe8b32d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e76d5b55510b3111a4c8068f8bf2abe8372c9868a5346fd03831633817f49a3 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.13.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.13.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..6cbcd17aed1ae804e9e87a936274b99c9ad81296 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da85282928c5b1723c48e93cdadc416b400deb61bb90f28c4675989ab7d2f4f8 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.13.attention_norm.weight b/model_repository/turbomind/1/weights/layers.13.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..58edee2f8e729e06965c92f434900ae4f75e1a49 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:592d7039e973372cadcf8b3f717c19ecbcb911e2f40140d617855643bf2bfa3f +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..0f2f191246be551220b2b9df11e88d070f4b63c7 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1cbe619508e858a2637045e1e07f9cb0ec4c6020d6041e40bc9558aaa9fd290 +size 58720256 diff --git 
a/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8114a135ab96b7c28393bb44bad7050a71bd712c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c555740ee91741c87411db09bc23b419caa191a4ac0ccf7e34b00fe64e614493 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..efc53988aa0826924baa6153c20d1fb1abae3183 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5434cecf17636b9bbdf1df6ae4b6d1eb6c06a611c93fe0291ad0d3892d850a81 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c927886fb77c90e7e2afb11bb38945c179e779cd --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89194f222aef9d0488e0677d654d9f4cc783cebad2ba76e9013ef99684a1c2c +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.13.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.13.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..0044a510f007c3e66e363ee02bbc25f4c26cb6a6 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75cc6d0e292ec019791db0f7ef63b0508d8a5d19404fadb09c1b06a8dcae7cdb +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.13.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.13.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..313f047a7db61ca9b3fed45b948aad24958ec896 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.13.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86a948027461837c94daa03c444ddaa2a484bdadcab47a89f78d0d332ba0370 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d34a88071016d52838a914b177b787d6b7f5e989 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd65317b8701a195eabe835058a9366309ad055eebd4354fe994187573dcfcb4 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..dbf55a9dd11b2bb29fb5f7a2ec180b89f6372195 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.14.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5a8b7af909bb0ee02940f92c80cde0a7a869e60bd4778c7eb5934ed7134b1e56 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.14.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.14.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f2e7385fd3b0a6c38260980964dfd035abe25f95 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.14.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f17aa0c464ae8e87100f9946574744e554c50847775d5e3cc888584c920b51bf +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.14.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.14.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cca81645ed7af2fd8f2039c751f0856ab6332929 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.14.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac63fb5629b386babfc0cf09324e8388735c894def38688f57e5fa413a76a6b6 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.14.attention_norm.weight b/model_repository/turbomind/1/weights/layers.14.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..a2e5c82b9d622524d9390c76957ed9e8994aa2b8 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.14.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d54e43cc40808a7a12fb34802e7e3fa239938943e4f247ea54556f65191e0e +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..efb7ccb2234e6b179d310051c53ba547a39f7b6b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f974af156ac932cd0619e0e86095071dccc8cd0608319df5c1042492b2002e9d +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d916976c94c174148b04db334b907ec77c7d638 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.14.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5be3c8f04a42c5e0c9de9d00508fbb981849cf188dba80cf6127d8f4b4b712d +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c926dcac71d930076be55189beacbb36cfb1a777 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c33e3534172410d4656b1a244becc400d680dc19664a6fe5d2531f0733b24b1 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..78c574771e660fcfc3a237c9d56afe57b62f1ea0 --- /dev/null +++ 
b/model_repository/turbomind/1/weights/layers.14.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3be2e077ef369c828ac8f31826249f327d120baaaf9d0141f67b9a814f95a57b +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.14.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.14.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3094bf1d424cd5ba8300cb6dddb32e4bc9d78073 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.14.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb3dd1a12abaf094e03a1d933aa4ab506d5c4c0cd21cf0802c04f4a0d5a85c7 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.14.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.14.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..a1ff0007bbe4e1f0abfdccce67158196a9b3ba13 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.14.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39dfb751ce93881ea2c4e2f68155583024cfcf9e85b5705781348b079cc29b0d +size 16 diff --git a/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..8d981e2ef18ba6fa67894151d2e5d33aec76e769 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f2d6afe6100ef0eb47d5b379ce3faa38ec1063ba36d47d9526647ea7fa4bda2 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..92d62c8db383b4e459224b1370a1d87eaa416096 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.15.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8abb8c1bad2acba915885821b231c1884cd63fd978d62d23a25775671c97f9b +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.15.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.15.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..43781b59b7834c4758226fadd3757cd458eb9001 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.15.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fca2dec7e83b35a6b582edfc05ddf49890b234aeba53a3d88384a436cc96c4c1 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.15.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.15.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..51a58827bb1c84c5a11deab1134c99e4cd37f472 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.15.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bb55b56df6d0d2c1f6f04d894e5d6e63d476b8fffe1dd0441a892eed850502 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.15.attention_norm.weight b/model_repository/turbomind/1/weights/layers.15.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..7e895dc7fffaa82cf585391595f009adf667e4cd --- 
/dev/null +++ b/model_repository/turbomind/1/weights/layers.15.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06c4e4b6e08466593216c5fffe5bb16fbe296be7d83b8d67084a728b4f0d26d0 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..8dfc85e4b6b9e369447163acf76550539913fb5a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b271e071ebc5f1e37284433f76d394ee2ba20920d64e64355f6c37672bd68f3 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c0f10138fba546a8c454600fd6a73289e0a7f8fd --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.15.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42f1cdd3b5b76e04cd4154950ade000eff8bfc44853c827ff351d00526201bc +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e0d0b67b1d9d4d9530690ac220e426dedaddb1fc --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c44d9731ffc2bbd8a368f60064a8e8e85f50b04677d059c25fce70aae38dc81 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a99be30bc9c12257d3764ef09722a06f15ef0437 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.15.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:287e909a7bd9bcc0b456c57c361a614c1898383785bccf9f57eee7f91599e3b3 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.15.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.15.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..630c4372de835971e521542c84649a00c3b2e403 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.15.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8dafc8ea6132b5caec667dde3f6dda741e7ff23e40b8ff5f5ccc59232ca434b +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.15.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.15.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..a47b7192fa2a190ceb02a526a527aed679e93740 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.15.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c945e5779fcddbf5dff47a4c3502bce9ba0bace5158abc583e852d1418f9513a +size 16 diff --git a/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.qweight new file mode 100644 index 
0000000000000000000000000000000000000000..b17d911138bd69b5faa2b303479e7cca9c12b659 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf8c2d841b0c3dfd0a4349bb4aa84c0d85141c14277e879c033484e225096715 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..bd4333af13bff4ad87c753e24461be8ab19102ab --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a54b05a6ce8083736ca7db382672bb83d215649338920308cf0edd2e4f1ae07 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.16.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.16.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e09e8104c2418067fc961e4fa84dc074da5eaa81 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b8f9b5eb6ea1827048eb48661af27f66fbf5f510055f7dfc813f28f79967c83 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.16.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.16.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a056f4943ce26b8bb7e3c8d3d052feb2f324a4d8 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3031c7a07ae7554fdc02af0112aaf4f343c164f1da7e65ac0926e0b33ec1daf +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.16.attention_norm.weight b/model_repository/turbomind/1/weights/layers.16.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..710904f88b607829b98f69d31a704b5ccb2180d3 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0996c709a45131cb25cd72865a06e38920f31941b25f83f2d78ed5751645c284 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..ea56d48779234f87b2b0a859e2cb110d0718e2b9 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50fe105dfc87e7a2f06e12b9d1d92899b4b20106d29198eb7f8156c888b57620 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..5773631e90c5be54da0f5ca15e355b6bf855b4e3 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8081c981a8cc02210f42ffa6b41e8f8a018cc273f18dd184e7a76ea6a14af908 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.qweight 
b/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..5a19b7dd919248c1d8f24d12508ffb36be409a0b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b58ad7e7bd4aaf5109590b6f4b500643cea2e5ee7ecf3de2f2bafd931fecbba +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..17e81af1aaa097a81bf4407a23e87dfb0810ba73 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05659661021dfb93c23ca810756fba0afa33f7dc7103bb74e79a5b5cee0630c2 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.16.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.16.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..f45d501c72951cd1746375922f7e113162bef097 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990398b91f28bd4d0ea10d21a8f911746291d93d353659c273a0d263f3f8b26f +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.16.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.16.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..cc7a02ca2638e540d970eba9c8c2ca40c599f58e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.16.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a46e5538c6531808ab35a4aa3f8acc92997393bf5778110738282e7d0b5a6253 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b7d289a0a181f768648b3388209609a158c0d194 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a874ceb40f2cd87b1fbadffe4f336e766e4632d1486bae80a524aca3884a760 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..02676e7729a5ae2a782c7397622f5661a55ae306 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e383f96fe0c11172a8eb7c833e16437243ddf5083fe742f2f5267c606bf46f +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.17.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.17.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f5d248ed5bb53bc83690b851c4850179affe3a1e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ba47e294f57c2391d17559990d81c10b3febf1ac79cdaf9646ea4b5b1efe9ae +size 8388608 diff --git 
a/model_repository/turbomind/1/weights/layers.17.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.17.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cec2b0826f0458f462a1f155b2420afe3cade230 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19655fc3273537cb5a737021f0914fcaba9f520ae85a241b6943a1e375859c5a +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.17.attention_norm.weight b/model_repository/turbomind/1/weights/layers.17.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..493203ace8591c626f3ddd92a1d30a132fb91f7c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f60382d336b8fe223742bf477d6e1d6b03a426c1397370821017d77560828a40 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fada103f386b9576504b44aad9effb7227b81161 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6347e704f461d7d6ee0ae21b790cdd6180debf826b736f1862a27bc9ced0045 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e34de3f6584cca7245e62f91730286274c18de9f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13d6a83305e5bb3038ce5829693b70573fbcbfd18ef9251f42334a92a864f2f2 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..62706b91c086f1c95651471ed13767ce01618e08 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62bbff754141a2d1cf72617d73f2522333bb2694a88e8a5b37c1aca6b22b17a0 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7d16b3f60264de0aab7805c342d890386aa3c7ec --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2aced42506d0f633676edf55b7de564b795eb6de86d8c0f6c0f1d1301233312 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.17.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.17.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..2115ea8bcc2774631a370c71a768d54242473864 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a7866c4443b210b814e1bcca660a34c2b78f21172253d2c53300be2c3e3d44fc +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.17.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.17.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..945eb96703d8de2eef6085a642b1a27de7fb8cba --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.17.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8029ca34c285ba5e30b011338457cb6e1aa2bde375aa5bddeb10d5f735b827aa +size 16 diff --git a/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c8f8e2fdabca3f7c34468465c2a769b83df35ce8 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:802bfc3126429a1c8f50bb8bc82a62b62b5e4fac66b2e5201d5ca3dadc76b2b0 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..15b491c33507c9aa77edc43db2d844a6f497fca7 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b1e35a7c3f4353a260afd771398ed0e6f3fb0cfe2c9e57c9c6aa837187477b +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.18.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.18.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fda05fdf95a8e38dbba3ae8e857729fde60e6d1b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d5e9b4b8ac11947e865c95a0ee01bea2b98bb4d8e186bc655980c0819220337 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.18.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.18.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..56d79eb2481c7040c86fa26964ede1eeae1395e4 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb7cefb270cbf64d8347c25b5d776be71d432c570ac277fc6dcb8160f358040 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.18.attention_norm.weight b/model_repository/turbomind/1/weights/layers.18.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3c20c25a40ad141d017b4cce8700f88ca3d8efca --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dac1fd7000d40fa00eb19ec7e140c8fd08a7e2fba5ac80c0f15abf00fd9048e +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3c1d6af45afa49731996db41ef7d18503411125c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:23dee44b6cb77a166863b69487459d9de5dfd4c3989306919d4c35dc20c884be +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..54489f50388ea9154fce92dbadd4bf6a1a861f86 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10a6c1e2ca46dac304c89690e837221b7cd15133dc1e7ccfb18f69187af51208 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e90ed3787e1ac9da6ffed10588e004c09bf3b9b1 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a35d9d5c12d752b160f51f53a49e9a763662605165cb85272e539b60a9f92055 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..17951129ba756efbad134062196862ef2b290c05 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:845ca7749cf6829cc274de80528f41dbd289d125720a4f68417677871dd528c9 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.18.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.18.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3fdc07d36718c6a4fb843c7a0e547971f25bbe50 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885808cbeec44e76e545008343da6029dce51d48908c85d61f4e3e5734a316a7 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.18.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.18.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..4b8d6bdb257005f9da0843e14b064394e5e12366 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.18.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da3eda4da09ebaeb73ef447011ce0b9ef2ee982ab26d8d0408ad482f9b2b389e +size 16 diff --git a/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f58ac78fbf8480c4a875a904f3eca7296b9d1dc7 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a697cc9e5c643856df75e5d40a4ddc810ad41c0ab9362ad6c7745862c000ccf +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..ff2f26342ca1663ff6c89e5015b02b41e976f9a9 --- /dev/null +++ 
b/model_repository/turbomind/1/weights/layers.19.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5deb01a923b8c70c8adaa62c3b6128231899cb7c185908822279725696d1c819 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.19.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.19.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f444fcc2661a285f914957b05cedde19a4954ace --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.19.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682754ebee51648ef7b0249fee7289fdf825e61916f97ec62087c8e39e9c14bb +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.19.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.19.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..41cb9a3fa2554343948079acebcb10fa2a940517 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.19.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6d4a938a39924f222f02b460355a83ffb98a00ff19d05048c3bcb82c9e57edc +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.19.attention_norm.weight b/model_repository/turbomind/1/weights/layers.19.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..5acd5f2587a22bc1a1e2870e9b4af8ea1eaeb505 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.19.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63d26f2643a9aceebf2af38dbc611dc36da45a176257e478e62f85ddbc559f55 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..cc8dd8ef920737fc2e432adac1ce42303e7d7111 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a54bcfb108f050cf4a7c7cb37114ceb35476b3f8bb6cf6c541e8df014fbf6133 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c378e9b9bed297468e52701cb4eea8586e317e8f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.19.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11cb4b7bd0b53f894236952f72793d3d4e647e6d07fc37e1112b0c5ba392176c +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..131386a17e034a3ba0ce59be9c0351b35dfc20e1 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f32b6e7bb6005ba215aa938a0b52300230f7008150b45a11916829314ef3494 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.scales_zeros new file mode 100644 index 
0000000000000000000000000000000000000000..af5383b2c8c39d1c54f5dea9298ea08f5cbe267b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.19.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84f83448a65d6bf12e5484bdf2805b2648a5ee6c0f71f592f1399a71f787a365 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.19.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.19.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6f5513a9af9eec5fbc82dd527339fb220156deb0 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.19.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7e2f003c72088419d2608b060a98ab42356eeffed53510f1d468f4ccd3f1141 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.19.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.19.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..fd5be00138be7b2df59bf0b592a9bef86dc82eb8 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.19.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c71b33b311eb0e23a8b2494a543ba1181fd72314b49cf78a9749b9cf4a00df4 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2d9c45e71e2c0ab82208f4202b06c9b97f6ba148 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fa15c6683fb8dd4f6a17b49bb0a989e462a984b2b1a62741c0261b0205e4d3a +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cf230e2e4ec022b7dadc04504edd265c2736423a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46a56b3063ca3e890569f20f0f9554bd4b8b3dce4dd28c6de2a2c8b018de692 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.2.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.2.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2ec2d68e756cc1afd558415a1c748d3366f51240 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:745bd18832a4be0427eecf06fbd16e5b4d9045d9bae02a538648bf061f1bcd31 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.2.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.2.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..d1e959a3fa4ef4072ae44bb537bc108a99c3799e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f165998aa89a2e93b82203e08444995edcdc00ed2dd2b3dc3171ed8c4aef68f +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.2.attention_norm.weight b/model_repository/turbomind/1/weights/layers.2.attention_norm.weight new file mode 100644 index 
0000000000000000000000000000000000000000..775cfb53b3214e57d496df775c7f2e98df37a237 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35db76352c3fef9616c14aefa7c0b05850df54a54e3e6c922df8876639c7048e +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..1b19b3f633c84fa1134ae29f0bf9f119d9b25d42 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5d14e61c9cc1a1874bbf7c1db7fb04e8b97f8d49e011bf0b5c2003a072083cf +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e293bf94f00d2acb588e4a05e8b36c07adfd4cfe --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a79b8fb1590037f3bcbe91f25dbcb82b2b91fe0a109dca31de0493a089fcdd +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c43fcc94e533822deff81b234c66897d23c2a5aa --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbde66d92d3be35621cdb2171a2b9e5ab5448d229f07d7da65d25553adcce029 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c5beb7d2b7d8320386a5105a4a2618ceec4e4943 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41bfc952713a7fd5409f909e9ab107d9ef734e730f7b00d97fc34ef24395e62e +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.2.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.2.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..45e884fea486483f4689411e2b0f5841bb3e6317 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f16599930e314f9a8ef2b760cc6773e75961152d32432b5fc3e411955dbdc227 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.2.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.2.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..70e74bf48eaad9dd65823e3d66a8d46c4452b13d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.2.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7808c14f00dcb7b2b77edadc8852138f46802e013a3025e161a669adde20339 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.qweight new file 
mode 100644 index 0000000000000000000000000000000000000000..6053a83955560e1c2a84e72515c7672d70304835 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45521551eeea8b702589fe7c6b19749333abf647f53f56713807dc38f58041ec +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..0e188dc213c48bf55e4b2001a68e495c895187a7 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d9740714493408c67acb934d26406c11421ab7efdabd743bd990103a90f701 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.20.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.20.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..730a6aa484d4286f408baf8abf88ea73e0b5aa02 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55586decc011d181feef941588d73d75de2ec8040bce7db734699a33a7bd6f42 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.20.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.20.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..affb6ab65788c985dc6ccf43d5cb3fcc8f4e91f6 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3dff92bdb0d4bd34ecf08c0c024d9aabfeb9dc6407b55b55d25835922bddb9c +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.20.attention_norm.weight b/model_repository/turbomind/1/weights/layers.20.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..a4b06c9551477c77ebc9de6151cd219a9c13f63c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dfd453a8ca7eaa0368df85c67b0c4520d044c50e21e3e9c642016e56425fe2c +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e0aa342e545feda824e44af8745b7bf6714e3672 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a12408ddaac163c3473e187a838044bf3c05b1a72758d6b77338da700a74f845 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..595f2605064e623b1acbbbb39aad1abe47d2b5fe --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a20c9c4a6621e851abb268c647e4f9459277dc53bc5f64a0504562c9e7736b61 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.qweight 
b/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3881b21e76f4c55a6f5a94d56794ece1d12912e8 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e13a13177f50e58cd454dfef4083e8b8da065d25bd277aeabcbbd65d9c7ee2db +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f0c038b596c5143988722e1d044fdba36b9f4c53 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2bb55062eaf5f412bae85c9ac428ddc2e0e59d0e53ebd21abb1228cf4d1ea3c +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.20.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.20.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3cfe4cc50ce587ea9b564a20130b4fe2225d7d52 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37c809eef52d6f683a42650531b04e14b95934556c2f3607466882fff2c7a049 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.20.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.20.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..3fe9d60389494bd97b6721514bbf76a4a2f4aeea --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.20.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97801b00a17ab91f1019edf80b667e915c772df1461e322cb8602d8bd831a8b1 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..905d5eb82f1967282905cf3974e526f1e48e2b90 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2792bae2516c6d5167b1efdd66141ddc18439be883865eee923aa0d64f3501f7 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..9a1f6b2beb40845a92a60a5b1ea44afefad5446c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:953b7c49b7ba4bab3b5ab552b697d5be9184144ec4f8f6ea9815a0e12420a4c6 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.21.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.21.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fbd8d63b76ae1f3a0394dfd4c09e724627ce656a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f719914491c7941474c1b6efa5a79541ade54eff71a6d65a28dcff17baeacd89 +size 8388608 diff --git 
a/model_repository/turbomind/1/weights/layers.21.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.21.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3199f31825d84cf98169a9ac8361fd01195c513a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21e70d0275306b0d766b533780955602dc9d5163028c509745120b4e9dd070d1 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.21.attention_norm.weight b/model_repository/turbomind/1/weights/layers.21.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..ace9b471c09970005b6d8dcb34406ac8671f3340 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f5b37279d734e53f01e524b941104c4a2a0794819cb443255e46130190eb060 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..93ad736f2b44139c784864069aece4a59db96543 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7298a7ea1a9a2f16bfcca14510dce8da6342ceaccf48354e63945a00c86a8887 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a7e502a74af20d234730806f84f0ee0fbec81a3d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90e896e7361f2fde100ee9cbf4591ba2509c11ad2e06ff9150614c28f39f6cc7 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e129776d2c3518130aa1688eefa5ce1d57e1f1cb --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0415c4da6fb2feb289a75e84a73c525272f0098ee5c14faf5544454178576f62 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..60435a424658f628b48358ed84954acb2782b727 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ff5c969303a6b351d8bb80064aad2c92e8c5c32d85bff840317ca0739ced463 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.21.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.21.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6655336998857a70516ff902b71f61175fd1a6c3 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8042770bf17c4b7520332fdeeef3decf2eb77871e6d80a2fcfe79e850827faae +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.21.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.21.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..68bb063c7fe76ee11dc858fe2552eff20f89fc06 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.21.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:babef4e3b7889042e89f865f3c8bb53f6191e2c9329e3eb418e0627256b4bbf7 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..26e5e328af67eb6995b4eccd4f3f47e2a5572bbb --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3845fa57cee6ae1adc7c640c17820f11d196a86138e3ab1b26d1fcdb5a12d480 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..25e896649de6e4eebef3fb52b4695e66834ea627 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a8fb6d26d3741fbf2dbd24d9e96a689ce0d8311349bc7b7d487a94ffae7309 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.22.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.22.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..30d513ba9872686a172b2e5bb54d7dc19c89b18b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e8c0a44652ccfbbb876d6c56c552653b788b14188b48f41b957d17036111f93 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.22.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.22.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..63489132ff37547f3c5a7082e39f7d6e60d99e2f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cf24c066812a6a36df8eec192b40520df7d10573d5a2bfd2327ddaecf6e938a +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.22.attention_norm.weight b/model_repository/turbomind/1/weights/layers.22.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..67e9beee3472ac10efd53bef75c3678f86f0287a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87603494aa61475dfc747464841436f303bcf654dc27b1a07564f53558ebc0e8 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a6f81f752873c957d60d333f567fcf45dc101888 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:37604a1d32f8001155e15ab4e13282b050da543ad0d0a25b759081246fdbdb15 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7fc132bdca2ee4128bec7e863686fdca2f7aebf4 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06d1aced0b15076b9f26d4ea4f4f6b732368d7b373e7a588635da39cb9db5f39 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2679586d03d73f48a045c13e8c8b19ad6eaa9b50 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b2a9ac0ae91a96deefa360ba92e79339705410d925b2356b9815692ea31061 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7216f3454da54e1117fd4e92befe84b4c8b46a1a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a99b63ab8c94e4d8f81bc8cab1561f47e3c2bac9f6e13f0b23d9438e02d7d1e +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.22.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.22.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..4d71b5ceacf9dcc9afaaf1adf8978c2911ea951f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309c8793e4e6d01a426ded64878ab5bb81fc897a4369e2e12e180067d9e2f97f +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.22.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.22.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..265569647dc54011c0c7aa312cda60679eddf224 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.22.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a792b8d14741661477851bbe77b6f5dc4fecf7ce07009fb7d6bd25090b2ad2b +size 16 diff --git a/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3c4b6c3a2d7fa4c456839afe2c5df63b4801cf29 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a664f7c9133d9a3d3f013ae68b7c826124f0ce8ee3e2a8b7a3d412fc4ce18c +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..6980321a22d78892613c341246abfd4fa6a6ec1b --- /dev/null +++ 
b/model_repository/turbomind/1/weights/layers.23.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d1caf7d6d040d5052d79ec08aa4282d486d3fd63e54ce73293b62776d97cc01 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.23.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.23.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a959f9c51c2010dee1865544214aa31aca8e384b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.23.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019ccc843a3257c4a7b36900f96de821382e2847851af142ae89a9238b434b20 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.23.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.23.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..63ad5cf1b74567dc10825bf3797cef1aeaf45b20 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.23.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80a82f597426b697fe58ed646f41dd9a6f4514d8d93e7f2791fac932dac100ca +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.23.attention_norm.weight b/model_repository/turbomind/1/weights/layers.23.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..95ac563b56807e330af49708f5e09a5b5d763971 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.23.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d621b52a30d8a04c1866972255522c844eebd9f0b57ee2b90fd4f8e5e7ba07a +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..070dac5924104453edc840b81f83c3af7c79534c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95a18e90a00cd47b6fce45cb8c1eeedb6ec2b8fed6f0cd8de85f36cfd5dedee +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..53c5e980f8815c039d907e5466820c61f9d1076c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.23.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6d90f0468717c0bf1b22ab4914319697011c4ee53f13241c0ca1970acc3331 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3dbd1908961ec50661072cfe35a0e65123ee0522 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1036d81bd9d055c59bed34241ec3328c1035676dbcd78a0186946147c58af98b +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.scales_zeros new file mode 100644 index 
0000000000000000000000000000000000000000..377898876f13249c94c85b69c632e4edbf89ca0d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.23.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f354eef95b3a2007598e99428488351bc81e825cc08c8a22beea2a74432f0e91 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.23.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.23.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6034309e63a873c266790385d8a50379dff8c851 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.23.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36a712b30e1f4b920e2bf0e553bf62898650a968b94cb544d4c0cb45dd9724ba +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.23.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.23.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..2054dd9b5bac4cc5f3947a6a29b0a00ee9c8f9c6 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.23.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:362bc48a1da392c1d9c1404743b87e700f048e91e2236c0f23136126cbd17a42 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..90ca332aa05b52f6a6c1174451a057235aeec1f3 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5cb069457b3e48f9401929077bc5a44b988b7741941ed8157cf23fc0af8fa2 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c424c3a6af59cdb2e6cd3d2acdd6fa6b8585e46b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47c34802342bd2a02dc98d311924169d7abdc703e43279cffdcf1422243038d +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.24.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.24.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..183cbc95eb079e344c88e1fa4774f568a66dbbd9 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6468f6b524dabe33d4487522c605b92a5c91eaaa9d6b39433dd31588bfd09215 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.24.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.24.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c435ad2044cc72cc87bf58ea590aea7b6e463349 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59fa63a2023ffc20a936686267ae08fe6c793889ca330e0fb0a44ab2b5fe8041 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.24.attention_norm.weight b/model_repository/turbomind/1/weights/layers.24.attention_norm.weight new file mode 
100644 index 0000000000000000000000000000000000000000..dccff49fb462091aab55a0c4eb163652123ff7d5 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d38dd18c9fe84631f30cb2b7cb92efc25473d4ba1c438a7817690ed3bbaabd8 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f0bea0526b3fe332953eeee191fd4d279f3a8286 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db478db4b91a673763d0252f233423fa31c7a562f80cbc6c106931886d56e253 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d233c239c539161b7c5f0b5f890f196d9c544c2 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5329cd85fc6390d7fc596abdb5907e3c2576c2fb6fc87d7c0dc2dbae326a826 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d4c99dfed4f5fd009c04c0693ddd1253dadfb80e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78e4b556d2c58615b1f3bcbfe8780a1217bc0420383b55afbf6767315ca09e66 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d61abbf087e7f17d99482529ceb6649e5f98e4b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9861b1f0dcf30259bc7a9d1c02969f271b805981c696d49b1dcdd939a7ff504b +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.24.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.24.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..a5247850bcab46ee044a136c8ca64f1223e6f1a7 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64ff3faab2a3c58cde1f351d57bef281660b552a9dbb9c0aa49bff00dcd6719 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.24.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.24.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..3a9a25a5c3ba55692571909bb40b460b6ed82ade --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.24.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2ab419befc2e7b0391b3b7e7bfa13bf728db0d6cba53136aedc0802a4fcc8c +size 16 diff --git a/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.qweight 
b/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..41c3344f95ab3594af8a3648d644979c8b8a3e84 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0971d51d3ac5fa3cb80bf7adb2616878c3921d6810a7b8c312f2c5edfc20ba2b +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..380f67b6fde572f2eecd73076b154bb56c631ceb --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd9d2322fc1ac860eeeb0ae4f57b15011ca5728cab0c2de14ad0734c813b1070 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.25.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.25.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..58a080a5403fbc6975a8c92d3d8890d106c41f32 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42757d1b84d12da08d617496b557df5dc43260ad03444559342e57effdeff897 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.25.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.25.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a623dfbef7759c22ba42888f23b6af5e7c88703c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc49597aa705026d30a172bcee0421ded59135ee57d2d1a38d511274fd00db51 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.25.attention_norm.weight b/model_repository/turbomind/1/weights/layers.25.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..e330398be316b3c7d2b4e8091847c876352631d0 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f978aa26bb24bbd527a1e949719d548e1c7bf7d30f04b02f0f28d1343053132 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..941b657818aee3d6c553e08ef74566cd98e55321 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:063a4b6c0bb854f67986762bafa9651778da009fd725fe723fa47306a99a845f +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4df2b6e64935f05f8ec6ea3db6b9723c6ca0a7bd --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4a77dbd2274b6de3cfb89254d1cb2c0af54b304bb9134a280cbe9b620a361a9 +size 3670016 diff --git 
a/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a2a36f211eb8cebc2e1ce26bbd4bcd9a806cee31 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1626e0d17ba4f05b0f1e65537f46ada22bef2d00deb136c30dd6bb481b617d58 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..09e7a5b567087d78bfcd3614b11b21106f5f8f59 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d9b0e50a31c6c29d57500a64edf731ea04db50967219bfdcb0853730c574333 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.25.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.25.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..026c4beed926345148e983d57a1eb89a25c4fd1c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0eea4a26418b7a503c71abf443da9d784c2adca6551e4f1b998f94d6145d696 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.25.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.25.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..67871afaf8d1df47fbde1f4a65674ded07d4a864 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.25.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cad249894548c60911d6d65a7d5846938c1e479698b4466d4cc6e03d2444922 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..8e3258b77728a5579d15c2a374b61be41a2afa09 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3b88ded4b32bf8ff5ab7fa3616ab98f1bfea6fd86f37b729ad69ffe89d33e97 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cb16882090f73a8651b55899be0c7b66b7d89aef --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1303373a67371e1e2f3ed25bc8cd8e559b9503bc5b4fdc37bfaf758cd26acfb3 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.26.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.26.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f65b33bea38f966cd6cd26980998df21898fad28 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da07e11c5ce840df7eaa7de1ddff66356a2995b93b6d1cdefe1d96f6d4eb62a6 
+size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.26.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.26.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e34f9fbc1e33e117eb223353e64a0d03c3a1ce09 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec446a339a8b88e9d35b0feb0dc82c82f64420cc45aa67b0730bc6fdfeb33b24 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.26.attention_norm.weight b/model_repository/turbomind/1/weights/layers.26.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..bd89d7d2bb2a10e4537def6bc6550ddf681db645 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:452e37de79706d39a7fddbbd901e8353363bb41bb1178eebb42b0a9aad1998fc +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..ef1f200bdb37b79404804e211dddd09441a90cfb --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fac2317afed02f28c9f68eae5e04821f1fea2d7553bd4ce30b68b9a7e896be65 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3613b7754b7de11bd7146b2f99bbb2aabad43346 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e892079f260d62e05e5169a508c1b50c3beffc1e568e189b358850a9596863ac +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..42508b0d05c03cfe54875df80e5848f92e3a2148 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b2ab3bee38aee899c1454a69dc424ae61b6d14d67438c307369be02f6460085 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..6078af07ebbfebda87b1016fd58cdcffbb0b4c73 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:552933cb4c5ad88c47fcfc8c8982e8a9d6c2bcf4975d0a1ff17f85a0de9a72a0 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.26.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.26.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..659727ca29164c591b4db04c441375c79e981fce --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a474d6dce328dea51c94d84fde68d4472d68dbbf19ce347181b5956b98d41847 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.26.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.26.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..db316b10f011519fdc39c70e40706bb6499001f4 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.26.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d995b27407d7307c6a5b4a4fa7f6247eac5d8c1cc62c066c9bd4395d0455a939 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2b398a0b63fe43f5bd6467e9001673b60b3d8b76 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb11cc9d2229d99f45200d53d2430007eca65a120d988a8ace070a0e3754128 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..36269d2bb210deac5bfb20fc68c3a3c0ba2430d9 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b885790c722268908e56129344337198b0c0e4b3bf5e21a7f091d0846a5d30 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.27.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.27.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..75c54cf768728053f1051c6d1260296c943bc2cd --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46493db19a5dc9a8d01151f769f22f10733969cad257ff2372fe9ef169efdc7 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.27.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.27.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..91523912e1e6240ee472d551a8422724c7f9396f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f80605e605d11e0f5a9e470c80c72859f9651f99f3db043b9eab3989fffd647 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.27.attention_norm.weight b/model_repository/turbomind/1/weights/layers.27.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..92e464dfb802dd2cde189e137b6e908acaec5c38 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b48e7db8fe774bd46f4eecc92ef7f6bde3cb8e3ba66836e6cae00572ea0e14e +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e7392da13e07a3f00396eb1965e2c22daece98a8 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:2a634ce6c3f2743a5e0fa245a0adf32df70a41dc7c969d40b1a3197f0436cdf5 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4fadfc7e45425848c37d17c3f39ffbbb822a8c78 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc138f3c7e31e1be2b6e2a57d7d5a2ffab4fa52343122dd272e41ac4bfd9096e +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..aae88c0abda360c16b47ef75abda1c4077edf25e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9052da467e48c0c4138fd3769e456cb753464bb30a03a4942846a5b3877131f +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3b2fa2b516a8c83d6eed1702e517e005ac19f281 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1f67441bf5d4f5ca51f1f289e07a3c59907d324265741f76ad966bf1755749 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.27.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.27.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..c15c40329868b970cca611aff6e2bbe13d48abf0 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fda3309eb353c9341280ab8f2a516011494cba8b769560e91cd0c9d27fc6561 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.27.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.27.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..d8710f2aebc08c7c65db4a66ef9daeba362df5ce --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.27.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2367dba495b15a673a5e8f907f19e98254caa8845195d88897b3ecc36d7c794 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..11c1eafa7f15149287cd144977ef8e5a42645397 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1f9e7857882c7a56236572f8a03d72222b257c8d9ed6e2efa1d66c6b5e21fb1 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f725cdf5914a0af48485baa5a948fb90c3030913 --- /dev/null +++ 
b/model_repository/turbomind/1/weights/layers.28.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da00a72b006477cacf5f86157b6206faefb0b9a1945fed4e5f2a2f9fc9846f55 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.28.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.28.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..294eeaef86a93508f7f8b171fb8a303bcfb5602c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.28.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626eff3b0dc5215c6954f774fc8116aa989824ab9c971a3782d8bce5ad31d0a8 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.28.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.28.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..90a1002de820fee0fabb5d5081cde6d434fa08dc --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.28.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5faf82a3313ab0b53237e677fa72b3b44137a47ab5f26d401a3bf43f5beb1bd8 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.28.attention_norm.weight b/model_repository/turbomind/1/weights/layers.28.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..1ec94894ca9c51e452e351065e83a91a22a1d264 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.28.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac4a8732ba2c28970db1dc7e821bd6c8b0e4de12f8de1b6bc6692840154562a4 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2ad5905fe8ebd68dafedb5c0bbe70d34f3f8c71d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f375cdf0cd1a60d7c9d00319853242606c44be5322598f91dbff37284f0ab67 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f8676ba3b145e257dc1c75c1f9d9dd86413bc37d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.28.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f57f5b0745ad5281aa67d83c0da6f1ebc7539dff487ae1345761bf995aedb1c +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e3532b664b06cd727ceb44f27462084bddb160c3 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:393b972c36770d253df01db59d0c889a018a26ec7a18cf1e69617828344e2ed4 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.scales_zeros new file mode 100644 index 
0000000000000000000000000000000000000000..9cba65bef1506cf3787aac95439d21334e5424fa --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.28.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f4650f45c05fbd9d52eade717d47d32b1127ad57db10133ba490f5af3843551 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.28.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.28.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..0a50537a8d1863c6ea2bf1177d91c15f67d42dec --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.28.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ab58696d625c79d618dd907bbeefb29dcb441a358411ed99c0f88e8649e74b +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.28.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.28.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..335aa2710f889028753142ad7c1c770b5aaece8c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.28.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be67c63310802e47b331969149928657a52d9caadc4dcd0599f0ed63fa8fe4c3 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f7fb2a0c283d5309b0acac81e3f78bf535e119e0 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:964846927bb91f85e501fe1626e8958dba12656845d1c2963d6f0d31ba0e6fe9 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e4616ace3831b1353261ce821a222788574a6a7e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59389b1002ea4286ef68d6a28a48de0070a8fe63bb33881a4ea5b4d4824b586a +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.29.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.29.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c339b504ad1ca7893a586fe0fbab27e0414733d4 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a9f306da7ef17418be8aa9f47f97e653aeab2c155aaf1f32ea93c6e3e424c19 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.29.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.29.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..880d7d9c3c95158609d1215b2f6bba14a3a6c655 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1370f068209c9ab1f42b6657508b06a3511d1d2d8d2c5b5988f4d58591d40279 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.29.attention_norm.weight b/model_repository/turbomind/1/weights/layers.29.attention_norm.weight new file mode 
100644 index 0000000000000000000000000000000000000000..dc3408e864d2f349f03d2ea9f976241c0dd4ae19 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0136d8df649cc27c395128240a43f899929866414704347f851202cc638b9ec0 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..12bd5dfc4141909486de6f81eb5de2cd0541f243 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90f34915975f77f41c0057ec1ddc7e83098a74c6efe44d5cfcbd6252f7483773 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..92ba76313e8ccbbbbf563a230bc24e60c122fbbb --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56814e27f2fc6ea900d3623c77d1df558ea69fe154c99fe57fd45b6567a62186 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..850b76dcf051ec7876aa7626f2aee3c02df70a73 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e520a4a76d63d5f4cfad6bb9577ab1343c24d563ee6491b0120e8b8f605a24 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d8434eea29d62735d93ec7d3ed91e73a56773a5 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a45ecef0ec7bb53ccdd1499338dfc1590c5b4d4e64ca01119d8e2eac40c5249 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.29.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.29.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..cf3ccd85ec2a836282f95d8ffa96f001a6c78bfb --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80000d50b78aad7b0076bc159838fbc0e679d1b07aa00f374142e40c5fcbba01 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.29.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.29.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..135fea41df0db406183c0c705ee1bf4e15b3d938 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.29.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2334dc6b4e2acee8b2c60625419023d8b5cb9692341970a8cb0cb0950658940d +size 16 diff --git a/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.qweight 
b/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..741f2dbe9906898116ac1c0bcf6b6f1305ac0c7d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b023e843f1b897e2768f8aa9d1f18e1a2fcb8a17ee904981117c3822cafda263 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..58882890a176f4e5d124ddfbdce381fc920d5b9d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02c5a27de7ab84dc800a722021cefc12233818ba708f7ef20abed96d1efa3b29 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.3.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.3.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..28835af03e975d2a253d1b43e9094dcef5665859 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:961c0e6293f13ca0eb880f274fcf96b1394f554b645856d99f898ae03ba05ab1 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.3.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.3.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4941d02a83a0dab878ad6795511df8e08e216ce0 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6a94458f402b8342d3936d5c436bcc1125e642d5216c1cf70ad7850d134dbdf +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.3.attention_norm.weight b/model_repository/turbomind/1/weights/layers.3.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..fee571b50c58b11c6d17e7daaf1a1796af101e8a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e702523cc2696abf9ea5f86ca0c3b8110cbc92f9074f3573cd0935519da7f326 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6576fcc897f882a63b4376d2366b8a16b75529b2 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec59414d327ec0ca8adf200f8593102b1cbef09d5a97e88f7e6f3d1d941e32d7 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..39bfc8b9158d17ace10985a0aefa5ed9b27c830f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:592014759039919238673a2d601e2d397b3eb60f2b684d06201310dc35e6f870 +size 3670016 diff --git 
a/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a2dc182c2e093651d77ac65087453506558cc6df --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c67555a8eae4e6cc55420ec37ea21933418f802190fc809bb33855011f8ec82a +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b12f9eae6cb382f2ef562f1e7dad7d8f2c7f4f48 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8d6409835e70b1c0fdf81979b61995fb90f43381277f9e457070df5a91229c +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.3.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.3.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..1ac16014018db6a631b37da0836ea438c9d2fdaa --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b485c2892ea53a76f21e84c2ed42436b05a41f5dab146fab77f25d2b506ae53 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.3.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.3.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..86f8adc521ad298ee51185ebf02afa53325facc9 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.3.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76bf77db19b1d0234ee2da545c98ee3d5921030e6deaa8b2742d4e9d400d7207 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..087b322573894903eb8e5cf81dc0e4962ccbb4bb --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b085323586c5f61228e43ec3cf935799c983d169abd417a55a6c3f82cd255a1 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..de17498ac115e410694314f9e590322ecc3140ef --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:602a6e94ab5a7bda70167414ea1e71c46be0e7b46a69689d093f991dc6930079 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.30.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.30.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e9eddf6db391e55430e3ca4f04fc6966cdb3bc10 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5864869bc2f57778cafb236ed45dbcacce36836e1c8b3dd94fd1375829174baa +size 
8388608 diff --git a/model_repository/turbomind/1/weights/layers.30.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.30.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f810acf8fcee1cdadd5b34adde32f9c37b177343 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c899fc162f4dbec0809e3059f9ed0ba9d3004a75d31841ade9aaf16df93493e +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.30.attention_norm.weight b/model_repository/turbomind/1/weights/layers.30.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..ad23a4893d3cffe2d398058b89dc78f528c91053 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683f799d6ecb59ef5b47ee78d4d1653b6a49da4dc6c6865734f2832457ad888e +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b61119e589e6b7759f74e927ba8c5a5286eb965f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb97c170f0415eeb563dfaab343a6b7c736fb302b605cf65ac29e190d485f03a +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3f892216a36905289e63b4b93c0eaf050e7acc02 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:debf89602b57cf687b1f434d484beefd647c3ea0e8305484658248c8238a347f +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d0743b7b13a262d47d3c95ff5f00bcf70dca3937 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00143d530f528cfdded636568772b1ac564990d10d52c943463e8198b0f45b22 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..649ffe4f3c74051e77a62d2bd111b1c8956635a4 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6881934dda1754f8b7bdb5619bed9e9ec7cd819080a5080d36c545274e7563bd +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.30.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.30.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..19611f78c82d05c2fa778fc4099462db96768018 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c07830c7b5e53981d0d97e28af650885ba42b1395e88e2a8b553c080258be805 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.30.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.30.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..ebf0f2ce5ad46a9897b292cf74ea4074253d9e00 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.30.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a7079eaefe501289467f67ff3ec35deb358c17022eff2a2d77c011d87a7485 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..33f1f7e919ab93f0f093697cc6564c8041cf7c9a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42e8c9373e34e9f38c5aa5b7f9e7282f283dd138fa488699361a998289d4f0b8 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..51b423248b2e8762a232cb9f6524cc2d2882e6a1 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e74870d817de1f15c0b372de19d9049754192d574290aa47cc2da4114e02fbe3 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.31.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.31.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..7976fa7add831d946d9634761ff8db4d07f69a6b --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:882c11872607c376a08d0e7ab4025ebae8050ca0a958b4678fa7c5f5fe34af8c +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.31.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.31.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..87b74517a018f5d65e974fc575140a80f0cf2f63 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:780d8a3fc0d41d7e42ab7524e0e8eb3a5044627584cb749954a08d74e8889cc2 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.31.attention_norm.weight b/model_repository/turbomind/1/weights/layers.31.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..9e1759f5a7b8ce3bcbdf54ac4a167aa2a3836eeb --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13b79fca3496315c35d45be930b96ac34c0616ae9bb69018d41d4fe7d77fa1c3 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fa724a72baf441d9817165d242ae54e77b819e7d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:6d6490623b97868d9d81417ecbbc40bbcf24f872882ca23b74a76f6f384082cd +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4e046750532412be4588ab28e7285c8f68bccf2f --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b03dd848d3c92adda40904bb369f812d1a2de1d72e53600bdf89cf3002aa5e4 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..7954c17e1c4aac980fc31bc92786998b66007879 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f491d3ff06bae3646c8cabbf8c8b6e14963e909e5a3f2cadd84931bb1acc076 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..1f95fe4038958211cbda9224b4161cae99e0c2e5 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7895c436da989422f207c0631685485aada8b0cf45d0db3bbf0cb18b8573d8f4 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.31.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.31.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..491eadebff5c76dbdda444c927fd0bb153d54dbd --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b010068e8df791fcfd32ddefe46198f72adc5cb104f59512820541ed232ed52 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.31.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.31.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..9ed6ce58e195ff81f658649f8fbf99311dad0183 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.31.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcd30ad8a1a6ae548b3b6cdbe2b3693c1d260fcf73e63e4cb201f4ff3a9216e8 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9efa7ae8526ee807be03ca3903436c1c4e096b2a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd04897e691fff067678bfb5826f8c0dae0914c4a822266312a9fd08f9c8dfb9 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b717a0bccf881f43c4dd4849aa9abac991f829b7 --- /dev/null +++ 
b/model_repository/turbomind/1/weights/layers.4.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a4e0a9b4313f6f28361952f5e1c00250e0bc8d8e348238f634679cc9983d4b0 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.4.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.4.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bbc885705f67c282413e4e10b430177fa24c64d1 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.4.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83ef42f037338f04aa63a71554b631e20e2cc1f4c44d0498061891de5d46dfec +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.4.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.4.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..0dea56a4d1087a93efcf6c1d4c45d4eddcffd41d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.4.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92669ba1e130035258630c4bb58a6ae23088baa4c818edb89d18126368fdd2b1 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.4.attention_norm.weight b/model_repository/turbomind/1/weights/layers.4.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..85901d7d4381bcdd1d25c69d8652668e9e82e4d7 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.4.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4392ba124c790351e1e804e3f6954b04df59cabe55918fb2ab208b9fcb1a25d4 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2eecef389220ebcbbb1b399d81d28d5c7123895d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efab7d32785919b64059b2e20f610eae03ee8a2ba95bcd5c2d786e3074f66875 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..203aad693c83911b91ea533a372c2414914f0c33 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.4.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:624fd673a1cb8d5eed0814f7d0ebcfa6de1f0933f2c808a43fe9915863d06992 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..22624a1646b9f3bc812053a3e4eccd3aa066e8cc --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a9bc1f9a857eb51f12e913af082a9d065232ad278a46bf3312fee70b57c929 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.scales_zeros new file mode 100644 index 
0000000000000000000000000000000000000000..ba1d032b1632c72d516bf607d69ef9d858ec3f69 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.4.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f5a160ff8d293e97b6037541c207caf6ea4b15e625bd94dba7be81f1aa3052f +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.4.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.4.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..10fdc6cff9055cfb29be992fd58fec67e3a1e156 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.4.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7584bdc2460f81e60ad3db90f314b1c3c0bb458b724ad5a8ef2f6b87991871f +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.4.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.4.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..8ab0548585972c0f9a19539e4f0246ed192f0042 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.4.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:734c894776290dd532cb25f542e38b56c9151c45fb751e1d58f5aba3c1cf86ce +size 16 diff --git a/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..04ab0a16f4f6b5b500d30b4b27152a073d6efffb --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f7240f7f94715ffc2e22da1e1986a7738b3a81d2803a89fa8d467ab37d52f3 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..35b017f6b8442ef2ed28b4f1d7f2aab7e6c8f3d4 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f30a98755d5e88115a8343930c20bbfd34ef8095694f4c0709b299e0ee587b25 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.5.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.5.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..4b270cc9d0768c5834bf5dee3db2ae53b9d1a2db --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c2c8b87162bc3f8d4c6044cbbba5bff1a0b4d484418966d683cd8edd5ffe289 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.5.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.5.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..2170f6316f894a43c57df7c6f3b6435d6d290e59 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a0bc293e079e00c8fb29ea166613fb81fc7a51dfae01bda404298bd3541858 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.5.attention_norm.weight b/model_repository/turbomind/1/weights/layers.5.attention_norm.weight new file mode 100644 index 
0000000000000000000000000000000000000000..e56c76ec2f895f4ab09e315bcb026a0cd110898e --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e322bf9e96c707a007b6cf18e95291034a7b4acc28cc9c868ba72a2067f42a4a +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c0603e429404aebb532d112009658a498d6a25d2 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b111a37c3e4700a7ac8bcc755e22baf0cdd205a4f64cce28587b12e6bf542fa5 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..54720e241e1c6574c937ac39760a84933da14ee8 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccbdd88d473982cb63c5daa191f2956e0826feff876c6303ad46054ce474a9f3 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f69f281b519e24e86576e49e914a3f29b9833837 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d055b75469902bb480fb2470766fc359100caf6f512e030d846c895cb23501e +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..39d27ba627be29fdb76869d39b5a02b38030a6a9 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf2b8068885689ca049003d3dff4bc8e68b47ddb9be7d7fdd56b39582b7fd61e +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.5.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.5.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..8f90bb2bd06c0ff2405bb8ca61c65441dc384653 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c886bfe39172273f70831164b7b87f48054c0da65cd1724be839673c817009b9 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.5.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.5.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..0032439aec9359a437391315477b7201d232b7ba --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.5.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b479855806803e6c485764401a2ed76b362ac09f2606a6d58fbba9b134ee186 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.qweight new file mode 
100644 index 0000000000000000000000000000000000000000..08c09cae235117db0cf2be801f075c4236bd6ba2 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebf9ddd2465c02a1a37bafe82e009127d6cbbcf0bec3b323eece36934bb6eeff +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..78b67e25716cf86de09b47dc537db6ec420fd21a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b39acb9cc4de067c3ef5b0128c253ad0b646756445766d91f2421ca30ab6e272 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.6.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.6.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2d2cd5ddae6f67b08f6610fd6bfd8fe17ff43ad7 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81ad5a0787961305a05ec9b7c0fb89cc2aa70589a36efea39557a8ff33be93c9 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.6.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.6.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..380b6dedbd40afe6240e0271cfd0000ef9f17b01 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edadc4493b3568ab5ebe758a1aedc2ef5fefcd688f5a78eb1866379967ca1cd6 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.6.attention_norm.weight b/model_repository/turbomind/1/weights/layers.6.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..68cf1e82a5f3d60ef2c37bde39437efe411c0263 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dcd4367593812ecec39d8b1ff7cd21912c1283686db24be488384fd2453162c +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f66c0c431c68905f3cc431d2b266b628bcc1f9b1 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3cc20446684f9b809fd52c40bda9d32c115789c650575c0e54f5ab030b7ceed +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..d158d234d215899f80ded95207cff364e20e0c1d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f01f13b1cd0cd8080d7c4906d71e44200b8053aa605a37069f1a9e1034a81f93 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.qweight 
b/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..0bee7d213091341bc193cd21b808a3776987b7dd --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95865a00e74b9d37ba9c21241922979b4f26eb06b78b84b25be12bcfba617657 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..80f3f7257450ba5de9d4dabaa61b516c7c807046 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0dcaefa2acb86a25aedc25d60558af179bbf8968f1fd023b20343dad73b0184 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.6.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.6.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..b56799656e38d049d14d02b2d7e4ab1e470bac6d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e458ef7058c9d7734737447072dc2908dea9ebf64a2ebcef932e4d6832057f5b +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.6.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.6.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..70c460d32701c69c43ce43977e55d4c5e407b1c8 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.6.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa3e886e06b35057d676139206ed116fafd8c8dd29244eff07cf1221837e8807 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..4bd1b6da8292c5b10b20dbee8e2ee7e95a46637d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0c4ca025a4e163c0dc2da98d463549125001a9cc93654f37907cce2a9882d52 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8846088f9a04128c3626ebdde6d6747d1d663587 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c086c5de28164657905ed6eaed423d6244ae0368c6180aa26fc0a6eb89724a83 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.7.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.7.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c4891059c086711d0200456b57dc31f93418ba81 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efcb8926a09d3f78acbff4e19e2e5bafad04172d17321a6af2b4fe7974c40fe1 +size 8388608 diff --git 
a/model_repository/turbomind/1/weights/layers.7.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.7.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a08abb8652ecda43c661807290bbefa793fb0160 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c0cdf8402670c6998b317082c140f0eb51c4bb0b41ca4e6386c6f1648f56a76 +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.7.attention_norm.weight b/model_repository/turbomind/1/weights/layers.7.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..37c18cd18f7054a248d6352d4d5a25ac9a4175e5 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28cf5e25d536f7d9180c2eb1d7dcfd7d4bb749816849f75c5e09f0210cdbc417 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9b50669a9dc81bf91e567a299ee57d333907a007 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0581fd7f812265f9b47b8eab7621664a046c4c6f98279676df767aaf339eee7 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..93d6f40d2e5bcd8b2a2da3d12418121279963070 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f86e5d5f8bd7d8eded5bf5a5cbefc9b1b3242cdb2b486f6b1b0289d75f4df828 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9d07164c18362f5b0879cc88dbb43ef395f284f2 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b02b881d979d0fb77a4d705ed4bc68ca58e7cfa84a504d90b9e816ddd99a6b0 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b95f34d475e6c10781aca4639fbcadc9e706fc5a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0c7e60168198f2ac9347ac8eb4fc59ea42fe0380e24550cd4fa2e989a2d90b4 +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.7.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.7.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..7669f396fbea22312892ecc7e69f5847e3e3d0f7 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bce0233aef9e8401ea7eaddce5b44f2a28b6fd1018023ec3f2cae495f4d205b6 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.7.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.7.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..d2b299db6620c0abf87b67b228dd03b696854499 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.7.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae08ed15fa296e998f7e93b866fb5536103b357ca8fd0e8ee44423c4fe3ea4d3 +size 16 diff --git a/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9a071d9e1c24a362c04a0f4335000d1eeeadbfea --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:312a5231076c36e023c30c18761d4793c7aaf2d1658f740a4ed6fe3ab9fb9532 +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b756258fc2694a8580c1d6d55d73c1aae4f88737 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:045eb164e9d18487951013b4a69dab786f034139e232a0c079e6c6de0b84d445 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.8.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.8.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..79dcacb0bc5ed37629a105bb0afdc20c383e1736 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:917ac6b4102a88cb5fe47a13834f30fb45329e8234e6bf4a6d5def09acfca138 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.8.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.8.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3f21f5d05d73002cb0251350fce183ec3b6f82cc --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:075ca25071e36779993618787bcad51f47a6210b5c7efb13836b9f0c39113c7b +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.8.attention_norm.weight b/model_repository/turbomind/1/weights/layers.8.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6441edc914d86ab07b46c530e63df5e212499fbf --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7645c5cc08248a97031708e37a8869793e72e86be7d529ee2d38214aa125f326 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6b623d7f4ebef4670369d48905c1f66aa9b3fd94 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a0a76bb17ba96c365a1bf660f901c21c3fc1d15165b0532e97c7ad86158513f0 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f7b56f5fefdb81227823903289604a2f9e33cbf6 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f6cc9bf35da7c08e89248a2d1151ca84f97e0d44fda2f474fbe090fa2b71bc6 +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c837700cdf510ee1df94f861174695bb0e1ccfc8 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d6a461146ce6fca245beab647f837c7718f50c1ae6d48f852becd4b88ecd68 +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..63ba13362b7c68d37224b01f241452a27cf8717a --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22f763f7c06275a5821c55ab0428986c7982da93d02ec561c4c1cf0bc83cb82a +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.8.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.8.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..c4ec482ee099d1dd8d7b2633b38f9546642f8c04 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f607d08fdcc7d4a7048194e994afa25c34242bddec4d56534a779484534dec +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.8.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.8.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..dae30d205782945d230c044159736e88b8c261e0 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.8.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55e7e6e9663622f872cb332c414eac32a102e97ffdf3f5a2b6afa6f8371e1a5f +size 16 diff --git a/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.qweight b/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..502cfce88cfb73bd839f1fb667fba672259c4294 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad1c9bfda707333f5860de8512ec7db789721d5f17e96ec0c1f79f98533c42c +size 12582912 diff --git a/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.scales_zeros b/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..47605d66d4acddffb2885150c9d68d184f94a9c6 --- /dev/null +++ 
b/model_repository/turbomind/1/weights/layers.9.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b5179dc3fba3abadb58abf409bfef33b382dc7373a002c3c43da9785c86f614 +size 786432 diff --git a/model_repository/turbomind/1/weights/layers.9.attention.wo.0.qweight b/model_repository/turbomind/1/weights/layers.9.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..0c3613bd080dd0fe0abbe07c8a567bf85e48e33d --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.9.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:535eb0ed2a008590448c38ddcfcf990219dd0c1752e28d11fe3310cdf4039d57 +size 8388608 diff --git a/model_repository/turbomind/1/weights/layers.9.attention.wo.0.scales_zeros b/model_repository/turbomind/1/weights/layers.9.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..bc68d0462949d41fb22495d6fc4d8a2c6c21b6a6 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.9.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee2d02d009e36ca78d86a48ea408c2017c21903b64400397a77f437f495d936c +size 524288 diff --git a/model_repository/turbomind/1/weights/layers.9.attention_norm.weight b/model_repository/turbomind/1/weights/layers.9.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..8493ee9741dd897107d9fe3cea7c2d01fdd4dee5 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.9.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcacb811b4cf62144e1ac2d3eadbafab30083e3420c46a92df1ab21840b29fe5 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.qweight b/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bcb62122ef3b2bf1d13099eb7e64cd4f6266f02c --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aca67258bcd3c39f17fb15a14b72cfe8ca597aeb30e0f4f298efa5eb093abcf3 +size 58720256 diff --git a/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.scales_zeros b/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3e0e6af0add56eeb2e1cf7bc0142e52be7a5ae29 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.9.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4b60ceaccc0af57c36de7cd69acf05d8c307f2d6d27a7e765e0f132ae95d17a +size 3670016 diff --git a/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.qweight b/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..399c1fc8d6cc43a27e802ca067c88fc4f9a3bc73 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e07e422f44ddda11dc7404b257cacd675b2b7f44491941e6754155df3a31d2e +size 29360128 diff --git a/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.scales_zeros b/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.scales_zeros new file mode 100644 index 
0000000000000000000000000000000000000000..9509fd872d04e11bf53f07f99129e785b2056187 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.9.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc346804097116087236c77f2e2c018922efba4f2e32d8a71ddf8b026c9d34d +size 1835008 diff --git a/model_repository/turbomind/1/weights/layers.9.ffn_norm.weight b/model_repository/turbomind/1/weights/layers.9.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..185031880012c613c2cf8937d4aa159e1c93a4c0 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.9.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98119ccde8c54eacba56311e43a7c74e62e30e0d7302b011202dea6a6348ba66 +size 8192 diff --git a/model_repository/turbomind/1/weights/layers.9.past_kv_scale.0.weight b/model_repository/turbomind/1/weights/layers.9.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..0ec9f90c9c5be11398b7b1bdba1df5b0975ab0d4 --- /dev/null +++ b/model_repository/turbomind/1/weights/layers.9.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62cf0a7960b56038dd17b81e2a1c38a016c2b78bd7272299dee18ae8e53e5c92 +size 16 diff --git a/model_repository/turbomind/1/weights/norm.weight b/model_repository/turbomind/1/weights/norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..906361178f72cf7bd1f01447accc35bf0e1b633a --- /dev/null +++ b/model_repository/turbomind/1/weights/norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efcd3fb0c1c5225c17e0eeb5b46068bb7311f716a4908d5a39d79b37985b58e7 +size 8192 diff --git a/model_repository/turbomind/1/weights/output.weight b/model_repository/turbomind/1/weights/output.weight new file mode 100644 index 0000000000000000000000000000000000000000..04e8f86f0b46051b3db62d5eefcbebda87641472 --- /dev/null +++ b/model_repository/turbomind/1/weights/output.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0ed41b4df8f91647fc8bdd2aa61f55c39e09b6e063c8bd509b591797293919 +size 758120448 diff --git a/model_repository/turbomind/1/weights/tok_embeddings.weight b/model_repository/turbomind/1/weights/tok_embeddings.weight new file mode 100644 index 0000000000000000000000000000000000000000..0b3edbd16fbb690f7c781043ea905fd4380e5f04 --- /dev/null +++ b/model_repository/turbomind/1/weights/tok_embeddings.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8986115ad7e59813a41c88c0d601235fa36138d6c15e5657a050cf4ec40fb037 +size 758120448 diff --git a/model_repository/turbomind/config.pbtxt b/model_repository/turbomind/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..f139d5b2234c0dfa94e3792dda985f9e8034a5a8 --- /dev/null +++ b/model_repository/turbomind/config.pbtxt @@ -0,0 +1,293 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "turbomind" +backend: "turbomind" +default_model_filename: "weights" +max_batch_size: 1 + +model_transaction_policy { + decoupled: True +} + +instance_group [ + { + # max concurrent instances + count: 48 + kind: KIND_CPU + } +] + +input [ + { + name: "input_ids" + data_type: TYPE_UINT32 + dims: [ -1 ] + # allow_ragged_batch: true + }, + { + name: "input_lengths" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + }, + { + name: "request_output_len" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "input_embeddings" + data_type: TYPE_INT8 + dims: [ -1 ] + optional: true + }, + { + name: "input_embedding_ranges" + data_type: TYPE_UINT32 + dims: [ -1, 2 ] + optional: true + }, + { + name: "step" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "session_len" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_k" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "len_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "is_return_log_probs" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_width" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "start_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "end_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "bad_words_list" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + optional: true + }, + { + name: "stop_words_list" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + optional: true + }, + { + name: "prompt_learning_task_name_ids" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { 
shape: [ ] } + optional: true + }, + { + name: "top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "top_p_reset_ids" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "START" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "END" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "STOP" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "CORRID" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + } +] +output [ + { + name: "output_ids" + data_type: TYPE_UINT32 + dims: [ -1, -1 ] + }, + { + name: "sequence_length" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "cum_log_probs" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "output_log_probs" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + } +] + +parameters { + key: "pipeline_para_size" + value: { + string_value: "1" + } +} +parameters { + key: "data_type" + value: { + string_value: "fp16" + } +} +parameters { + key: "model_type" + value: { + string_value: "Llama" + } +} + +parameters { + key: "enable_custom_all_reduce" + value: { + string_value: "0" + } +} +parameters { + key: "tensor_para_size" + value: { + string_value: "1" + } +} +parameters { + key: "model_name" + value: { + string_value: "internlm2-chat-7b" + } +} diff --git a/triton_models/interactive/1/placeholder b/triton_models/interactive/1/placeholder new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/triton_models/interactive/1/weights/config.ini b/triton_models/interactive/1/weights/config.ini new file mode 100644 index 0000000000000000000000000000000000000000..88f3d40970a1e663689736be546f8d3d64bb8734 --- /dev/null +++ b/triton_models/interactive/1/weights/config.ini @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c8358cd3fffcb86829f6b600bdd0ba77b6147eed572f88700ec4d914db070d6 +size 645 diff --git a/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..4f5435a75963ce7ce17b0536f500c8ebf8ca4220 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1763929a6e7bbdafdb81d39ebfa08263351ccea12347aa68b292b1b7c458e45 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..52107ec494683ad0e0403e4189bcceed1ceabdcb --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed40e83191f5304fd2df93ff5b90ae9a165bbe489af8020e06948fbbb289d7d +size 786432 diff --git a/triton_models/interactive/1/weights/layers.0.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.0.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6e21231bbe43b92e43a0d2600ed6969f6c00e767 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d6710235be94402052aaaae809e488f433d75d6d33acf546e2d0bf7aae4d8f0f +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.0.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.0.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4961bf6cfbf6ae7592675c56d719924794d8da68 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c069c91ef3a796ac2e9e0230319fabb6bc8433c68284c6e5ca71baa477a3438 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.0.attention_norm.weight b/triton_models/interactive/1/weights/layers.0.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..51dd734ab95204a4ce7fd026707a375f1a85219f --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dde3cfe82d02d87660f40c667186249cd17a5ee5924ab2a3ea0385919a2d0f3b +size 8192 diff --git a/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f3167a75e6defd59aa396437f58c797bb5cf1b2c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26bc912102aa2b487baf312f3bfd8f97dc46ba6761c2328bfd3e45581bfbcfd4 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..68343cbdcbc17ec725af43c1a1d53b62bc5c32c0 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309c93937a8778e4e4dce879efd1e0673f4bb7701644628abbaa8420e5b24cf0 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3e82c77a6ba7b16d19d55f544f872223d33fba6d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d096d08769d4b05f7483b4ed024224e0d4d35772231e757157e69c9c0dc1c6ef +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..fee7031bc4703588c99d993aaf4e1c0f1d080e5b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb73c0a0f614f1033850266d6ff4311374557a2653e0fa7857f8507ca87058e +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.0.ffn_norm.weight b/triton_models/interactive/1/weights/layers.0.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..e8f321d4e16161bcdf7f2b6979e9f90b8aa04ef3 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.ffn_norm.weight @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:f5b414270e0d50fbec62cdab6ecd217c2f688872d5ed7d9f91bb75dfff46651b +size 8192 diff --git a/triton_models/interactive/1/weights/layers.0.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.0.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..e376c6acc6ad65b07267f834beda69a889c5f0b1 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.0.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f7250671024d0129c45c3f3d8f57887921d219c280350697d41e9170925c77 +size 16 diff --git a/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bb3ebc7beaa1d925c4a14fbad6d2df2ec6bad94f --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a125e82d7ee989858902abca2bec9dc3f4ad74008f5307a1e7a635d148c53f3a +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..bc0ed1f6f8ef00629e07ce4989e2ddde96723c08 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f96d91127194d8a8404809f81602727e59903c86473ee27012bb303f83cdf77 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.1.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.1.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2eaa43207863db980e17ed160bc4613b175baf27 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4905342d79812e6bd9d6d993443ee6b30df2f80cef44176d1398dc884c458bad +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.1.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.1.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c136a82b25947dc950216cf643734a4a5ee81a36 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c7971bdedd76bbe5630fd97b2badbdd26d22055ffe6fe0374fff051af9feb80 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.1.attention_norm.weight b/triton_models/interactive/1/weights/layers.1.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..abe49b3b4fe282cbcf269cc92e4a1b03f8304d1b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d589a6b27b707580d37c4b198dc952071bb1a34967ebd9175f9055ac012bc781 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..7d2bbd8d926a99dd1ba3adf0859660ace736b884 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:1dd761cf75a1f95c5a55a245fbe1a8bca8967be0d7a03dd12108d0be835d7682 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..9fb67e07dca86f3c043855b520b84ed83c9b4930 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4fdfeee03517f7896aadab5adec50c8449a2e1bda2f0cf5b8725b26057d1f6 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..83348571bf69b92747b68f25d3755c7b2146e4c5 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0c42be27fe2e9f48473b5cc4ec63cd06575ade857ea8699b4bd05eb4f801dc6 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7f8d31081aee57241eed23ae114dd5e39f9e6bbf --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe915a8697f98fe80270d235325b469219fac1c8a4529052fd15f6b1ee8f13e6 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.1.ffn_norm.weight b/triton_models/interactive/1/weights/layers.1.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6db47869baaf62ea10c904bb39ca2fd8dcb35aa5 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90fa27f32ad04b368d7110fb689b24ea02904efb2f2b7a9f9be876c331fc7212 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.1.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.1.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..87ba80c2080cfc64bd645133d99c4fb0f602b920 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.1.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08456e5241a0fbd14699cb889680261c9e0ca7d30051066d899e99be24e15d52 +size 16 diff --git a/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..35f6c98510eb157f0971d9d241b2ec765cd3c834 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4d8d7ae69eea66730a10e906758105f2c99b16d082b9ea84d7e7cd8afcdbd4c +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..77eb52490f504dbd5b089674f267142c27e7acc0 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.10.attention.w_qkv.0.scales_zeros @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2885240377b91bd85bbe4ee6f67b8ca23233584c35ce71b752f9f3bbb66e266c +size 786432 diff --git a/triton_models/interactive/1/weights/layers.10.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.10.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..440d3e309d85cdfb81736fd024a2834f4d0ce308 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.10.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae5115820467dcb2720eeb7abbdaf3ecd5edb56d9d7453fb0bf4f6b65323445a +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.10.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.10.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..18b5ecc65f6f8133a1821de0925d37622a67af48 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.10.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4438217ed5de15cb91f4e30f0644b08952e981d25015dd4b75c4a0cae83517c2 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.10.attention_norm.weight b/triton_models/interactive/1/weights/layers.10.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..4f0f39a02bb84010dd644e2fc96ef3b46d4c2820 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.10.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd2c0d884542c0a881ef8fcfc9fbcc1feb67afbff0a8befc9bb741e2d8ea2af +size 8192 diff --git a/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bf50b623e7b1f4520d761286edd1db51a109c4c6 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1258ea1e97e4c41db26a363eddedd3bd47c6d49f7bf738703c5746c54f4e37 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..ee36f684587a649d68d9579441ca3e90af8d7d6e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.10.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e7492a7d4447980961b5891a0997f2568bdbe10ed15ba0998f8ca1bdaf0a4c +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b0cce8413321f6074dc61c7a28bc92377f4c7ab2 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fb81b3c6a3f7b674506b003621b7e92925754e97d23ecb1209003f2232e33cb +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..ce1603f2d10d9ae9ef7251cb66a02c3e0cba6b67 --- /dev/null +++ 
b/triton_models/interactive/1/weights/layers.10.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773b9c8eb4a3818b2667162b3169bd4fe813f2fcba5c708a49b79fa5c5053c61 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.10.ffn_norm.weight b/triton_models/interactive/1/weights/layers.10.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..bbe9a16316f0db34745e41ef00224f94b9237fee --- /dev/null +++ b/triton_models/interactive/1/weights/layers.10.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b576f4d059d0f37a4fd3e626e640dad540ff4758aa449bafe55a78046a01dc9b +size 8192 diff --git a/triton_models/interactive/1/weights/layers.10.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.10.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..da0421db9e924c29c37c13c09376487aaa383c8d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.10.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430d675f2f2e4512591d558ea6f29e42dd38c55ffcd8d21873a12e9ff90e15b2 +size 16 diff --git a/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d5058e0b21a7342d2379f3a9315e85ef9bbe7682 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2871ddd112a88bb89a549de3bf1c53af525e962e118eb7ad0feac6a56599a26e +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..92844164ec6f5b42e8222c577ce94bae5314a9c9 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.11.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7017bdedc110df3a9f9fab19466968a5488b9ab3ad533f0908f2d368371adb +size 786432 diff --git a/triton_models/interactive/1/weights/layers.11.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.11.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c67e6d4b3e11faa456791b77155fef70589e246f --- /dev/null +++ b/triton_models/interactive/1/weights/layers.11.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:530e3110fadceb664c29ff9da577cf401128e93ae21601affd1c62137b04db35 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.11.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.11.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4e0d310e48ae8ebd9b629872134eb3687a55e341 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.11.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1725da8fac86700a95c4ee9d40cf9ebf0d1ebabb4b145c2d57c4a31c42299cb8 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.11.attention_norm.weight b/triton_models/interactive/1/weights/layers.11.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..f57dfc1e256d2fca8f1c8d59982ea28fb2f209c8 --- /dev/null +++ 
b/triton_models/interactive/1/weights/layers.11.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cb24612b49347f84741d6daab9a90b828aab924fc9b21fd2d2ca6b67abf8ea8 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..27905dc8bb55b6305cefdf0135d72eda3e7e17d9 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0af7f58d1e58e6610b5b56291bf697d79471c1eeaefdff9466fdc87996c3c86 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..474796975c206470856a63e5627806fdd1a9d0e4 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.11.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46e2d6846839f995e9434c35519a1152c52285d29672febe66e9f07b0e7523e5 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b8e4a4f967601a2151a7eb5da1c126599eea4743 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ae182cb83af72cac11a76113fc5492ae4ccda1cd45df36facac10e65369d22c +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..aac9a3ac0afb93d279461dacd82e1fd80dfb6161 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.11.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54af6ef8d3b0aaa32183d5fb176a4d2097bd043e44ebea37ba43ac4021e18253 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.11.ffn_norm.weight b/triton_models/interactive/1/weights/layers.11.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6f958acb3e97bbc263ba99adb14ceb897dc7e573 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.11.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ae646b4e03481a9e0eccf0a151deeae360012b79d455f413d6b4c8c05ead016 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.11.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.11.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..3bf7aed58e43958ad08d6b6e8beffe072f7e15e6 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.11.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:114046d9b18a39823a18019529563163f191e5a74c65e959db74c96b77c9b4b9 +size 16 diff --git a/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b026bcfd8643c18461670a5a2980cf9a8539bb2b --- 
/dev/null +++ b/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d30b7fa1db362abf3186072da75c305cd7e79f90f4b1eea6095014d9f7989da7 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..011903f321dd322447298b693e1eedb17f35c3ac --- /dev/null +++ b/triton_models/interactive/1/weights/layers.12.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:654fe994288ed138b388cb0e14a9c4e7124b601ac4efa404788e3267ed137307 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.12.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.12.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fd89f748d1ea906c6617d240a4e123d243105b64 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.12.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069d9e054d6cd0171b229e37a70b6a2fca364783cc8e80de9f81060931964e0b +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.12.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.12.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b46cd92e96aa0e40ba260aea37674bdb9fbf1fd6 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.12.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:394968e46096fa0f50701fe0d09193561276359f023ea5dbc3a16bb3f1aff8b8 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.12.attention_norm.weight b/triton_models/interactive/1/weights/layers.12.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..0020f8c429974d047571347728c95d5259c0da58 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.12.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:020a5a9ed0a5065303d1079d24ce7252b639f6f76bf49c7b8fb5fac3bc93fc1b +size 8192 diff --git a/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f5cd9ca940d4417db1082cb6b445b56fc3ed304e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9293f916e4009deb3dd715ac0fea08afe5be75548d2fe2e70a67fd5826664cea +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..be6c9b7b29a56d2d3afaec63b36099fc29d1ba80 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.12.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89899a4751211dda4328e2380ceec5d62d0d0b13fd164ccb7c9f5e189409a08f +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.qweight new file mode 100644 index 
0000000000000000000000000000000000000000..53e4822e263ce179450dcfacefe7dd882447324d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0f0481d3c7eeecc2717614f38dcd54163c287431e82da95a1e8d5fd182cc27 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..2f8d90a6c38370788887ee529f4ad8c7b4fd6593 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.12.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690b11e4c0f825ec39db6b53fc1ccdd51d051c752199195f2cff8079ef3b980d +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.12.ffn_norm.weight b/triton_models/interactive/1/weights/layers.12.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..35e00aeee302ec1726ef04c71f2a2f429fe0d23e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.12.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6abd982c6b4b398f13a6113cfaefff0fe65190ff1b232c8b9a68acb30fbfdb +size 8192 diff --git a/triton_models/interactive/1/weights/layers.12.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.12.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..8fb69a827363200f7cd82be1b4f35bab6e143bb7 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.12.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3cee21f879722a16a454f6455c8d8c3aec77cbfdba6cbebac9c4762d1d03bb2 +size 16 diff --git a/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..63d098e6067e1aac3d4f6083c34f967abcfb40f4 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:983fa35043fba20d8f39610fc859862486472388df708d85176e198b9493f194 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f78fb596aaf17a70c0fc17098a02d2fbd9f8b12e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcfbdb8a6f2d86500e49d21e3d0cf88dda2e18b505be8459e46962f1a5403902 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.13.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.13.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d0443fc30519b3ca74b5e3d4e0317af1dbe8b32d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e76d5b55510b3111a4c8068f8bf2abe8372c9868a5346fd03831633817f49a3 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.13.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.13.attention.wo.0.scales_zeros new file mode 
100644 index 0000000000000000000000000000000000000000..6cbcd17aed1ae804e9e87a936274b99c9ad81296 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da85282928c5b1723c48e93cdadc416b400deb61bb90f28c4675989ab7d2f4f8 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.13.attention_norm.weight b/triton_models/interactive/1/weights/layers.13.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..58edee2f8e729e06965c92f434900ae4f75e1a49 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:592d7039e973372cadcf8b3f717c19ecbcb911e2f40140d617855643bf2bfa3f +size 8192 diff --git a/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..0f2f191246be551220b2b9df11e88d070f4b63c7 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1cbe619508e858a2637045e1e07f9cb0ec4c6020d6041e40bc9558aaa9fd290 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8114a135ab96b7c28393bb44bad7050a71bd712c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c555740ee91741c87411db09bc23b419caa191a4ac0ccf7e34b00fe64e614493 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..efc53988aa0826924baa6153c20d1fb1abae3183 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5434cecf17636b9bbdf1df6ae4b6d1eb6c06a611c93fe0291ad0d3892d850a81 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c927886fb77c90e7e2afb11bb38945c179e779cd --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89194f222aef9d0488e0677d654d9f4cc783cebad2ba76e9013ef99684a1c2c +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.13.ffn_norm.weight b/triton_models/interactive/1/weights/layers.13.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..0044a510f007c3e66e363ee02bbc25f4c26cb6a6 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75cc6d0e292ec019791db0f7ef63b0508d8a5d19404fadb09c1b06a8dcae7cdb +size 8192 diff --git a/triton_models/interactive/1/weights/layers.13.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.13.past_kv_scale.0.weight 
new file mode 100644 index 0000000000000000000000000000000000000000..313f047a7db61ca9b3fed45b948aad24958ec896 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.13.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86a948027461837c94daa03c444ddaa2a484bdadcab47a89f78d0d332ba0370 +size 16 diff --git a/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d34a88071016d52838a914b177b787d6b7f5e989 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd65317b8701a195eabe835058a9366309ad055eebd4354fe994187573dcfcb4 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..dbf55a9dd11b2bb29fb5f7a2ec180b89f6372195 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a8b7af909bb0ee02940f92c80cde0a7a869e60bd4778c7eb5934ed7134b1e56 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.14.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.14.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f2e7385fd3b0a6c38260980964dfd035abe25f95 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f17aa0c464ae8e87100f9946574744e554c50847775d5e3cc888584c920b51bf +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.14.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.14.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cca81645ed7af2fd8f2039c751f0856ab6332929 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac63fb5629b386babfc0cf09324e8388735c894def38688f57e5fa413a76a6b6 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.14.attention_norm.weight b/triton_models/interactive/1/weights/layers.14.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..a2e5c82b9d622524d9390c76957ed9e8994aa2b8 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d54e43cc40808a7a12fb34802e7e3fa239938943e4f247ea54556f65191e0e +size 8192 diff --git a/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..efb7ccb2234e6b179d310051c53ba547a39f7b6b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f974af156ac932cd0619e0e86095071dccc8cd0608319df5c1042492b2002e9d +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.scales_zeros 
b/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d916976c94c174148b04db334b907ec77c7d638 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5be3c8f04a42c5e0c9de9d00508fbb981849cf188dba80cf6127d8f4b4b712d +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c926dcac71d930076be55189beacbb36cfb1a777 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c33e3534172410d4656b1a244becc400d680dc19664a6fe5d2531f0733b24b1 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..78c574771e660fcfc3a237c9d56afe57b62f1ea0 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3be2e077ef369c828ac8f31826249f327d120baaaf9d0141f67b9a814f95a57b +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.14.ffn_norm.weight b/triton_models/interactive/1/weights/layers.14.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3094bf1d424cd5ba8300cb6dddb32e4bc9d78073 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb3dd1a12abaf094e03a1d933aa4ab506d5c4c0cd21cf0802c04f4a0d5a85c7 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.14.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.14.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..a1ff0007bbe4e1f0abfdccce67158196a9b3ba13 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.14.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39dfb751ce93881ea2c4e2f68155583024cfcf9e85b5705781348b079cc29b0d +size 16 diff --git a/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..8d981e2ef18ba6fa67894151d2e5d33aec76e769 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f2d6afe6100ef0eb47d5b379ce3faa38ec1063ba36d47d9526647ea7fa4bda2 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..92d62c8db383b4e459224b1370a1d87eaa416096 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8abb8c1bad2acba915885821b231c1884cd63fd978d62d23a25775671c97f9b +size 786432 diff --git 
a/triton_models/interactive/1/weights/layers.15.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.15.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..43781b59b7834c4758226fadd3757cd458eb9001 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fca2dec7e83b35a6b582edfc05ddf49890b234aeba53a3d88384a436cc96c4c1 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.15.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.15.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..51a58827bb1c84c5a11deab1134c99e4cd37f472 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bb55b56df6d0d2c1f6f04d894e5d6e63d476b8fffe1dd0441a892eed850502 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.15.attention_norm.weight b/triton_models/interactive/1/weights/layers.15.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..7e895dc7fffaa82cf585391595f009adf667e4cd --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06c4e4b6e08466593216c5fffe5bb16fbe296be7d83b8d67084a728b4f0d26d0 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..8dfc85e4b6b9e369447163acf76550539913fb5a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b271e071ebc5f1e37284433f76d394ee2ba20920d64e64355f6c37672bd68f3 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c0f10138fba546a8c454600fd6a73289e0a7f8fd --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42f1cdd3b5b76e04cd4154950ade000eff8bfc44853c827ff351d00526201bc +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e0d0b67b1d9d4d9530690ac220e426dedaddb1fc --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c44d9731ffc2bbd8a368f60064a8e8e85f50b04677d059c25fce70aae38dc81 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a99be30bc9c12257d3764ef09722a06f15ef0437 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:287e909a7bd9bcc0b456c57c361a614c1898383785bccf9f57eee7f91599e3b3 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.15.ffn_norm.weight b/triton_models/interactive/1/weights/layers.15.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..630c4372de835971e521542c84649a00c3b2e403 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8dafc8ea6132b5caec667dde3f6dda741e7ff23e40b8ff5f5ccc59232ca434b +size 8192 diff --git a/triton_models/interactive/1/weights/layers.15.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.15.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..a47b7192fa2a190ceb02a526a527aed679e93740 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.15.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c945e5779fcddbf5dff47a4c3502bce9ba0bace5158abc583e852d1418f9513a +size 16 diff --git a/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b17d911138bd69b5faa2b303479e7cca9c12b659 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf8c2d841b0c3dfd0a4349bb4aa84c0d85141c14277e879c033484e225096715 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..bd4333af13bff4ad87c753e24461be8ab19102ab --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a54b05a6ce8083736ca7db382672bb83d215649338920308cf0edd2e4f1ae07 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.16.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.16.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e09e8104c2418067fc961e4fa84dc074da5eaa81 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b8f9b5eb6ea1827048eb48661af27f66fbf5f510055f7dfc813f28f79967c83 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.16.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.16.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a056f4943ce26b8bb7e3c8d3d052feb2f324a4d8 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3031c7a07ae7554fdc02af0112aaf4f343c164f1da7e65ac0926e0b33ec1daf +size 524288 diff --git a/triton_models/interactive/1/weights/layers.16.attention_norm.weight b/triton_models/interactive/1/weights/layers.16.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..710904f88b607829b98f69d31a704b5ccb2180d3 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0996c709a45131cb25cd72865a06e38920f31941b25f83f2d78ed5751645c284 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..ea56d48779234f87b2b0a859e2cb110d0718e2b9 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50fe105dfc87e7a2f06e12b9d1d92899b4b20106d29198eb7f8156c888b57620 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..5773631e90c5be54da0f5ca15e355b6bf855b4e3 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8081c981a8cc02210f42ffa6b41e8f8a018cc273f18dd184e7a76ea6a14af908 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..5a19b7dd919248c1d8f24d12508ffb36be409a0b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b58ad7e7bd4aaf5109590b6f4b500643cea2e5ee7ecf3de2f2bafd931fecbba +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..17e81af1aaa097a81bf4407a23e87dfb0810ba73 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05659661021dfb93c23ca810756fba0afa33f7dc7103bb74e79a5b5cee0630c2 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.16.ffn_norm.weight b/triton_models/interactive/1/weights/layers.16.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..f45d501c72951cd1746375922f7e113162bef097 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990398b91f28bd4d0ea10d21a8f911746291d93d353659c273a0d263f3f8b26f +size 8192 diff --git a/triton_models/interactive/1/weights/layers.16.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.16.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..cc7a02ca2638e540d970eba9c8c2ca40c599f58e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.16.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a46e5538c6531808ab35a4aa3f8acc92997393bf5778110738282e7d0b5a6253 +size 16 diff --git a/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b7d289a0a181f768648b3388209609a158c0d194 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3a874ceb40f2cd87b1fbadffe4f336e766e4632d1486bae80a524aca3884a760 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..02676e7729a5ae2a782c7397622f5661a55ae306 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.17.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e383f96fe0c11172a8eb7c833e16437243ddf5083fe742f2f5267c606bf46f +size 786432 diff --git a/triton_models/interactive/1/weights/layers.17.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.17.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f5d248ed5bb53bc83690b851c4850179affe3a1e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.17.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ba47e294f57c2391d17559990d81c10b3febf1ac79cdaf9646ea4b5b1efe9ae +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.17.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.17.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cec2b0826f0458f462a1f155b2420afe3cade230 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.17.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19655fc3273537cb5a737021f0914fcaba9f520ae85a241b6943a1e375859c5a +size 524288 diff --git a/triton_models/interactive/1/weights/layers.17.attention_norm.weight b/triton_models/interactive/1/weights/layers.17.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..493203ace8591c626f3ddd92a1d30a132fb91f7c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.17.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f60382d336b8fe223742bf477d6e1d6b03a426c1397370821017d77560828a40 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fada103f386b9576504b44aad9effb7227b81161 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6347e704f461d7d6ee0ae21b790cdd6180debf826b736f1862a27bc9ced0045 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e34de3f6584cca7245e62f91730286274c18de9f --- /dev/null +++ b/triton_models/interactive/1/weights/layers.17.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13d6a83305e5bb3038ce5829693b70573fbcbfd18ef9251f42334a92a864f2f2 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..62706b91c086f1c95651471ed13767ce01618e08 --- /dev/null +++ 
b/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62bbff754141a2d1cf72617d73f2522333bb2694a88e8a5b37c1aca6b22b17a0 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7d16b3f60264de0aab7805c342d890386aa3c7ec --- /dev/null +++ b/triton_models/interactive/1/weights/layers.17.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2aced42506d0f633676edf55b7de564b795eb6de86d8c0f6c0f1d1301233312 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.17.ffn_norm.weight b/triton_models/interactive/1/weights/layers.17.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..2115ea8bcc2774631a370c71a768d54242473864 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.17.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7866c4443b210b814e1bcca660a34c2b78f21172253d2c53300be2c3e3d44fc +size 8192 diff --git a/triton_models/interactive/1/weights/layers.17.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.17.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..945eb96703d8de2eef6085a642b1a27de7fb8cba --- /dev/null +++ b/triton_models/interactive/1/weights/layers.17.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8029ca34c285ba5e30b011338457cb6e1aa2bde375aa5bddeb10d5f735b827aa +size 16 diff --git a/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c8f8e2fdabca3f7c34468465c2a769b83df35ce8 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:802bfc3126429a1c8f50bb8bc82a62b62b5e4fac66b2e5201d5ca3dadc76b2b0 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..15b491c33507c9aa77edc43db2d844a6f497fca7 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.18.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b1e35a7c3f4353a260afd771398ed0e6f3fb0cfe2c9e57c9c6aa837187477b +size 786432 diff --git a/triton_models/interactive/1/weights/layers.18.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.18.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fda05fdf95a8e38dbba3ae8e857729fde60e6d1b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.18.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d5e9b4b8ac11947e865c95a0ee01bea2b98bb4d8e186bc655980c0819220337 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.18.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.18.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..56d79eb2481c7040c86fa26964ede1eeae1395e4 --- 
/dev/null +++ b/triton_models/interactive/1/weights/layers.18.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb7cefb270cbf64d8347c25b5d776be71d432c570ac277fc6dcb8160f358040 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.18.attention_norm.weight b/triton_models/interactive/1/weights/layers.18.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3c20c25a40ad141d017b4cce8700f88ca3d8efca --- /dev/null +++ b/triton_models/interactive/1/weights/layers.18.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dac1fd7000d40fa00eb19ec7e140c8fd08a7e2fba5ac80c0f15abf00fd9048e +size 8192 diff --git a/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3c1d6af45afa49731996db41ef7d18503411125c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23dee44b6cb77a166863b69487459d9de5dfd4c3989306919d4c35dc20c884be +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..54489f50388ea9154fce92dbadd4bf6a1a861f86 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.18.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10a6c1e2ca46dac304c89690e837221b7cd15133dc1e7ccfb18f69187af51208 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e90ed3787e1ac9da6ffed10588e004c09bf3b9b1 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a35d9d5c12d752b160f51f53a49e9a763662605165cb85272e539b60a9f92055 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..17951129ba756efbad134062196862ef2b290c05 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.18.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:845ca7749cf6829cc274de80528f41dbd289d125720a4f68417677871dd528c9 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.18.ffn_norm.weight b/triton_models/interactive/1/weights/layers.18.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3fdc07d36718c6a4fb843c7a0e547971f25bbe50 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.18.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885808cbeec44e76e545008343da6029dce51d48908c85d61f4e3e5734a316a7 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.18.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.18.past_kv_scale.0.weight new file mode 100644 index 
0000000000000000000000000000000000000000..4b8d6bdb257005f9da0843e14b064394e5e12366 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.18.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da3eda4da09ebaeb73ef447011ce0b9ef2ee982ab26d8d0408ad482f9b2b389e +size 16 diff --git a/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f58ac78fbf8480c4a875a904f3eca7296b9d1dc7 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a697cc9e5c643856df75e5d40a4ddc810ad41c0ab9362ad6c7745862c000ccf +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..ff2f26342ca1663ff6c89e5015b02b41e976f9a9 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5deb01a923b8c70c8adaa62c3b6128231899cb7c185908822279725696d1c819 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.19.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.19.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f444fcc2661a285f914957b05cedde19a4954ace --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682754ebee51648ef7b0249fee7289fdf825e61916f97ec62087c8e39e9c14bb +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.19.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.19.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..41cb9a3fa2554343948079acebcb10fa2a940517 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6d4a938a39924f222f02b460355a83ffb98a00ff19d05048c3bcb82c9e57edc +size 524288 diff --git a/triton_models/interactive/1/weights/layers.19.attention_norm.weight b/triton_models/interactive/1/weights/layers.19.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..5acd5f2587a22bc1a1e2870e9b4af8ea1eaeb505 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63d26f2643a9aceebf2af38dbc611dc36da45a176257e478e62f85ddbc559f55 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..cc8dd8ef920737fc2e432adac1ce42303e7d7111 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a54bcfb108f050cf4a7c7cb37114ceb35476b3f8bb6cf6c541e8df014fbf6133 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.scales_zeros 
b/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c378e9b9bed297468e52701cb4eea8586e317e8f --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11cb4b7bd0b53f894236952f72793d3d4e647e6d07fc37e1112b0c5ba392176c +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..131386a17e034a3ba0ce59be9c0351b35dfc20e1 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f32b6e7bb6005ba215aa938a0b52300230f7008150b45a11916829314ef3494 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..af5383b2c8c39d1c54f5dea9298ea08f5cbe267b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84f83448a65d6bf12e5484bdf2805b2648a5ee6c0f71f592f1399a71f787a365 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.19.ffn_norm.weight b/triton_models/interactive/1/weights/layers.19.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6f5513a9af9eec5fbc82dd527339fb220156deb0 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7e2f003c72088419d2608b060a98ab42356eeffed53510f1d468f4ccd3f1141 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.19.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.19.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..fd5be00138be7b2df59bf0b592a9bef86dc82eb8 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.19.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c71b33b311eb0e23a8b2494a543ba1181fd72314b49cf78a9749b9cf4a00df4 +size 16 diff --git a/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2d9c45e71e2c0ab82208f4202b06c9b97f6ba148 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fa15c6683fb8dd4f6a17b49bb0a989e462a984b2b1a62741c0261b0205e4d3a +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cf230e2e4ec022b7dadc04504edd265c2736423a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46a56b3063ca3e890569f20f0f9554bd4b8b3dce4dd28c6de2a2c8b018de692 +size 786432 diff --git 
a/triton_models/interactive/1/weights/layers.2.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.2.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2ec2d68e756cc1afd558415a1c748d3366f51240 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:745bd18832a4be0427eecf06fbd16e5b4d9045d9bae02a538648bf061f1bcd31 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.2.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.2.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..d1e959a3fa4ef4072ae44bb537bc108a99c3799e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f165998aa89a2e93b82203e08444995edcdc00ed2dd2b3dc3171ed8c4aef68f +size 524288 diff --git a/triton_models/interactive/1/weights/layers.2.attention_norm.weight b/triton_models/interactive/1/weights/layers.2.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..775cfb53b3214e57d496df775c7f2e98df37a237 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35db76352c3fef9616c14aefa7c0b05850df54a54e3e6c922df8876639c7048e +size 8192 diff --git a/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..1b19b3f633c84fa1134ae29f0bf9f119d9b25d42 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5d14e61c9cc1a1874bbf7c1db7fb04e8b97f8d49e011bf0b5c2003a072083cf +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e293bf94f00d2acb588e4a05e8b36c07adfd4cfe --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a79b8fb1590037f3bcbe91f25dbcb82b2b91fe0a109dca31de0493a089fcdd +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c43fcc94e533822deff81b234c66897d23c2a5aa --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbde66d92d3be35621cdb2171a2b9e5ab5448d229f07d7da65d25553adcce029 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c5beb7d2b7d8320386a5105a4a2618ceec4e4943 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:41bfc952713a7fd5409f909e9ab107d9ef734e730f7b00d97fc34ef24395e62e +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.2.ffn_norm.weight b/triton_models/interactive/1/weights/layers.2.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..45e884fea486483f4689411e2b0f5841bb3e6317 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f16599930e314f9a8ef2b760cc6773e75961152d32432b5fc3e411955dbdc227 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.2.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.2.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..70e74bf48eaad9dd65823e3d66a8d46c4452b13d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.2.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7808c14f00dcb7b2b77edadc8852138f46802e013a3025e161a669adde20339 +size 16 diff --git a/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6053a83955560e1c2a84e72515c7672d70304835 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45521551eeea8b702589fe7c6b19749333abf647f53f56713807dc38f58041ec +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..0e188dc213c48bf55e4b2001a68e495c895187a7 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d9740714493408c67acb934d26406c11421ab7efdabd743bd990103a90f701 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.20.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.20.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..730a6aa484d4286f408baf8abf88ea73e0b5aa02 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55586decc011d181feef941588d73d75de2ec8040bce7db734699a33a7bd6f42 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.20.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.20.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..affb6ab65788c985dc6ccf43d5cb3fcc8f4e91f6 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3dff92bdb0d4bd34ecf08c0c024d9aabfeb9dc6407b55b55d25835922bddb9c +size 524288 diff --git a/triton_models/interactive/1/weights/layers.20.attention_norm.weight b/triton_models/interactive/1/weights/layers.20.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..a4b06c9551477c77ebc9de6151cd219a9c13f63c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4dfd453a8ca7eaa0368df85c67b0c4520d044c50e21e3e9c642016e56425fe2c +size 8192 diff --git a/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e0aa342e545feda824e44af8745b7bf6714e3672 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a12408ddaac163c3473e187a838044bf3c05b1a72758d6b77338da700a74f845 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..595f2605064e623b1acbbbb39aad1abe47d2b5fe --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a20c9c4a6621e851abb268c647e4f9459277dc53bc5f64a0504562c9e7736b61 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3881b21e76f4c55a6f5a94d56794ece1d12912e8 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e13a13177f50e58cd454dfef4083e8b8da065d25bd277aeabcbbd65d9c7ee2db +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f0c038b596c5143988722e1d044fdba36b9f4c53 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2bb55062eaf5f412bae85c9ac428ddc2e0e59d0e53ebd21abb1228cf4d1ea3c +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.20.ffn_norm.weight b/triton_models/interactive/1/weights/layers.20.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3cfe4cc50ce587ea9b564a20130b4fe2225d7d52 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37c809eef52d6f683a42650531b04e14b95934556c2f3607466882fff2c7a049 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.20.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.20.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..3fe9d60389494bd97b6721514bbf76a4a2f4aeea --- /dev/null +++ b/triton_models/interactive/1/weights/layers.20.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97801b00a17ab91f1019edf80b667e915c772df1461e322cb8602d8bd831a8b1 +size 16 diff --git a/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..905d5eb82f1967282905cf3974e526f1e48e2b90 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:2792bae2516c6d5167b1efdd66141ddc18439be883865eee923aa0d64f3501f7 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..9a1f6b2beb40845a92a60a5b1ea44afefad5446c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.21.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:953b7c49b7ba4bab3b5ab552b697d5be9184144ec4f8f6ea9815a0e12420a4c6 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.21.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.21.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fbd8d63b76ae1f3a0394dfd4c09e724627ce656a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.21.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f719914491c7941474c1b6efa5a79541ade54eff71a6d65a28dcff17baeacd89 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.21.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.21.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3199f31825d84cf98169a9ac8361fd01195c513a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.21.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21e70d0275306b0d766b533780955602dc9d5163028c509745120b4e9dd070d1 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.21.attention_norm.weight b/triton_models/interactive/1/weights/layers.21.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..ace9b471c09970005b6d8dcb34406ac8671f3340 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.21.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f5b37279d734e53f01e524b941104c4a2a0794819cb443255e46130190eb060 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..93ad736f2b44139c784864069aece4a59db96543 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7298a7ea1a9a2f16bfcca14510dce8da6342ceaccf48354e63945a00c86a8887 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a7e502a74af20d234730806f84f0ee0fbec81a3d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.21.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90e896e7361f2fde100ee9cbf4591ba2509c11ad2e06ff9150614c28f39f6cc7 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e129776d2c3518130aa1688eefa5ce1d57e1f1cb --- /dev/null +++ 
b/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0415c4da6fb2feb289a75e84a73c525272f0098ee5c14faf5544454178576f62 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..60435a424658f628b48358ed84954acb2782b727 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.21.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ff5c969303a6b351d8bb80064aad2c92e8c5c32d85bff840317ca0739ced463 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.21.ffn_norm.weight b/triton_models/interactive/1/weights/layers.21.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6655336998857a70516ff902b71f61175fd1a6c3 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.21.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8042770bf17c4b7520332fdeeef3decf2eb77871e6d80a2fcfe79e850827faae +size 8192 diff --git a/triton_models/interactive/1/weights/layers.21.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.21.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..68bb063c7fe76ee11dc858fe2552eff20f89fc06 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.21.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:babef4e3b7889042e89f865f3c8bb53f6191e2c9329e3eb418e0627256b4bbf7 +size 16 diff --git a/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..26e5e328af67eb6995b4eccd4f3f47e2a5572bbb --- /dev/null +++ b/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3845fa57cee6ae1adc7c640c17820f11d196a86138e3ab1b26d1fcdb5a12d480 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..25e896649de6e4eebef3fb52b4695e66834ea627 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.22.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a8fb6d26d3741fbf2dbd24d9e96a689ce0d8311349bc7b7d487a94ffae7309 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.22.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.22.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..30d513ba9872686a172b2e5bb54d7dc19c89b18b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.22.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e8c0a44652ccfbbb876d6c56c552653b788b14188b48f41b957d17036111f93 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.22.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.22.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..63489132ff37547f3c5a7082e39f7d6e60d99e2f --- 
/dev/null +++ b/triton_models/interactive/1/weights/layers.22.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cf24c066812a6a36df8eec192b40520df7d10573d5a2bfd2327ddaecf6e938a +size 524288 diff --git a/triton_models/interactive/1/weights/layers.22.attention_norm.weight b/triton_models/interactive/1/weights/layers.22.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..67e9beee3472ac10efd53bef75c3678f86f0287a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.22.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87603494aa61475dfc747464841436f303bcf654dc27b1a07564f53558ebc0e8 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a6f81f752873c957d60d333f567fcf45dc101888 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37604a1d32f8001155e15ab4e13282b050da543ad0d0a25b759081246fdbdb15 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7fc132bdca2ee4128bec7e863686fdca2f7aebf4 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.22.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06d1aced0b15076b9f26d4ea4f4f6b732368d7b373e7a588635da39cb9db5f39 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2679586d03d73f48a045c13e8c8b19ad6eaa9b50 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b2a9ac0ae91a96deefa360ba92e79339705410d925b2356b9815692ea31061 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7216f3454da54e1117fd4e92befe84b4c8b46a1a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.22.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a99b63ab8c94e4d8f81bc8cab1561f47e3c2bac9f6e13f0b23d9438e02d7d1e +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.22.ffn_norm.weight b/triton_models/interactive/1/weights/layers.22.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..4d71b5ceacf9dcc9afaaf1adf8978c2911ea951f --- /dev/null +++ b/triton_models/interactive/1/weights/layers.22.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309c8793e4e6d01a426ded64878ab5bb81fc897a4369e2e12e180067d9e2f97f +size 8192 diff --git a/triton_models/interactive/1/weights/layers.22.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.22.past_kv_scale.0.weight new file mode 100644 index 
0000000000000000000000000000000000000000..265569647dc54011c0c7aa312cda60679eddf224 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.22.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a792b8d14741661477851bbe77b6f5dc4fecf7ce07009fb7d6bd25090b2ad2b +size 16 diff --git a/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3c4b6c3a2d7fa4c456839afe2c5df63b4801cf29 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a664f7c9133d9a3d3f013ae68b7c826124f0ce8ee3e2a8b7a3d412fc4ce18c +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..6980321a22d78892613c341246abfd4fa6a6ec1b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d1caf7d6d040d5052d79ec08aa4282d486d3fd63e54ce73293b62776d97cc01 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.23.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.23.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a959f9c51c2010dee1865544214aa31aca8e384b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019ccc843a3257c4a7b36900f96de821382e2847851af142ae89a9238b434b20 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.23.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.23.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..63ad5cf1b74567dc10825bf3797cef1aeaf45b20 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80a82f597426b697fe58ed646f41dd9a6f4514d8d93e7f2791fac932dac100ca +size 524288 diff --git a/triton_models/interactive/1/weights/layers.23.attention_norm.weight b/triton_models/interactive/1/weights/layers.23.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..95ac563b56807e330af49708f5e09a5b5d763971 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d621b52a30d8a04c1866972255522c844eebd9f0b57ee2b90fd4f8e5e7ba07a +size 8192 diff --git a/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..070dac5924104453edc840b81f83c3af7c79534c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95a18e90a00cd47b6fce45cb8c1eeedb6ec2b8fed6f0cd8de85f36cfd5dedee +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.scales_zeros 
b/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..53c5e980f8815c039d907e5466820c61f9d1076c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6d90f0468717c0bf1b22ab4914319697011c4ee53f13241c0ca1970acc3331 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3dbd1908961ec50661072cfe35a0e65123ee0522 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1036d81bd9d055c59bed34241ec3328c1035676dbcd78a0186946147c58af98b +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..377898876f13249c94c85b69c632e4edbf89ca0d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f354eef95b3a2007598e99428488351bc81e825cc08c8a22beea2a74432f0e91 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.23.ffn_norm.weight b/triton_models/interactive/1/weights/layers.23.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6034309e63a873c266790385d8a50379dff8c851 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36a712b30e1f4b920e2bf0e553bf62898650a968b94cb544d4c0cb45dd9724ba +size 8192 diff --git a/triton_models/interactive/1/weights/layers.23.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.23.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..2054dd9b5bac4cc5f3947a6a29b0a00ee9c8f9c6 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.23.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:362bc48a1da392c1d9c1404743b87e700f048e91e2236c0f23136126cbd17a42 +size 16 diff --git a/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..90ca332aa05b52f6a6c1174451a057235aeec1f3 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5cb069457b3e48f9401929077bc5a44b988b7741941ed8157cf23fc0af8fa2 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c424c3a6af59cdb2e6cd3d2acdd6fa6b8585e46b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47c34802342bd2a02dc98d311924169d7abdc703e43279cffdcf1422243038d +size 786432 diff --git 
a/triton_models/interactive/1/weights/layers.24.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.24.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..183cbc95eb079e344c88e1fa4774f568a66dbbd9 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6468f6b524dabe33d4487522c605b92a5c91eaaa9d6b39433dd31588bfd09215 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.24.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.24.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c435ad2044cc72cc87bf58ea590aea7b6e463349 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59fa63a2023ffc20a936686267ae08fe6c793889ca330e0fb0a44ab2b5fe8041 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.24.attention_norm.weight b/triton_models/interactive/1/weights/layers.24.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..dccff49fb462091aab55a0c4eb163652123ff7d5 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d38dd18c9fe84631f30cb2b7cb92efc25473d4ba1c438a7817690ed3bbaabd8 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f0bea0526b3fe332953eeee191fd4d279f3a8286 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db478db4b91a673763d0252f233423fa31c7a562f80cbc6c106931886d56e253 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d233c239c539161b7c5f0b5f890f196d9c544c2 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5329cd85fc6390d7fc596abdb5907e3c2576c2fb6fc87d7c0dc2dbae326a826 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d4c99dfed4f5fd009c04c0693ddd1253dadfb80e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78e4b556d2c58615b1f3bcbfe8780a1217bc0420383b55afbf6767315ca09e66 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d61abbf087e7f17d99482529ceb6649e5f98e4b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9861b1f0dcf30259bc7a9d1c02969f271b805981c696d49b1dcdd939a7ff504b +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.24.ffn_norm.weight b/triton_models/interactive/1/weights/layers.24.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..a5247850bcab46ee044a136c8ca64f1223e6f1a7 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64ff3faab2a3c58cde1f351d57bef281660b552a9dbb9c0aa49bff00dcd6719 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.24.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.24.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..3a9a25a5c3ba55692571909bb40b460b6ed82ade --- /dev/null +++ b/triton_models/interactive/1/weights/layers.24.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2ab419befc2e7b0391b3b7e7bfa13bf728db0d6cba53136aedc0802a4fcc8c +size 16 diff --git a/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..41c3344f95ab3594af8a3648d644979c8b8a3e84 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0971d51d3ac5fa3cb80bf7adb2616878c3921d6810a7b8c312f2c5edfc20ba2b +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..380f67b6fde572f2eecd73076b154bb56c631ceb --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd9d2322fc1ac860eeeb0ae4f57b15011ca5728cab0c2de14ad0734c813b1070 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.25.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.25.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..58a080a5403fbc6975a8c92d3d8890d106c41f32 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42757d1b84d12da08d617496b557df5dc43260ad03444559342e57effdeff897 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.25.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.25.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a623dfbef7759c22ba42888f23b6af5e7c88703c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc49597aa705026d30a172bcee0421ded59135ee57d2d1a38d511274fd00db51 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.25.attention_norm.weight b/triton_models/interactive/1/weights/layers.25.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..e330398be316b3c7d2b4e8091847c876352631d0 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3f978aa26bb24bbd527a1e949719d548e1c7bf7d30f04b02f0f28d1343053132 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..941b657818aee3d6c553e08ef74566cd98e55321 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:063a4b6c0bb854f67986762bafa9651778da009fd725fe723fa47306a99a845f +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4df2b6e64935f05f8ec6ea3db6b9723c6ca0a7bd --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4a77dbd2274b6de3cfb89254d1cb2c0af54b304bb9134a280cbe9b620a361a9 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a2a36f211eb8cebc2e1ce26bbd4bcd9a806cee31 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1626e0d17ba4f05b0f1e65537f46ada22bef2d00deb136c30dd6bb481b617d58 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..09e7a5b567087d78bfcd3614b11b21106f5f8f59 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d9b0e50a31c6c29d57500a64edf731ea04db50967219bfdcb0853730c574333 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.25.ffn_norm.weight b/triton_models/interactive/1/weights/layers.25.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..026c4beed926345148e983d57a1eb89a25c4fd1c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0eea4a26418b7a503c71abf443da9d784c2adca6551e4f1b998f94d6145d696 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.25.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.25.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..67871afaf8d1df47fbde1f4a65674ded07d4a864 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.25.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cad249894548c60911d6d65a7d5846938c1e479698b4466d4cc6e03d2444922 +size 16 diff --git a/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..8e3258b77728a5579d15c2a374b61be41a2afa09 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b3b88ded4b32bf8ff5ab7fa3616ab98f1bfea6fd86f37b729ad69ffe89d33e97 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cb16882090f73a8651b55899be0c7b66b7d89aef --- /dev/null +++ b/triton_models/interactive/1/weights/layers.26.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1303373a67371e1e2f3ed25bc8cd8e559b9503bc5b4fdc37bfaf758cd26acfb3 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.26.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.26.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f65b33bea38f966cd6cd26980998df21898fad28 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.26.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da07e11c5ce840df7eaa7de1ddff66356a2995b93b6d1cdefe1d96f6d4eb62a6 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.26.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.26.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e34f9fbc1e33e117eb223353e64a0d03c3a1ce09 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.26.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec446a339a8b88e9d35b0feb0dc82c82f64420cc45aa67b0730bc6fdfeb33b24 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.26.attention_norm.weight b/triton_models/interactive/1/weights/layers.26.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..bd89d7d2bb2a10e4537def6bc6550ddf681db645 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.26.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:452e37de79706d39a7fddbbd901e8353363bb41bb1178eebb42b0a9aad1998fc +size 8192 diff --git a/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..ef1f200bdb37b79404804e211dddd09441a90cfb --- /dev/null +++ b/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fac2317afed02f28c9f68eae5e04821f1fea2d7553bd4ce30b68b9a7e896be65 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3613b7754b7de11bd7146b2f99bbb2aabad43346 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.26.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e892079f260d62e05e5169a508c1b50c3beffc1e568e189b358850a9596863ac +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..42508b0d05c03cfe54875df80e5848f92e3a2148 --- /dev/null +++ 
b/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b2ab3bee38aee899c1454a69dc424ae61b6d14d67438c307369be02f6460085 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..6078af07ebbfebda87b1016fd58cdcffbb0b4c73 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.26.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:552933cb4c5ad88c47fcfc8c8982e8a9d6c2bcf4975d0a1ff17f85a0de9a72a0 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.26.ffn_norm.weight b/triton_models/interactive/1/weights/layers.26.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..659727ca29164c591b4db04c441375c79e981fce --- /dev/null +++ b/triton_models/interactive/1/weights/layers.26.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a474d6dce328dea51c94d84fde68d4472d68dbbf19ce347181b5956b98d41847 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.26.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.26.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..db316b10f011519fdc39c70e40706bb6499001f4 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.26.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d995b27407d7307c6a5b4a4fa7f6247eac5d8c1cc62c066c9bd4395d0455a939 +size 16 diff --git a/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2b398a0b63fe43f5bd6467e9001673b60b3d8b76 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb11cc9d2229d99f45200d53d2430007eca65a120d988a8ace070a0e3754128 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..36269d2bb210deac5bfb20fc68c3a3c0ba2430d9 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.27.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b885790c722268908e56129344337198b0c0e4b3bf5e21a7f091d0846a5d30 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.27.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.27.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..75c54cf768728053f1051c6d1260296c943bc2cd --- /dev/null +++ b/triton_models/interactive/1/weights/layers.27.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46493db19a5dc9a8d01151f769f22f10733969cad257ff2372fe9ef169efdc7 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.27.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.27.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..91523912e1e6240ee472d551a8422724c7f9396f --- 
/dev/null +++ b/triton_models/interactive/1/weights/layers.27.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f80605e605d11e0f5a9e470c80c72859f9651f99f3db043b9eab3989fffd647 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.27.attention_norm.weight b/triton_models/interactive/1/weights/layers.27.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..92e464dfb802dd2cde189e137b6e908acaec5c38 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.27.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b48e7db8fe774bd46f4eecc92ef7f6bde3cb8e3ba66836e6cae00572ea0e14e +size 8192 diff --git a/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e7392da13e07a3f00396eb1965e2c22daece98a8 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a634ce6c3f2743a5e0fa245a0adf32df70a41dc7c969d40b1a3197f0436cdf5 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4fadfc7e45425848c37d17c3f39ffbbb822a8c78 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.27.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc138f3c7e31e1be2b6e2a57d7d5a2ffab4fa52343122dd272e41ac4bfd9096e +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..aae88c0abda360c16b47ef75abda1c4077edf25e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9052da467e48c0c4138fd3769e456cb753464bb30a03a4942846a5b3877131f +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3b2fa2b516a8c83d6eed1702e517e005ac19f281 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.27.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1f67441bf5d4f5ca51f1f289e07a3c59907d324265741f76ad966bf1755749 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.27.ffn_norm.weight b/triton_models/interactive/1/weights/layers.27.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..c15c40329868b970cca611aff6e2bbe13d48abf0 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.27.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fda3309eb353c9341280ab8f2a516011494cba8b769560e91cd0c9d27fc6561 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.27.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.27.past_kv_scale.0.weight new file mode 100644 index 
0000000000000000000000000000000000000000..d8710f2aebc08c7c65db4a66ef9daeba362df5ce --- /dev/null +++ b/triton_models/interactive/1/weights/layers.27.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2367dba495b15a673a5e8f907f19e98254caa8845195d88897b3ecc36d7c794 +size 16 diff --git a/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..11c1eafa7f15149287cd144977ef8e5a42645397 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1f9e7857882c7a56236572f8a03d72222b257c8d9ed6e2efa1d66c6b5e21fb1 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f725cdf5914a0af48485baa5a948fb90c3030913 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da00a72b006477cacf5f86157b6206faefb0b9a1945fed4e5f2a2f9fc9846f55 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.28.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.28.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..294eeaef86a93508f7f8b171fb8a303bcfb5602c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626eff3b0dc5215c6954f774fc8116aa989824ab9c971a3782d8bce5ad31d0a8 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.28.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.28.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..90a1002de820fee0fabb5d5081cde6d434fa08dc --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5faf82a3313ab0b53237e677fa72b3b44137a47ab5f26d401a3bf43f5beb1bd8 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.28.attention_norm.weight b/triton_models/interactive/1/weights/layers.28.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..1ec94894ca9c51e452e351065e83a91a22a1d264 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac4a8732ba2c28970db1dc7e821bd6c8b0e4de12f8de1b6bc6692840154562a4 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2ad5905fe8ebd68dafedb5c0bbe70d34f3f8c71d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f375cdf0cd1a60d7c9d00319853242606c44be5322598f91dbff37284f0ab67 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.scales_zeros 
b/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f8676ba3b145e257dc1c75c1f9d9dd86413bc37d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f57f5b0745ad5281aa67d83c0da6f1ebc7539dff487ae1345761bf995aedb1c +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e3532b664b06cd727ceb44f27462084bddb160c3 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:393b972c36770d253df01db59d0c889a018a26ec7a18cf1e69617828344e2ed4 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..9cba65bef1506cf3787aac95439d21334e5424fa --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f4650f45c05fbd9d52eade717d47d32b1127ad57db10133ba490f5af3843551 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.28.ffn_norm.weight b/triton_models/interactive/1/weights/layers.28.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..0a50537a8d1863c6ea2bf1177d91c15f67d42dec --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ab58696d625c79d618dd907bbeefb29dcb441a358411ed99c0f88e8649e74b +size 8192 diff --git a/triton_models/interactive/1/weights/layers.28.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.28.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..335aa2710f889028753142ad7c1c770b5aaece8c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.28.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be67c63310802e47b331969149928657a52d9caadc4dcd0599f0ed63fa8fe4c3 +size 16 diff --git a/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f7fb2a0c283d5309b0acac81e3f78bf535e119e0 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:964846927bb91f85e501fe1626e8958dba12656845d1c2963d6f0d31ba0e6fe9 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e4616ace3831b1353261ce821a222788574a6a7e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59389b1002ea4286ef68d6a28a48de0070a8fe63bb33881a4ea5b4d4824b586a +size 786432 diff --git 
a/triton_models/interactive/1/weights/layers.29.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.29.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c339b504ad1ca7893a586fe0fbab27e0414733d4 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a9f306da7ef17418be8aa9f47f97e653aeab2c155aaf1f32ea93c6e3e424c19 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.29.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.29.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..880d7d9c3c95158609d1215b2f6bba14a3a6c655 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1370f068209c9ab1f42b6657508b06a3511d1d2d8d2c5b5988f4d58591d40279 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.29.attention_norm.weight b/triton_models/interactive/1/weights/layers.29.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..dc3408e864d2f349f03d2ea9f976241c0dd4ae19 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0136d8df649cc27c395128240a43f899929866414704347f851202cc638b9ec0 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..12bd5dfc4141909486de6f81eb5de2cd0541f243 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90f34915975f77f41c0057ec1ddc7e83098a74c6efe44d5cfcbd6252f7483773 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..92ba76313e8ccbbbbf563a230bc24e60c122fbbb --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56814e27f2fc6ea900d3623c77d1df558ea69fe154c99fe57fd45b6567a62186 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..850b76dcf051ec7876aa7626f2aee3c02df70a73 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e520a4a76d63d5f4cfad6bb9577ab1343c24d563ee6491b0120e8b8f605a24 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d8434eea29d62735d93ec7d3ed91e73a56773a5 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8a45ecef0ec7bb53ccdd1499338dfc1590c5b4d4e64ca01119d8e2eac40c5249 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.29.ffn_norm.weight b/triton_models/interactive/1/weights/layers.29.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..cf3ccd85ec2a836282f95d8ffa96f001a6c78bfb --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80000d50b78aad7b0076bc159838fbc0e679d1b07aa00f374142e40c5fcbba01 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.29.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.29.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..135fea41df0db406183c0c705ee1bf4e15b3d938 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.29.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2334dc6b4e2acee8b2c60625419023d8b5cb9692341970a8cb0cb0950658940d +size 16 diff --git a/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..741f2dbe9906898116ac1c0bcf6b6f1305ac0c7d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b023e843f1b897e2768f8aa9d1f18e1a2fcb8a17ee904981117c3822cafda263 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..58882890a176f4e5d124ddfbdce381fc920d5b9d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02c5a27de7ab84dc800a722021cefc12233818ba708f7ef20abed96d1efa3b29 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.3.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.3.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..28835af03e975d2a253d1b43e9094dcef5665859 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:961c0e6293f13ca0eb880f274fcf96b1394f554b645856d99f898ae03ba05ab1 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.3.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.3.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4941d02a83a0dab878ad6795511df8e08e216ce0 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6a94458f402b8342d3936d5c436bcc1125e642d5216c1cf70ad7850d134dbdf +size 524288 diff --git a/triton_models/interactive/1/weights/layers.3.attention_norm.weight b/triton_models/interactive/1/weights/layers.3.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..fee571b50c58b11c6d17e7daaf1a1796af101e8a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e702523cc2696abf9ea5f86ca0c3b8110cbc92f9074f3573cd0935519da7f326 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6576fcc897f882a63b4376d2366b8a16b75529b2 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec59414d327ec0ca8adf200f8593102b1cbef09d5a97e88f7e6f3d1d941e32d7 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..39bfc8b9158d17ace10985a0aefa5ed9b27c830f --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:592014759039919238673a2d601e2d397b3eb60f2b684d06201310dc35e6f870 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a2dc182c2e093651d77ac65087453506558cc6df --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c67555a8eae4e6cc55420ec37ea21933418f802190fc809bb33855011f8ec82a +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b12f9eae6cb382f2ef562f1e7dad7d8f2c7f4f48 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8d6409835e70b1c0fdf81979b61995fb90f43381277f9e457070df5a91229c +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.3.ffn_norm.weight b/triton_models/interactive/1/weights/layers.3.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..1ac16014018db6a631b37da0836ea438c9d2fdaa --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b485c2892ea53a76f21e84c2ed42436b05a41f5dab146fab77f25d2b506ae53 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.3.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.3.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..86f8adc521ad298ee51185ebf02afa53325facc9 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.3.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76bf77db19b1d0234ee2da545c98ee3d5921030e6deaa8b2742d4e9d400d7207 +size 16 diff --git a/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..087b322573894903eb8e5cf81dc0e4962ccbb4bb --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6b085323586c5f61228e43ec3cf935799c983d169abd417a55a6c3f82cd255a1 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..de17498ac115e410694314f9e590322ecc3140ef --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:602a6e94ab5a7bda70167414ea1e71c46be0e7b46a69689d093f991dc6930079 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.30.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.30.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e9eddf6db391e55430e3ca4f04fc6966cdb3bc10 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5864869bc2f57778cafb236ed45dbcacce36836e1c8b3dd94fd1375829174baa +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.30.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.30.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f810acf8fcee1cdadd5b34adde32f9c37b177343 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c899fc162f4dbec0809e3059f9ed0ba9d3004a75d31841ade9aaf16df93493e +size 524288 diff --git a/triton_models/interactive/1/weights/layers.30.attention_norm.weight b/triton_models/interactive/1/weights/layers.30.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..ad23a4893d3cffe2d398058b89dc78f528c91053 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683f799d6ecb59ef5b47ee78d4d1653b6a49da4dc6c6865734f2832457ad888e +size 8192 diff --git a/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b61119e589e6b7759f74e927ba8c5a5286eb965f --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb97c170f0415eeb563dfaab343a6b7c736fb302b605cf65ac29e190d485f03a +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3f892216a36905289e63b4b93c0eaf050e7acc02 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:debf89602b57cf687b1f434d484beefd647c3ea0e8305484658248c8238a347f +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d0743b7b13a262d47d3c95ff5f00bcf70dca3937 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:00143d530f528cfdded636568772b1ac564990d10d52c943463e8198b0f45b22 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..649ffe4f3c74051e77a62d2bd111b1c8956635a4 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6881934dda1754f8b7bdb5619bed9e9ec7cd819080a5080d36c545274e7563bd +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.30.ffn_norm.weight b/triton_models/interactive/1/weights/layers.30.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..19611f78c82d05c2fa778fc4099462db96768018 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c07830c7b5e53981d0d97e28af650885ba42b1395e88e2a8b553c080258be805 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.30.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.30.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..ebf0f2ce5ad46a9897b292cf74ea4074253d9e00 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.30.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a7079eaefe501289467f67ff3ec35deb358c17022eff2a2d77c011d87a7485 +size 16 diff --git a/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..33f1f7e919ab93f0f093697cc6564c8041cf7c9a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42e8c9373e34e9f38c5aa5b7f9e7282f283dd138fa488699361a998289d4f0b8 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..51b423248b2e8762a232cb9f6524cc2d2882e6a1 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.31.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e74870d817de1f15c0b372de19d9049754192d574290aa47cc2da4114e02fbe3 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.31.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.31.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..7976fa7add831d946d9634761ff8db4d07f69a6b --- /dev/null +++ b/triton_models/interactive/1/weights/layers.31.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:882c11872607c376a08d0e7ab4025ebae8050ca0a958b4678fa7c5f5fe34af8c +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.31.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.31.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..87b74517a018f5d65e974fc575140a80f0cf2f63 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.31.attention.wo.0.scales_zeros @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:780d8a3fc0d41d7e42ab7524e0e8eb3a5044627584cb749954a08d74e8889cc2 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.31.attention_norm.weight b/triton_models/interactive/1/weights/layers.31.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..9e1759f5a7b8ce3bcbdf54ac4a167aa2a3836eeb --- /dev/null +++ b/triton_models/interactive/1/weights/layers.31.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13b79fca3496315c35d45be930b96ac34c0616ae9bb69018d41d4fe7d77fa1c3 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fa724a72baf441d9817165d242ae54e77b819e7d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d6490623b97868d9d81417ecbbc40bbcf24f872882ca23b74a76f6f384082cd +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4e046750532412be4588ab28e7285c8f68bccf2f --- /dev/null +++ b/triton_models/interactive/1/weights/layers.31.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b03dd848d3c92adda40904bb369f812d1a2de1d72e53600bdf89cf3002aa5e4 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..7954c17e1c4aac980fc31bc92786998b66007879 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f491d3ff06bae3646c8cabbf8c8b6e14963e909e5a3f2cadd84931bb1acc076 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..1f95fe4038958211cbda9224b4161cae99e0c2e5 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.31.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7895c436da989422f207c0631685485aada8b0cf45d0db3bbf0cb18b8573d8f4 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.31.ffn_norm.weight b/triton_models/interactive/1/weights/layers.31.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..491eadebff5c76dbdda444c927fd0bb153d54dbd --- /dev/null +++ b/triton_models/interactive/1/weights/layers.31.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b010068e8df791fcfd32ddefe46198f72adc5cb104f59512820541ed232ed52 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.31.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.31.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..9ed6ce58e195ff81f658649f8fbf99311dad0183 --- /dev/null +++ 
b/triton_models/interactive/1/weights/layers.31.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcd30ad8a1a6ae548b3b6cdbe2b3693c1d260fcf73e63e4cb201f4ff3a9216e8 +size 16 diff --git a/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9efa7ae8526ee807be03ca3903436c1c4e096b2a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd04897e691fff067678bfb5826f8c0dae0914c4a822266312a9fd08f9c8dfb9 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b717a0bccf881f43c4dd4849aa9abac991f829b7 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.4.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a4e0a9b4313f6f28361952f5e1c00250e0bc8d8e348238f634679cc9983d4b0 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.4.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.4.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bbc885705f67c282413e4e10b430177fa24c64d1 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.4.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83ef42f037338f04aa63a71554b631e20e2cc1f4c44d0498061891de5d46dfec +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.4.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.4.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..0dea56a4d1087a93efcf6c1d4c45d4eddcffd41d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.4.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92669ba1e130035258630c4bb58a6ae23088baa4c818edb89d18126368fdd2b1 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.4.attention_norm.weight b/triton_models/interactive/1/weights/layers.4.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..85901d7d4381bcdd1d25c69d8652668e9e82e4d7 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.4.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4392ba124c790351e1e804e3f6954b04df59cabe55918fb2ab208b9fcb1a25d4 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2eecef389220ebcbbb1b399d81d28d5c7123895d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efab7d32785919b64059b2e20f610eae03ee8a2ba95bcd5c2d786e3074f66875 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..203aad693c83911b91ea533a372c2414914f0c33 --- 
/dev/null +++ b/triton_models/interactive/1/weights/layers.4.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:624fd673a1cb8d5eed0814f7d0ebcfa6de1f0933f2c808a43fe9915863d06992 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..22624a1646b9f3bc812053a3e4eccd3aa066e8cc --- /dev/null +++ b/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a9bc1f9a857eb51f12e913af082a9d065232ad278a46bf3312fee70b57c929 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..ba1d032b1632c72d516bf607d69ef9d858ec3f69 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.4.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f5a160ff8d293e97b6037541c207caf6ea4b15e625bd94dba7be81f1aa3052f +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.4.ffn_norm.weight b/triton_models/interactive/1/weights/layers.4.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..10fdc6cff9055cfb29be992fd58fec67e3a1e156 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.4.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7584bdc2460f81e60ad3db90f314b1c3c0bb458b724ad5a8ef2f6b87991871f +size 8192 diff --git a/triton_models/interactive/1/weights/layers.4.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.4.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..8ab0548585972c0f9a19539e4f0246ed192f0042 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.4.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:734c894776290dd532cb25f542e38b56c9151c45fb751e1d58f5aba3c1cf86ce +size 16 diff --git a/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..04ab0a16f4f6b5b500d30b4b27152a073d6efffb --- /dev/null +++ b/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f7240f7f94715ffc2e22da1e1986a7738b3a81d2803a89fa8d467ab37d52f3 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..35b017f6b8442ef2ed28b4f1d7f2aab7e6c8f3d4 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.5.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f30a98755d5e88115a8343930c20bbfd34ef8095694f4c0709b299e0ee587b25 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.5.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.5.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..4b270cc9d0768c5834bf5dee3db2ae53b9d1a2db --- 
/dev/null +++ b/triton_models/interactive/1/weights/layers.5.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c2c8b87162bc3f8d4c6044cbbba5bff1a0b4d484418966d683cd8edd5ffe289 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.5.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.5.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..2170f6316f894a43c57df7c6f3b6435d6d290e59 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.5.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a0bc293e079e00c8fb29ea166613fb81fc7a51dfae01bda404298bd3541858 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.5.attention_norm.weight b/triton_models/interactive/1/weights/layers.5.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..e56c76ec2f895f4ab09e315bcb026a0cd110898e --- /dev/null +++ b/triton_models/interactive/1/weights/layers.5.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e322bf9e96c707a007b6cf18e95291034a7b4acc28cc9c868ba72a2067f42a4a +size 8192 diff --git a/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c0603e429404aebb532d112009658a498d6a25d2 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b111a37c3e4700a7ac8bcc755e22baf0cdd205a4f64cce28587b12e6bf542fa5 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..54720e241e1c6574c937ac39760a84933da14ee8 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.5.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccbdd88d473982cb63c5daa191f2956e0826feff876c6303ad46054ce474a9f3 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f69f281b519e24e86576e49e914a3f29b9833837 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d055b75469902bb480fb2470766fc359100caf6f512e030d846c895cb23501e +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..39d27ba627be29fdb76869d39b5a02b38030a6a9 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.5.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf2b8068885689ca049003d3dff4bc8e68b47ddb9be7d7fdd56b39582b7fd61e +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.5.ffn_norm.weight b/triton_models/interactive/1/weights/layers.5.ffn_norm.weight new file mode 100644 index 
0000000000000000000000000000000000000000..8f90bb2bd06c0ff2405bb8ca61c65441dc384653 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.5.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c886bfe39172273f70831164b7b87f48054c0da65cd1724be839673c817009b9 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.5.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.5.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..0032439aec9359a437391315477b7201d232b7ba --- /dev/null +++ b/triton_models/interactive/1/weights/layers.5.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b479855806803e6c485764401a2ed76b362ac09f2606a6d58fbba9b134ee186 +size 16 diff --git a/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..08c09cae235117db0cf2be801f075c4236bd6ba2 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebf9ddd2465c02a1a37bafe82e009127d6cbbcf0bec3b323eece36934bb6eeff +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..78b67e25716cf86de09b47dc537db6ec420fd21a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b39acb9cc4de067c3ef5b0128c253ad0b646756445766d91f2421ca30ab6e272 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.6.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.6.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2d2cd5ddae6f67b08f6610fd6bfd8fe17ff43ad7 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81ad5a0787961305a05ec9b7c0fb89cc2aa70589a36efea39557a8ff33be93c9 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.6.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.6.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..380b6dedbd40afe6240e0271cfd0000ef9f17b01 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edadc4493b3568ab5ebe758a1aedc2ef5fefcd688f5a78eb1866379967ca1cd6 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.6.attention_norm.weight b/triton_models/interactive/1/weights/layers.6.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..68cf1e82a5f3d60ef2c37bde39437efe411c0263 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dcd4367593812ecec39d8b1ff7cd21912c1283686db24be488384fd2453162c +size 8192 diff --git a/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.qweight new file mode 100644 index 
0000000000000000000000000000000000000000..f66c0c431c68905f3cc431d2b266b628bcc1f9b1 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3cc20446684f9b809fd52c40bda9d32c115789c650575c0e54f5ab030b7ceed +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..d158d234d215899f80ded95207cff364e20e0c1d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f01f13b1cd0cd8080d7c4906d71e44200b8053aa605a37069f1a9e1034a81f93 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..0bee7d213091341bc193cd21b808a3776987b7dd --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95865a00e74b9d37ba9c21241922979b4f26eb06b78b84b25be12bcfba617657 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..80f3f7257450ba5de9d4dabaa61b516c7c807046 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0dcaefa2acb86a25aedc25d60558af179bbf8968f1fd023b20343dad73b0184 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.6.ffn_norm.weight b/triton_models/interactive/1/weights/layers.6.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..b56799656e38d049d14d02b2d7e4ab1e470bac6d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e458ef7058c9d7734737447072dc2908dea9ebf64a2ebcef932e4d6832057f5b +size 8192 diff --git a/triton_models/interactive/1/weights/layers.6.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.6.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..70c460d32701c69c43ce43977e55d4c5e407b1c8 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.6.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa3e886e06b35057d676139206ed116fafd8c8dd29244eff07cf1221837e8807 +size 16 diff --git a/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..4bd1b6da8292c5b10b20dbee8e2ee7e95a46637d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0c4ca025a4e163c0dc2da98d463549125001a9cc93654f37907cce2a9882d52 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.scales_zeros new file mode 
100644 index 0000000000000000000000000000000000000000..8846088f9a04128c3626ebdde6d6747d1d663587 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c086c5de28164657905ed6eaed423d6244ae0368c6180aa26fc0a6eb89724a83 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.7.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.7.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c4891059c086711d0200456b57dc31f93418ba81 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efcb8926a09d3f78acbff4e19e2e5bafad04172d17321a6af2b4fe7974c40fe1 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.7.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.7.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a08abb8652ecda43c661807290bbefa793fb0160 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c0cdf8402670c6998b317082c140f0eb51c4bb0b41ca4e6386c6f1648f56a76 +size 524288 diff --git a/triton_models/interactive/1/weights/layers.7.attention_norm.weight b/triton_models/interactive/1/weights/layers.7.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..37c18cd18f7054a248d6352d4d5a25ac9a4175e5 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28cf5e25d536f7d9180c2eb1d7dcfd7d4bb749816849f75c5e09f0210cdbc417 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9b50669a9dc81bf91e567a299ee57d333907a007 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0581fd7f812265f9b47b8eab7621664a046c4c6f98279676df767aaf339eee7 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..93d6f40d2e5bcd8b2a2da3d12418121279963070 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f86e5d5f8bd7d8eded5bf5a5cbefc9b1b3242cdb2b486f6b1b0289d75f4df828 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9d07164c18362f5b0879cc88dbb43ef395f284f2 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b02b881d979d0fb77a4d705ed4bc68ca58e7cfa84a504d90b9e816ddd99a6b0 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.scales_zeros 
b/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b95f34d475e6c10781aca4639fbcadc9e706fc5a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0c7e60168198f2ac9347ac8eb4fc59ea42fe0380e24550cd4fa2e989a2d90b4 +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.7.ffn_norm.weight b/triton_models/interactive/1/weights/layers.7.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..7669f396fbea22312892ecc7e69f5847e3e3d0f7 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bce0233aef9e8401ea7eaddce5b44f2a28b6fd1018023ec3f2cae495f4d205b6 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.7.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.7.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..d2b299db6620c0abf87b67b228dd03b696854499 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.7.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae08ed15fa296e998f7e93b866fb5536103b357ca8fd0e8ee44423c4fe3ea4d3 +size 16 diff --git a/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.qweight b/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9a071d9e1c24a362c04a0f4335000d1eeeadbfea --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:312a5231076c36e023c30c18761d4793c7aaf2d1658f740a4ed6fe3ab9fb9532 +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b756258fc2694a8580c1d6d55d73c1aae4f88737 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:045eb164e9d18487951013b4a69dab786f034139e232a0c079e6c6de0b84d445 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.8.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.8.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..79dcacb0bc5ed37629a105bb0afdc20c383e1736 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:917ac6b4102a88cb5fe47a13834f30fb45329e8234e6bf4a6d5def09acfca138 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.8.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.8.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3f21f5d05d73002cb0251350fce183ec3b6f82cc --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:075ca25071e36779993618787bcad51f47a6210b5c7efb13836b9f0c39113c7b +size 524288 diff --git a/triton_models/interactive/1/weights/layers.8.attention_norm.weight 
b/triton_models/interactive/1/weights/layers.8.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6441edc914d86ab07b46c530e63df5e212499fbf --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7645c5cc08248a97031708e37a8869793e72e86be7d529ee2d38214aa125f326 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6b623d7f4ebef4670369d48905c1f66aa9b3fd94 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0a76bb17ba96c365a1bf660f901c21c3fc1d15165b0532e97c7ad86158513f0 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f7b56f5fefdb81227823903289604a2f9e33cbf6 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f6cc9bf35da7c08e89248a2d1151ca84f97e0d44fda2f474fbe090fa2b71bc6 +size 3670016 diff --git a/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c837700cdf510ee1df94f861174695bb0e1ccfc8 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d6a461146ce6fca245beab647f837c7718f50c1ae6d48f852becd4b88ecd68 +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..63ba13362b7c68d37224b01f241452a27cf8717a --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22f763f7c06275a5821c55ab0428986c7982da93d02ec561c4c1cf0bc83cb82a +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.8.ffn_norm.weight b/triton_models/interactive/1/weights/layers.8.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..c4ec482ee099d1dd8d7b2633b38f9546642f8c04 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f607d08fdcc7d4a7048194e994afa25c34242bddec4d56534a779484534dec +size 8192 diff --git a/triton_models/interactive/1/weights/layers.8.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.8.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..dae30d205782945d230c044159736e88b8c261e0 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.8.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55e7e6e9663622f872cb332c414eac32a102e97ffdf3f5a2b6afa6f8371e1a5f +size 16 diff --git a/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.qweight 
b/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..502cfce88cfb73bd839f1fb667fba672259c4294 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad1c9bfda707333f5860de8512ec7db789721d5f17e96ec0c1f79f98533c42c +size 12582912 diff --git a/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.scales_zeros b/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..47605d66d4acddffb2885150c9d68d184f94a9c6 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b5179dc3fba3abadb58abf409bfef33b382dc7373a002c3c43da9785c86f614 +size 786432 diff --git a/triton_models/interactive/1/weights/layers.9.attention.wo.0.qweight b/triton_models/interactive/1/weights/layers.9.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..0c3613bd080dd0fe0abbe07c8a567bf85e48e33d --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:535eb0ed2a008590448c38ddcfcf990219dd0c1752e28d11fe3310cdf4039d57 +size 8388608 diff --git a/triton_models/interactive/1/weights/layers.9.attention.wo.0.scales_zeros b/triton_models/interactive/1/weights/layers.9.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..bc68d0462949d41fb22495d6fc4d8a2c6c21b6a6 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee2d02d009e36ca78d86a48ea408c2017c21903b64400397a77f437f495d936c +size 524288 diff --git a/triton_models/interactive/1/weights/layers.9.attention_norm.weight b/triton_models/interactive/1/weights/layers.9.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..8493ee9741dd897107d9fe3cea7c2d01fdd4dee5 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcacb811b4cf62144e1ac2d3eadbafab30083e3420c46a92df1ab21840b29fe5 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.qweight b/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bcb62122ef3b2bf1d13099eb7e64cd4f6266f02c --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aca67258bcd3c39f17fb15a14b72cfe8ca597aeb30e0f4f298efa5eb093abcf3 +size 58720256 diff --git a/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.scales_zeros b/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3e0e6af0add56eeb2e1cf7bc0142e52be7a5ae29 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4b60ceaccc0af57c36de7cd69acf05d8c307f2d6d27a7e765e0f132ae95d17a +size 3670016 diff --git 
a/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.qweight b/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..399c1fc8d6cc43a27e802ca067c88fc4f9a3bc73 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e07e422f44ddda11dc7404b257cacd675b2b7f44491941e6754155df3a31d2e +size 29360128 diff --git a/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.scales_zeros b/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..9509fd872d04e11bf53f07f99129e785b2056187 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc346804097116087236c77f2e2c018922efba4f2e32d8a71ddf8b026c9d34d +size 1835008 diff --git a/triton_models/interactive/1/weights/layers.9.ffn_norm.weight b/triton_models/interactive/1/weights/layers.9.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..185031880012c613c2cf8937d4aa159e1c93a4c0 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98119ccde8c54eacba56311e43a7c74e62e30e0d7302b011202dea6a6348ba66 +size 8192 diff --git a/triton_models/interactive/1/weights/layers.9.past_kv_scale.0.weight b/triton_models/interactive/1/weights/layers.9.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..0ec9f90c9c5be11398b7b1bdba1df5b0975ab0d4 --- /dev/null +++ b/triton_models/interactive/1/weights/layers.9.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62cf0a7960b56038dd17b81e2a1c38a016c2b78bd7272299dee18ae8e53e5c92 +size 16 diff --git a/triton_models/interactive/1/weights/norm.weight b/triton_models/interactive/1/weights/norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..906361178f72cf7bd1f01447accc35bf0e1b633a --- /dev/null +++ b/triton_models/interactive/1/weights/norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efcd3fb0c1c5225c17e0eeb5b46068bb7311f716a4908d5a39d79b37985b58e7 +size 8192 diff --git a/triton_models/interactive/1/weights/output.weight b/triton_models/interactive/1/weights/output.weight new file mode 100644 index 0000000000000000000000000000000000000000..04e8f86f0b46051b3db62d5eefcbebda87641472 --- /dev/null +++ b/triton_models/interactive/1/weights/output.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0ed41b4df8f91647fc8bdd2aa61f55c39e09b6e063c8bd509b591797293919 +size 758120448 diff --git a/triton_models/interactive/1/weights/tok_embeddings.weight b/triton_models/interactive/1/weights/tok_embeddings.weight new file mode 100644 index 0000000000000000000000000000000000000000..0b3edbd16fbb690f7c781043ea905fd4380e5f04 --- /dev/null +++ b/triton_models/interactive/1/weights/tok_embeddings.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8986115ad7e59813a41c88c0d601235fa36138d6c15e5657a050cf4ec40fb037 +size 758120448 diff --git a/triton_models/interactive/config.pbtxt b/triton_models/interactive/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..f139d5b2234c0dfa94e3792dda985f9e8034a5a8 
--- /dev/null +++ b/triton_models/interactive/config.pbtxt @@ -0,0 +1,293 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "turbomind" +backend: "turbomind" +default_model_filename: "weights" +max_batch_size: 1 + +model_transaction_policy { + decoupled: True +} + +instance_group [ + { + # max concurrent instances + count: 48 + kind: KIND_CPU + } +] + +input [ + { + name: "input_ids" + data_type: TYPE_UINT32 + dims: [ -1 ] + # allow_ragged_batch: true + }, + { + name: "input_lengths" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + }, + { + name: "request_output_len" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "input_embeddings" + data_type: TYPE_INT8 + dims: [ -1 ] + optional: true + }, + { + name: "input_embedding_ranges" + data_type: TYPE_UINT32 + dims: [ -1, 2 ] + optional: true + }, + { + name: "step" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "session_len" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_k" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "len_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "is_return_log_probs" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_width" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { 
shape: [ ] } + optional: true + }, + { + name: "start_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "end_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "bad_words_list" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + optional: true + }, + { + name: "stop_words_list" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + optional: true + }, + { + name: "prompt_learning_task_name_ids" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "top_p_reset_ids" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "START" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "END" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "STOP" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "CORRID" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + } +] +output [ + { + name: "output_ids" + data_type: TYPE_UINT32 + dims: [ -1, -1 ] + }, + { + name: "sequence_length" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "cum_log_probs" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "output_log_probs" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + } +] + +parameters { + key: "pipeline_para_size" + value: { + string_value: "1" + } +} +parameters { + key: "data_type" + value: { + string_value: "fp16" + } +} +parameters { + key: "model_type" + value: { + string_value: "Llama" + } +} + +parameters { + key: "enable_custom_all_reduce" + value: { + string_value: "0" + } +} +parameters { + key: "tensor_para_size" + value: { + string_value: "1" + } +} +parameters { + key: "model_name" + value: { + string_value: "internlm2-chat-7b" + } +} diff --git a/triton_models/postprocessing/1/__pycache__/model.cpython-310.pyc b/triton_models/postprocessing/1/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa0ac1382a7864add3a9bb04e6b328fa6995f67d Binary files /dev/null and b/triton_models/postprocessing/1/__pycache__/model.cpython-310.pyc differ diff --git a/triton_models/postprocessing/1/model.py b/triton_models/postprocessing/1/model.py new file mode 100644 index 0000000000000000000000000000000000000000..20de97595195da5dedc044a31c6086c1f49892da --- /dev/null +++ b/triton_models/postprocessing/1/model.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +from pathlib import Path + +import numpy as np +import triton_python_backend_utils as pb_utils + +# This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served +# by triton inference server, it has to be converted first by running +# `python lmdeploy/serve/turbomind/deploy.py`. Then +# `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py` +from .tokenizer.tokenizer import Tokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. + + Every Python model that is created must have "TritonPythonModel" as the + class name. 
+ """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device + ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + self.model_config = model_config = json.loads(args['model_config']) + + # Parse model output configs + output_config = pb_utils.get_output_config_by_name( + model_config, 'OUTPUT') + + # Convert Triton types to numpy types + self.output_dtype = pb_utils.triton_string_to_numpy( + output_config['data_type']) + + cur_folder = Path(__file__).parent + + self.tokenizer = Tokenizer( + osp.join( + cur_folder, self.model_config['parameters']['tokenizer_path'] + ['string_value'])) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for idx, request in enumerate(requests): + # Get input tensors + tokens_batch = pb_utils.get_input_tensor_by_name( + request, 'TOKENS_BATCH').as_numpy() + sequence_length = pb_utils.get_input_tensor_by_name( + request, 'sequence_length').as_numpy() + + # Postprocessing output data. + outputs = self._postprocessing(tokens_batch.tolist(), + sequence_length) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + output_tensor = pb_utils.Tensor( + 'OUTPUT', + np.array(outputs).astype(self.output_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[output_tensor]) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + + Implementing `finalize` function is optional. This function allows the + model to perform any necessary clean ups before exit. 
+ """ + print('Cleaning up...') + + def _postprocessing(self, tokens_batch, sequence_length): + """decode token ids into texts.""" + outputs = [] + for beam_tokens, beam_len in zip(tokens_batch, sequence_length): + for tokens, _len in zip(beam_tokens, beam_len): + output = self.tokenizer.decode(tokens, _len) + output = output.encode('utf8') + outputs.append(output) + return outputs diff --git a/triton_models/postprocessing/1/tokenizer/config.json b/triton_models/postprocessing/1/tokenizer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..84235b8a1a9618cc0ac265caf61ea4088780e3b1 --- /dev/null +++ b/triton_models/postprocessing/1/tokenizer/config.json @@ -0,0 +1,37 @@ +{ + "_name_or_path": "/root/psy/internlm2-7b/work_dirs/internlm2_chat_7b_qlora_oasst1_512_e3_copy/hf_2/merge", + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "eager", + "auto_map": { + "AutoConfig": "configuration_internlm.InternLMConfig", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM" + }, + "bias": false, + "bos_token_id": 1, + "eos_token_id": 2, + "fp16": true, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "internlm", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 2, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.37.2", + "use_cache": false, + "vocab_size": 92544 +} diff --git a/triton_models/postprocessing/1/tokenizer/configuration_internlm.py b/triton_models/postprocessing/1/tokenizer/configuration_internlm.py new file mode 100644 index 0000000000000000000000000000000000000000..4d013582feaa1f9970a4256c4a0f77000fa645de --- /dev/null +++ b/triton_models/postprocessing/1/tokenizer/configuration_internlm.py @@ -0,0 +1,164 @@ +# coding=utf-8 +# Copyright (c) InternLM. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" InternLM model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class InternLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate + an InternLM model according to the specified arguments, defining the model architecture. 
Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 103168):
+            Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`InternLMModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+ tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + + ```python + >>> from transformers import InternLMModel, InternLMConfig + + >>> # Initializing a InternLM internlm-7b style configuration + >>> configuration = InternLMConfig() + + >>> # Initializing a model from the internlm-7b style configuration + >>> model = InternLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "internlm" + _auto_class = "AutoConfig" + + def __init__( # pylint: disable=W0102 + self, + vocab_size=103168, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + bias=True, + rope_theta=10000, + rope_scaling=None, + attn_implementation="eager", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.bias = bias + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + + self.attn_implementation = attn_implementation + if self.attn_implementation is None: + self.attn_implementation = "eager" + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
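+
+        For example, `{"type": "dynamic", "factor": 2.0}` (the value shipped in the
+        accompanying config.json) passes this check, as would
+        `{"type": "linear", "factor": 4.0}`. An unrecognised type such as `"ntk"`, or a
+        non-float factor, raises a `ValueError`.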
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}") diff --git a/triton_models/postprocessing/1/tokenizer/generation_config.json b/triton_models/postprocessing/1/tokenizer/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5efeadd3bf2caa4462a3be79d580690f410668 --- /dev/null +++ b/triton_models/postprocessing/1/tokenizer/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2, + "transformers_version": "4.37.2" +} diff --git a/triton_models/postprocessing/1/tokenizer/modeling_internlm2.py b/triton_models/postprocessing/1/tokenizer/modeling_internlm2.py new file mode 100644 index 0000000000000000000000000000000000000000..39d6f71d2933385988ec05f845d3f6386c97f74b --- /dev/null +++ b/triton_models/postprocessing/1/tokenizer/modeling_internlm2.py @@ -0,0 +1,1385 @@ +# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on transformers/src/transformers/models/llama/modeling_llama.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch InternLM2 model.""" +import math +import queue +import threading +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from einops import rearrange +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) + +try: + from transformers.generation.streamers import BaseStreamer +except: # noqa # pylint: disable=bare-except + BaseStreamer = None + +from .configuration_internlm import InternLMConfig as InternLM2Config + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "InternLM2Config" + +flash_attn_func, flash_attn_varlen_func = None, None +pad_input, index_first_axis, unpad_input = None, None, None +def _import_flash_attn(): + global flash_attn_func, flash_attn_varlen_func + global pad_input, index_first_axis, unpad_input + try: + from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func + from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input + flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func + pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input + except ImportError: + raise ImportError("flash_attn is not installed.") + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2 +class InternLM2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + InternLM2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2 +class InternLM2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with Dynamic NTK scaling. + Credits to the Reddit users /u/bloc97 and /u/emozilla. + """ + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors.""" + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class InternLM2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x)) + + return down_proj + + +# Copied from 
transformers.model.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Modified from transformers.model.llama.modeling_llama.LlamaAttention +class InternLM2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: InternLM2Config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.wqkv = nn.Linear( + self.hidden_size, + (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, + bias=config.bias, + ) + + self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = InternLM2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "dynamic": + self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + elif scaling_type == "linear": + self.rotary_emb = InternLM2LinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + else: + raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.") + return self.rotary_emb + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
" + "Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + "b q (h gs d) -> b q h gs d", + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d") + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Modified from transformers.model.llama.modeling_llama.InternLM2FlashAttention2 +class InternLM2FlashAttention2(InternLM2Attention): + """ + InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # InternLM2FlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + "b q (h gs d) -> b q h gs d", + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d") + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len + ) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) + """ + # Contains at least one padding token in the sequence + causal = self.is_causal and query_length != 1 + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
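+            # With left padding, the query tokens sit in the last `query_length`
+            # positions of the sequence, so keeping only the last `query_length`
+            # columns of the (batch_size, kv_seq_len) mask aligns it with the
+            # query states passed to `unpad_input` below.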
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q.to(torch.int64), + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + +INTERNLM2_ATTENTION_CLASSES = { + "eager": InternLM2Attention, + "flash_attention_2": InternLM2FlashAttention2, +} + +# Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer +class InternLM2DecoderLayer(nn.Module): + def __init__(self, config: InternLM2Config): + super().__init__() + self.hidden_size = config.hidden_size + + self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config) + + self.feed_forward = InternLM2MLP(config) + self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + + hidden_states = self.attention_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.ffn_norm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +InternLM2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`InternLM2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2 +@add_start_docstrings( + "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.", + InternLM2_START_DOCSTRING, +) +class InternLM2PreTrainedModel(PreTrainedModel): + config_class = InternLM2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["InternLM2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +InternLM2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or + when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. 
+ + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Modified from transformers.model.llama.modeling_llama.LlamaModel +@add_start_docstrings( + "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.", + InternLM2_START_DOCSTRING, +) +class InternLM2Model(InternLM2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`InternLM2DecoderLayer`] + + Args: + config: InternLM2Config + """ + + _auto_class = "AutoModel" + + def __init__(self, config: InternLM2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + + self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + + self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.tok_embeddings + + def set_input_embeddings(self, value): + self.tok_embeddings = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.attn_implementation == "flash_attention_2": + _import_flash_attn() + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, 
dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.tok_embeddings(input_ids) + + if self.config.attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +# Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM +class InternLM2ForCausalLM(InternLM2PreTrainedModel): + _auto_class = "AutoModelForCausalLM" + + _tied_weights_keys = ["output.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = InternLM2Model(config) + self.vocab_size = config.vocab_size + self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + def get_output_embeddings(self): + return self.output + + def set_output_embeddings(self, new_embeddings): + self.output = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + 
@add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, InternLM2ForCausalLM + + >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.output(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + past_length = 
past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""): + prompt = "" + if meta_instruction: + prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""" + else: + prompt += "" + for record in history: + prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n""" + prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""" + return tokenizer([prompt], return_tensors="pt") + + @torch.no_grad() + def chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + streamer: Optional[BaseStreamer] = None, + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n" + "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). 
It is designed to be helpful, honest, and harmless.\n" + "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.", + **kwargs, + ): + inputs = self.build_inputs(tokenizer, query, history, meta_instruction) + inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)} + # also add end-of-assistant token in eos token id to avoid unnecessary generation + eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]] + outputs = self.generate( + **inputs, + streamer=streamer, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + eos_token_id=eos_token_id, + **kwargs, + ) + outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :] + response = tokenizer.decode(outputs, skip_special_tokens=True) + response = response.split("<|im_end|>")[0] + history = history + [(query, response)] + return response, history + + @torch.no_grad() + def stream_chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + **kwargs, + ): + """ + Return a generator in format: (response, history) + Eg. + ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) + ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) + """ + if BaseStreamer is None: + raise ModuleNotFoundError( + "The version of `transformers` is too low. Please make sure " + "that you have installed `transformers>=4.28.0`." + ) + + response_queue = queue.Queue(maxsize=20) + + class ChatStreamer(BaseStreamer): + def __init__(self, tokenizer) -> None: + super().__init__() + self.tokenizer = tokenizer + self.queue = response_queue + self.query = query + self.history = history + self.response = "" + self.received_inputs = False + self.queue.put((self.response, history + [(self.query, self.response)])) + + def put(self, value): + if len(value.shape) > 1 and value.shape[0] > 1: + raise ValueError("ChatStreamer only supports batch size 1") + elif len(value.shape) > 1: + value = value[0] + + if not self.received_inputs: + # The first received value is input_ids, ignore here + self.received_inputs = True + return + + token = self.tokenizer.decode([value[-1]], skip_special_tokens=True) + if token.strip() != "<|im_end|>": + self.response = self.response + token + history = self.history + [(self.query, self.response)] + self.queue.put((self.response, history)) + + def end(self): + self.queue.put(None) + + def stream_producer(): + return self.chat( + tokenizer=tokenizer, + query=query, + streamer=ChatStreamer(tokenizer=tokenizer), + history=history, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + **kwargs, + ) + + def consumer(): + producer = threading.Thread(target=stream_producer) + producer.start() + while True: + res = response_queue.get() + if res is None: + return + yield res + + return consumer() + + +# Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2 +@add_start_docstrings( + """ + The InternLM2 Model transformer with a sequence classification head on top (linear layer). + + [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, + as other causal models (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. 
If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + InternLM2_START_DOCSTRING, +) +class InternLM2ForSequenceClassification(InternLM2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = InternLM2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
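+            For example, with `config.num_labels == 3`, a batch of two sequences could pass
+            `labels = torch.tensor([0, 2])`; the loss is then the cross-entropy between these
+            labels and the pooled logits of shape `(2, 3)` taken at each sequence's last
+            non-padding token.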
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to( + logits.device + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/triton_models/postprocessing/1/tokenizer/placeholder b/triton_models/postprocessing/1/tokenizer/placeholder new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/triton_models/postprocessing/1/tokenizer/pytorch_model.bin.index.json b/triton_models/postprocessing/1/tokenizer/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..7d95cf180df4c423e817c55f30f5ce93ac80e220 --- /dev/null +++ b/triton_models/postprocessing/1/tokenizer/pytorch_model.bin.index.json @@ -0,0 +1,554 @@ +{ + "metadata": { + "total_size": 5251801088 + }, + "weight_map": { + "model.layers.0.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + 
"model.layers.0.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.10.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + 
"model.layers.11.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.qweight": 
"pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.qzeros": 
"pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + 
"model.layers.18.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.2.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.20.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + 
"model.layers.20.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.scales": "pytorch_model-00002-of-00003.bin", 
+ "model.layers.23.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.qweight": 
"pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.27.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.qweight": 
"pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.3.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.30.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wo.qzeros": 
"pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.4.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.scales": 
"pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + 
"model.layers.7.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.norm.weight": "pytorch_model-00003-of-00003.bin", + "model.tok_embeddings.weight": 
"pytorch_model-00001-of-00003.bin", + "output.weight": "pytorch_model-00003-of-00003.bin" + } +} diff --git a/triton_models/postprocessing/1/tokenizer/special_tokens_map.json b/triton_models/postprocessing/1/tokenizer/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..492d4b2966a1763442d426d880dbc29f94906e4c --- /dev/null +++ b/triton_models/postprocessing/1/tokenizer/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/triton_models/postprocessing/1/tokenizer/tokenization_internlm.py b/triton_models/postprocessing/1/tokenizer/tokenization_internlm.py new file mode 100644 index 0000000000000000000000000000000000000000..9792349c7fed6fc64476eabdd9dad7a84640c3ee --- /dev/null +++ b/triton_models/postprocessing/1/tokenizer/tokenization_internlm.py @@ -0,0 +1,240 @@ +# coding=utf-8 +# Copyright (c) InternLM. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for IntermLM.""" +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"} + +PRETRAINED_VOCAB_FILES_MAP = {} + + +class InternLMTokenizer(PreTrainedTokenizer): + """ + Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding. + + Args: + vocab_file (`str`): + Path to the vocabulary file. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + model_input_names = ["input_ids", "attention_mask"] + _auto_class = "AutoTokenizer" + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + decode_with_prefix_space=False, + clean_up_tokenization_spaces=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.decode_with_prefix_space = decode_with_prefix_space + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + self._no_prefix_space_tokens = None + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + """ Initialization""" + + @property + def no_prefix_space_tokens(self): + if self._no_prefix_space_tokens is None: + vocab = self.convert_ids_to_tokens(list(range(self.vocab_size))) + self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")} + return self._no_prefix_space_tokens + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + @property + def bos_token_id(self) -> Optional[int]: + return self.sp_model.bos_id() + + @property + def eos_token_id(self) -> Optional[int]: + return self.sp_model.eos_id() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def _maybe_add_prefix_space(self, tokens, decoded): + if tokens and tokens[0] not in self.no_prefix_space_tokens: + return " " + decoded + else: + return decoded + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + out_string = self.clean_up_tokenization(out_string) + out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string) + return out_string[1:] + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is not None: + output = output + token_ids_1 + + if self.add_eos_token: + output = output + [self.eos_token_id] + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make + use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+ """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] diff --git a/triton_models/postprocessing/1/tokenizer/tokenizer.model b/triton_models/postprocessing/1/tokenizer/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408 --- /dev/null +++ b/triton_models/postprocessing/1/tokenizer/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b +size 1477754 diff --git a/triton_models/postprocessing/1/tokenizer/tokenizer.py b/triton_models/postprocessing/1/tokenizer/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..db936a5501cb07d33d56083656dbd734ba7431bf --- /dev/null +++ b/triton_models/postprocessing/1/tokenizer/tokenizer.py @@ -0,0 +1,400 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os +import os.path as osp +from collections import deque +from typing import List, Optional, Sequence, Union + +import torch + +from lmdeploy.utils import get_logger + +# this file will be copied to triton server, make sure all +# importing are starting from the package root lmdeploy + + +class SentencePieceTokenizer: + """Tokenizer of sentencepiece. + + Args: + model_file (str): the path of the tokenizer model + """ + + def __init__(self, model_file: str): + from sentencepiece import SentencePieceProcessor + self.model = SentencePieceProcessor(model_file=model_file) + self._prefix_space_tokens = None + # for stop words + self._maybe_decode_bytes: bool = None + # TODO maybe lack a constant.py + self._indexes_tokens_deque = deque(maxlen=10) + self.max_indexes_num = 5 + self.logger = get_logger('lmdeploy') + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size() + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_id() + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_id() + + @property + def prefix_space_tokens(self): + """tokens without prefix space.""" + if self._prefix_space_tokens is None: + vocab = self.model.IdToPiece(list(range(self.vocab_size))) + self._prefix_space_tokens = { + i + for i, tok in enumerate(vocab) if tok.startswith('▁') + } + return self._prefix_space_tokens + + def _maybe_add_prefix_space(self, tokens, decoded): + """maybe add prefix space for incremental decoding.""" + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + def indexes_containing_token(self, token: str): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + # traversing vocab is time consuming, can not be accelerated with + # multi threads (computation) or multi process (can't pickle tokenizer) + # so, we maintain latest 10 stop words and return directly if matched + for _token, _indexes in self._indexes_tokens_deque: + if token == _token: + return _indexes + if token == ' ': # ' ' is special + token = '▁' + vocab = self.model.IdToPiece(list(range(self.vocab_size))) + indexes = [i for i, voc in enumerate(vocab) if token in voc] + if len(indexes) > self.max_indexes_num: + indexes = self.encode(token, add_bos=False)[-1:] + self.logger.warning( + f'There are too many(>{self.max_indexes_num}) possible ' + f'indexes may decoding 
{token}, we will use {indexes} only') + self._indexes_tokens_deque.append((token, indexes)) + return indexes + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + return self.model.Encode(s, add_bos=add_bos, **kwargs) + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + if isinstance(t, torch.Tensor): + t = t.tolist() + t = t[offset:] + out_string = self.model.Decode(t) + if offset: + out_string = self._maybe_add_prefix_space(t, out_string) + return out_string + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + import addict + add_bos = False + add_eos = False + + input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos) + return addict.Addict(input_ids=input_ids) + + +class HuggingFaceTokenizer: + """Tokenizer of sentencepiece. + + Args: + model_dir (str): the directory of the tokenizer model + """ + + def __init__(self, model_dir: str): + from transformers import AutoTokenizer + model_file = osp.join(model_dir, 'tokenizer.model') + backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json') + model_file_exists = osp.exists(model_file) + self.logger = get_logger('lmdeploy') + if not osp.exists(backend_tokenizer_file) and model_file_exists: + self.logger.warning( + 'Can not find tokenizer.json. ' + 'It may take long time to initialize the tokenizer.') + self.model = AutoTokenizer.from_pretrained(model_dir, + trust_remote_code=True) + self._prefix_space_tokens = None + # save tokenizer.json to reuse + if not osp.exists(backend_tokenizer_file) and model_file_exists: + if hasattr(self.model, 'backend_tokenizer'): + if os.access(model_dir, os.W_OK): + self.model.backend_tokenizer.save(backend_tokenizer_file) + + if self.model.eos_token_id is None: + generation_config_file = osp.join(model_dir, + 'generation_config.json') + if osp.exists(generation_config_file): + with open(generation_config_file, 'r') as f: + cfg = json.load(f) + self.model.eos_token_id = cfg['eos_token_id'] + elif hasattr(self.model, 'eod_id'): # Qwen remote + self.model.eos_token_id = self.model.eod_id + + # for stop words + self._vocab_size_with_added: int = None + self._maybe_decode_bytes: bool = None + # TODO maybe lack a constant.py + self._indexes_tokens_deque = deque(maxlen=10) + self.max_indexes_num = 5 + self.token2id = {} + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size + + @property + def vocab_size_with_added(self): + """vocabulary size with added vocab.""" + if self._vocab_size_with_added is not None: + return self._vocab_size_with_added + self._vocab_size_with_added = len(self.model.get_vocab()) + return self._vocab_size_with_added + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_token_id + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_token_id + + @property + def prefix_space_tokens(self): + """tokens without prefix space.""" + if self._prefix_space_tokens is None: + vocab = self.model.convert_ids_to_tokens( + list(range(self.vocab_size))) + self._prefix_space_tokens = { + i + for i, tok in enumerate(vocab) + if tok.startswith('▁' if 
isinstance(tok, str) else b' ') + } + return self._prefix_space_tokens + + def _maybe_add_prefix_space(self, tokens: List[int], decoded: str): + """maybe add prefix space for incremental decoding.""" + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + @property + def maybe_decode_bytes(self): + """Check if self.model.convert_ids_to_tokens return not a str value.""" + if self._maybe_decode_bytes is None: + self._maybe_decode_bytes = False + vocab = self.model.convert_ids_to_tokens( + list(range(self.vocab_size))) + for tok in vocab: + if not isinstance(tok, str): + self._maybe_decode_bytes = True + break + return self._maybe_decode_bytes + + def indexes_containing_token(self, token: str): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + # traversing vocab is time consuming, can not be accelerated with + # multi threads (computation) or multi process (can't pickle tokenizer) + # so, we maintain latest 10 stop words and return directly if matched + for _token, _indexes in self._indexes_tokens_deque: + if token == _token: + return _indexes + + if self.token2id == {}: + # decode is slower than convert_ids_to_tokens + if self.maybe_decode_bytes: + self.token2id = { + self.model.decode(i): i + for i in range(self.vocab_size) + } + else: + self.token2id = { + self.model.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } + if token == ' ': # ' ' is special + token = '▁' + indexes = [i for _token, i in self.token2id.items() if token in _token] + if len(indexes) > self.max_indexes_num: + indexes = self.encode(token, add_bos=False)[-1:] + self.logger.warning( + f'There are too many(>{self.max_indexes_num}) possible ' + f'indexes may decoding {token}, we will use {indexes} only') + # there might be token id that exceeds self.vocab_size + if len(indexes) == 0: + indexes = self.encode(token, False) + if len(indexes) != 1: + self.logger.warning( + f'The token {token}, its length of indexes {indexes} is ' + 'not 1. Currently, it can not be used as stop words') + indexes = [] + self._indexes_tokens_deque.append((token, indexes)) + return indexes + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + encoded = self.model.encode(s, **kwargs) + if not add_bos: + # in the middle of a session + if len(encoded) and encoded[0] == self.bos_token_id: + encoded = encoded[1:] + return encoded + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + skip_special_tokens = True + t = t[offset:] + out_string = self.model.decode(t, + skip_special_tokens=skip_special_tokens) + if offset: + out_string = self._maybe_add_prefix_space(t, out_string) + return out_string + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + add_special_tokens = False + return self.model(s, add_special_tokens=add_special_tokens) + + +class Tokenizer: + """Tokenize prompts or de-tokenize tokens into texts. 
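The `offset` argument threaded through the `decode` methods above is what makes incremental detokenization work: only the tokens past `offset` are decoded, and the prefix-space fix restores the leading space that SentencePiece drops when a chunk beginning with a `▁` piece is decoded on its own. A minimal sketch of the calling pattern, with an illustrative tokenizer path and import that are not part of this diff:

from tokenizer.tokenizer import Tokenizer  # import path is illustrative

tok = Tokenizer('triton_models/postprocessing/1/tokenizer')   # assumed model folder
ids = tok.encode('Hello world', add_bos=False)

streamed, offset = '', 0
for i in range(1, len(ids) + 1):
    # only the tokens past `offset` are decoded; already-emitted text is never re-sent
    streamed += tok.decode(ids[:i], offset=offset)
    offset = i
print(streamed)   # for typical prompts this matches tok.decode(ids)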
+ + Args: + model_file (str): the path of the tokenizer model + """ + + def __init__(self, model_file: str): + if model_file.endswith('.model'): + model_folder = osp.split(model_file)[0] + else: + model_folder = model_file + model_file = osp.join(model_folder, 'tokenizer.model') + tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json') + + model_file_exists = osp.exists(model_file) + config_exists = osp.exists(tokenizer_config_file) + use_hf_model = config_exists or not model_file_exists + self.logger = get_logger('lmdeploy') + if not use_hf_model: + self.model = SentencePieceTokenizer(model_file) + else: + self.model = HuggingFaceTokenizer(model_folder) + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_token_id + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_token_id + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + return self.model.encode(s, add_bos, **kwargs) + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + return self.model.decode(t, offset) + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + return self.model(s) + + def indexes_containing_token(self, token): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + encoded = self.encode(token, add_bos=False) + if len(encoded) > 1: + self.logger.warning( + f'The token {token}, its length of indexes {encoded} is over ' + 'than 1. 
Currently, it can not be used as stop words') + return [] + return self.model.indexes_containing_token(token) diff --git a/triton_models/postprocessing/1/tokenizer/tokenizer_config.json b/triton_models/postprocessing/1/tokenizer/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f133449013be570f08fdf7c70f1a2c8ccb4724da --- /dev/null +++ b/triton_models/postprocessing/1/tokenizer/tokenizer_config.json @@ -0,0 +1,90 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92538": { + "content": "<|plugin|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92539": { + "content": "<|interpreter|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92540": { + "content": "<|action_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92541": { + "content": "<|action_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92542": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92543": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "auto_map": { + "AutoTokenizer": [ + "tokenization_internlm.InternLMTokenizer", + null + ] + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "tokenizer_class": "InternLMTokenizer", + "unk_token": "" +} diff --git a/triton_models/postprocessing/config.pbtxt b/triton_models/postprocessing/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..a4c3fd1041dcd03dc5c18b3fc28533cb82ac5653 --- /dev/null +++ b/triton_models/postprocessing/config.pbtxt @@ -0,0 +1,36 @@ +name: "postprocessing" +backend: "python" +max_batch_size: 1 +input [ + { + name: "TOKENS_BATCH" + data_type: TYPE_UINT32 + dims: [ -1, -1 ] + }, + { + name: "sequence_length" + data_type: TYPE_UINT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_STRING + dims: [ -1, -1 ] + } +] + +instance_group [ + { + count: 16 + kind: KIND_CPU + } +] + +parameters { + key: "tokenizer_path" + value: { + string_value: "tokenizer/tokenizer.model" + } +} diff --git a/triton_models/preprocessing/1/__pycache__/model.cpython-310.pyc b/triton_models/preprocessing/1/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..447bea773ddcc3daff21ef636ce8437c6632fed8 Binary files /dev/null and b/triton_models/preprocessing/1/__pycache__/model.cpython-310.pyc differ diff --git a/triton_models/preprocessing/1/model.py 
b/triton_models/preprocessing/1/model.py new file mode 100644 index 0000000000000000000000000000000000000000..7e659fbae01737bd0a83980faf0e1eff9e607c3f --- /dev/null +++ b/triton_models/preprocessing/1/model.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +from pathlib import Path + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.nn.utils.rnn import pad_sequence + +# This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served +# by triton inference server, it has to be converted first by running +# `python lmdeploy/serve/turbomind/deploy.py`. Then +# `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py` +from .tokenizer.tokenizer import Tokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. + + Every Python model that is created must have "TritonPythonModel" as the + class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device + ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + self.model_config = model_config = json.loads(args['model_config']) + + # Parse model output configs and convert Triton types to numpy types + input_names = ['INPUT_ID', 'REQUEST_INPUT_LEN'] + for input_name in input_names: + setattr( + self, + input_name.lower() + '_dtype', + pb_utils.triton_string_to_numpy( + pb_utils.get_output_config_by_name( + model_config, input_name)['data_type'])) + + cur_folder = Path(__file__).parent + self.tokenizer = Tokenizer( + osp.join( + cur_folder, self.model_config['parameters']['tokenizer_path'] + ['string_value'])) + self.start_id = self.tokenizer.bos_token_id + self.end_id = self.tokenizer.eos_token_id + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for idx, request in enumerate(requests): + # Get input tensors + query = pb_utils.get_input_tensor_by_name(request, + 'QUERY').as_numpy() + + # Preprocessing input data. + input_id, request_input_len = self._create_request(query) + + # Create output tensors. 
You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + input_id_tensor = pb_utils.Tensor( + 'INPUT_ID', + np.array(input_id).astype(self.input_id_dtype)) + request_input_len_tensor = pb_utils.Tensor( + 'REQUEST_INPUT_LEN', + np.array(request_input_len).astype( + self.request_input_len_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[input_id_tensor, request_input_len_tensor]) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + + Implementing `finalize` function is optional. This function allows the + model to perform any necessary clean ups before exit. + """ + print('Cleaning up...') + + def _create_request(self, query): + """Tokenize prompts and return the token ids and their length. + + Args: + query (List[str]): a list of prompt + Returns: + tuple: token ids and their length + """ + start_ids = [] + for s in query: + _s = s[0].decode() + if _s == '': + start_id = [self.start_id + ] if self.start_id is not None else [-1] + elif _s == '': + start_id = [self.end_id] if self.end_id is not None else [-1] + else: + start_id = self.tokenizer.encode(_s) + start_ids.append(torch.IntTensor(start_id)) + + start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids]) + start_ids = pad_sequence(start_ids, + batch_first=True, + padding_value=self.end_id) + return start_ids, start_lengths diff --git a/triton_models/preprocessing/1/tokenizer/config.json b/triton_models/preprocessing/1/tokenizer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..84235b8a1a9618cc0ac265caf61ea4088780e3b1 --- /dev/null +++ b/triton_models/preprocessing/1/tokenizer/config.json @@ -0,0 +1,37 @@ +{ + "_name_or_path": "/root/psy/internlm2-7b/work_dirs/internlm2_chat_7b_qlora_oasst1_512_e3_copy/hf_2/merge", + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "eager", + "auto_map": { + "AutoConfig": "configuration_internlm.InternLMConfig", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM" + }, + "bias": false, + "bos_token_id": 1, + "eos_token_id": 2, + "fp16": true, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "internlm", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 2, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.37.2", + "use_cache": false, + "vocab_size": 92544 +} diff --git a/triton_models/preprocessing/1/tokenizer/configuration_internlm.py b/triton_models/preprocessing/1/tokenizer/configuration_internlm.py new file mode 100644 index 0000000000000000000000000000000000000000..4d013582feaa1f9970a4256c4a0f77000fa645de --- /dev/null +++ b/triton_models/preprocessing/1/tokenizer/configuration_internlm.py @@ 
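`_create_request` above right-pads every prompt's token ids with the eos id so the batch becomes rectangular, while the true lengths travel separately in `REQUEST_INPUT_LEN`. A small shape sketch with made-up token ids:

import torch
from torch.nn.utils.rnn import pad_sequence

end_id = 2                                                           # eos used as the padding value, as above
start_ids = [torch.IntTensor([5, 6, 7]), torch.IntTensor([8, 9])]    # toy token ids, not from the real tokenizer

start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])   # tensor([[3], [2]])
start_ids = pad_sequence(start_ids, batch_first=True, padding_value=end_id)
print(start_ids)            # tensor([[5, 6, 7], [8, 9, 2]], dtype=torch.int32)
print(start_lengths.shape)  # torch.Size([2, 1])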
-0,0 +1,164 @@ +# coding=utf-8 +# Copyright (c) InternLM. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" InternLM model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class InternLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate + an InternLM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the InternLM-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`InternLMModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + + ```python + >>> from transformers import InternLMModel, InternLMConfig + + >>> # Initializing a InternLM internlm-7b style configuration + >>> configuration = InternLMConfig() + + >>> # Initializing a model from the internlm-7b style configuration + >>> model = InternLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "internlm" + _auto_class = "AutoConfig" + + def __init__( # pylint: disable=W0102 + self, + vocab_size=103168, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + bias=True, + rope_theta=10000, + rope_scaling=None, + attn_implementation="eager", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.bias = bias + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + + self.attn_implementation = attn_implementation + if self.attn_implementation is None: + self.attn_implementation = "eager" + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}") diff --git a/triton_models/preprocessing/1/tokenizer/generation_config.json b/triton_models/preprocessing/1/tokenizer/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5efeadd3bf2caa4462a3be79d580690f410668 --- /dev/null +++ b/triton_models/preprocessing/1/tokenizer/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2, + "transformers_version": "4.37.2" +} diff --git a/triton_models/preprocessing/1/tokenizer/modeling_internlm2.py b/triton_models/preprocessing/1/tokenizer/modeling_internlm2.py new file mode 100644 index 0000000000000000000000000000000000000000..39d6f71d2933385988ec05f845d3f6386c97f74b --- /dev/null +++ b/triton_models/preprocessing/1/tokenizer/modeling_internlm2.py @@ -0,0 +1,1385 @@ +# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on transformers/src/transformers/models/llama/modeling_llama.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch InternLM2 model.""" +import math +import queue +import threading +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from einops import rearrange +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) + +try: + from transformers.generation.streamers import BaseStreamer +except: # noqa # pylint: disable=bare-except + BaseStreamer = None + +from .configuration_internlm import InternLMConfig as InternLM2Config + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "InternLM2Config" + +flash_attn_func, flash_attn_varlen_func = None, None +pad_input, index_first_axis, unpad_input = None, None, None +def _import_flash_attn(): + global flash_attn_func, flash_attn_varlen_func + global pad_input, index_first_axis, unpad_input + try: + from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func + from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input + flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func + pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input + except ImportError: + raise ImportError("flash_attn is not installed.") + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2 +class InternLM2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + InternLM2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2 +class InternLM2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with Dynamic NTK scaling. + Credits to the Reddit users /u/bloc97 and /u/emozilla. + """ + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors.""" + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class InternLM2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x)) + + return down_proj + + +# Copied from 
transformers.model.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Modified from transformers.model.llama.modeling_llama.LlamaAttention +class InternLM2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: InternLM2Config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.wqkv = nn.Linear( + self.hidden_size, + (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, + bias=config.bias, + ) + + self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = InternLM2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "dynamic": + self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + elif scaling_type == "linear": + self.rotary_emb = InternLM2LinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + else: + raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.") + return self.rotary_emb + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
" + "Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + "b q (h gs d) -> b q h gs d", + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d") + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Modified from transformers.model.llama.modeling_llama.InternLM2FlashAttention2 +class InternLM2FlashAttention2(InternLM2Attention): + """ + InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # InternLM2FlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + "b q (h gs d) -> b q h gs d", + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d") + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len + ) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) + """ + # Contains at least one padding token in the sequence + causal = self.is_causal and query_length != 1 + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q.to(torch.int64), + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + +INTERNLM2_ATTENTION_CLASSES = { + "eager": InternLM2Attention, + "flash_attention_2": InternLM2FlashAttention2, +} + +# Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer +class InternLM2DecoderLayer(nn.Module): + def __init__(self, config: InternLM2Config): + super().__init__() + self.hidden_size = config.hidden_size + + self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config) + + self.feed_forward = InternLM2MLP(config) + self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + + hidden_states = self.attention_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.ffn_norm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +InternLM2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
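Stripped of caching, masking and attention weights, the decoder layer above is a standard pre-norm residual block. A minimal restatement of that data flow, with identity placeholders standing in for the real attention and MLP modules and LayerNorm standing in for RMSNorm:

import torch
from torch import nn

def decoder_layer_flow(h, attention, feed_forward, attention_norm, ffn_norm):
    # mirrors InternLM2DecoderLayer.forward: norm -> sublayer -> residual, twice
    h = h + attention(attention_norm(h))
    h = h + feed_forward(ffn_norm(h))
    return h

dim = 16
h = torch.randn(2, 5, dim)
out = decoder_layer_flow(h, nn.Identity(), nn.Identity(),
                         nn.LayerNorm(dim), nn.LayerNorm(dim))
print(out.shape)   # torch.Size([2, 5, 16])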
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`InternLM2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2 +@add_start_docstrings( + "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.", + InternLM2_START_DOCSTRING, +) +class InternLM2PreTrainedModel(PreTrainedModel): + config_class = InternLM2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["InternLM2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +InternLM2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or + when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. 
+ + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Modified from transformers.model.llama.modeling_llama.LlamaModel +@add_start_docstrings( + "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.", + InternLM2_START_DOCSTRING, +) +class InternLM2Model(InternLM2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`InternLM2DecoderLayer`] + + Args: + config: InternLM2Config + """ + + _auto_class = "AutoModel" + + def __init__(self, config: InternLM2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + + self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + + self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.tok_embeddings + + def set_input_embeddings(self, value): + self.tok_embeddings = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.attn_implementation == "flash_attention_2": + _import_flash_attn() + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, 
dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.tok_embeddings(input_ids) + + if self.config.attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +# Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM +class InternLM2ForCausalLM(InternLM2PreTrainedModel): + _auto_class = "AutoModelForCausalLM" + + _tied_weights_keys = ["output.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = InternLM2Model(config) + self.vocab_size = config.vocab_size + self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + def get_output_embeddings(self): + return self.output + + def set_output_embeddings(self, new_embeddings): + self.output = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + 
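The `past_key_values` / `use_cache` contract documented in the inputs docstring above is easiest to see in a tiny two-step decoding sketch. The snippet below is illustrative only and is not part of this diff; the checkpoint path is a placeholder, and it assumes a converted InternLM2 checkpoint loaded with `trust_remote_code=True` so that the custom `InternLM2ForCausalLM` class from this file is used.

```python
# Illustrative sketch (not part of the patch): incremental decoding with the KV cache.
# "PATH_TO_CHECKPOINT" is a placeholder for any converted InternLM2 checkpoint.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("PATH_TO_CHECKPOINT", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("PATH_TO_CHECKPOINT", trust_remote_code=True).eval()

inputs = tokenizer("Hello", return_tensors="pt")
with torch.no_grad():
    # First pass: the full prompt is fed and the per-layer key/value cache is returned.
    out = model(**inputs, use_cache=True)
    past = out.past_key_values
    next_id = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
    # Second pass: only the newly chosen token is fed; earlier positions come from the cache.
    out = model(input_ids=next_id, past_key_values=past, use_cache=True)
```

The second call passes only the un-cached suffix of `input_ids`, which is exactly the case `prepare_inputs_for_generation` below handles when it trims the prompt against `past_length`.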
@add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, InternLM2ForCausalLM + + >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.output(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + past_length = 
past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""): + prompt = "" + if meta_instruction: + prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""" + else: + prompt += "" + for record in history: + prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n""" + prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""" + return tokenizer([prompt], return_tensors="pt") + + @torch.no_grad() + def chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + streamer: Optional[BaseStreamer] = None, + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n" + "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). 
It is designed to be helpful, honest, and harmless.\n" + "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.", + **kwargs, + ): + inputs = self.build_inputs(tokenizer, query, history, meta_instruction) + inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)} + # also add end-of-assistant token in eos token id to avoid unnecessary generation + eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]] + outputs = self.generate( + **inputs, + streamer=streamer, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + eos_token_id=eos_token_id, + **kwargs, + ) + outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :] + response = tokenizer.decode(outputs, skip_special_tokens=True) + response = response.split("<|im_end|>")[0] + history = history + [(query, response)] + return response, history + + @torch.no_grad() + def stream_chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + **kwargs, + ): + """ + Return a generator in format: (response, history) + Eg. + ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) + ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) + """ + if BaseStreamer is None: + raise ModuleNotFoundError( + "The version of `transformers` is too low. Please make sure " + "that you have installed `transformers>=4.28.0`." + ) + + response_queue = queue.Queue(maxsize=20) + + class ChatStreamer(BaseStreamer): + def __init__(self, tokenizer) -> None: + super().__init__() + self.tokenizer = tokenizer + self.queue = response_queue + self.query = query + self.history = history + self.response = "" + self.received_inputs = False + self.queue.put((self.response, history + [(self.query, self.response)])) + + def put(self, value): + if len(value.shape) > 1 and value.shape[0] > 1: + raise ValueError("ChatStreamer only supports batch size 1") + elif len(value.shape) > 1: + value = value[0] + + if not self.received_inputs: + # The first received value is input_ids, ignore here + self.received_inputs = True + return + + token = self.tokenizer.decode([value[-1]], skip_special_tokens=True) + if token.strip() != "<|im_end|>": + self.response = self.response + token + history = self.history + [(self.query, self.response)] + self.queue.put((self.response, history)) + + def end(self): + self.queue.put(None) + + def stream_producer(): + return self.chat( + tokenizer=tokenizer, + query=query, + streamer=ChatStreamer(tokenizer=tokenizer), + history=history, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + **kwargs, + ) + + def consumer(): + producer = threading.Thread(target=stream_producer) + producer.start() + while True: + res = response_queue.get() + if res is None: + return + yield res + + return consumer() + + +# Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2 +@add_start_docstrings( + """ + The InternLM2 Model transformer with a sequence classification head on top (linear layer). + + [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, + as other causal models (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. 
If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + InternLM2_START_DOCSTRING, +) +class InternLM2ForSequenceClassification(InternLM2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = InternLM2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to( + logits.device + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/triton_models/preprocessing/1/tokenizer/placeholder b/triton_models/preprocessing/1/tokenizer/placeholder new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/triton_models/preprocessing/1/tokenizer/pytorch_model.bin.index.json b/triton_models/preprocessing/1/tokenizer/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..7d95cf180df4c423e817c55f30f5ce93ac80e220 --- /dev/null +++ b/triton_models/preprocessing/1/tokenizer/pytorch_model.bin.index.json @@ -0,0 +1,554 @@ +{ + "metadata": { + "total_size": 5251801088 + }, + "weight_map": { + "model.layers.0.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention_norm.weight": 
"pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.10.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.qweight": 
"pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + 
"model.layers.13.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + 
"model.layers.16.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.scales": 
"pytorch_model-00002-of-00003.bin", + "model.layers.18.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.2.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.20.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.scales": 
"pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + 
"model.layers.23.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.qweight": 
"pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.27.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.qweight": 
"pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.3.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.30.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wo.qzeros": 
"pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.4.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.scales": 
"pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + 
"model.layers.7.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.norm.weight": "pytorch_model-00003-of-00003.bin", + "model.tok_embeddings.weight": 
"pytorch_model-00001-of-00003.bin", + "output.weight": "pytorch_model-00003-of-00003.bin" + } +} diff --git a/triton_models/preprocessing/1/tokenizer/special_tokens_map.json b/triton_models/preprocessing/1/tokenizer/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..492d4b2966a1763442d426d880dbc29f94906e4c --- /dev/null +++ b/triton_models/preprocessing/1/tokenizer/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/triton_models/preprocessing/1/tokenizer/tokenization_internlm.py b/triton_models/preprocessing/1/tokenizer/tokenization_internlm.py new file mode 100644 index 0000000000000000000000000000000000000000..9792349c7fed6fc64476eabdd9dad7a84640c3ee --- /dev/null +++ b/triton_models/preprocessing/1/tokenizer/tokenization_internlm.py @@ -0,0 +1,240 @@ +# coding=utf-8 +# Copyright (c) InternLM. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for IntermLM.""" +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"} + +PRETRAINED_VOCAB_FILES_MAP = {} + + +class InternLMTokenizer(PreTrainedTokenizer): + """ + Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding. + + Args: + vocab_file (`str`): + Path to the vocabulary file. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + model_input_names = ["input_ids", "attention_mask"] + _auto_class = "AutoTokenizer" + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + decode_with_prefix_space=False, + clean_up_tokenization_spaces=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.decode_with_prefix_space = decode_with_prefix_space + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + self._no_prefix_space_tokens = None + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + """ Initialization""" + + @property + def no_prefix_space_tokens(self): + if self._no_prefix_space_tokens is None: + vocab = self.convert_ids_to_tokens(list(range(self.vocab_size))) + self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")} + return self._no_prefix_space_tokens + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + @property + def bos_token_id(self) -> Optional[int]: + return self.sp_model.bos_id() + + @property + def eos_token_id(self) -> Optional[int]: + return self.sp_model.eos_id() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def _maybe_add_prefix_space(self, tokens, decoded): + if tokens and tokens[0] not in self.no_prefix_space_tokens: + return " " + decoded + else: + return decoded + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + out_string = self.clean_up_tokenization(out_string) + out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string) + return out_string[1:] + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is not None: + output = output + token_ids_1 + + if self.add_eos_token: + output = output + [self.eos_token_id] + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make + use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+ """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] diff --git a/triton_models/preprocessing/1/tokenizer/tokenizer.model b/triton_models/preprocessing/1/tokenizer/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408 --- /dev/null +++ b/triton_models/preprocessing/1/tokenizer/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b +size 1477754 diff --git a/triton_models/preprocessing/1/tokenizer/tokenizer.py b/triton_models/preprocessing/1/tokenizer/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..db936a5501cb07d33d56083656dbd734ba7431bf --- /dev/null +++ b/triton_models/preprocessing/1/tokenizer/tokenizer.py @@ -0,0 +1,400 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os +import os.path as osp +from collections import deque +from typing import List, Optional, Sequence, Union + +import torch + +from lmdeploy.utils import get_logger + +# this file will be copied to triton server, make sure all +# importing are starting from the package root lmdeploy + + +class SentencePieceTokenizer: + """Tokenizer of sentencepiece. + + Args: + model_file (str): the path of the tokenizer model + """ + + def __init__(self, model_file: str): + from sentencepiece import SentencePieceProcessor + self.model = SentencePieceProcessor(model_file=model_file) + self._prefix_space_tokens = None + # for stop words + self._maybe_decode_bytes: bool = None + # TODO maybe lack a constant.py + self._indexes_tokens_deque = deque(maxlen=10) + self.max_indexes_num = 5 + self.logger = get_logger('lmdeploy') + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size() + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_id() + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_id() + + @property + def prefix_space_tokens(self): + """tokens without prefix space.""" + if self._prefix_space_tokens is None: + vocab = self.model.IdToPiece(list(range(self.vocab_size))) + self._prefix_space_tokens = { + i + for i, tok in enumerate(vocab) if tok.startswith('▁') + } + return self._prefix_space_tokens + + def _maybe_add_prefix_space(self, tokens, decoded): + """maybe add prefix space for incremental decoding.""" + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + def indexes_containing_token(self, token: str): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + # traversing vocab is time consuming, can not be accelerated with + # multi threads (computation) or multi process (can't pickle tokenizer) + # so, we maintain latest 10 stop words and return directly if matched + for _token, _indexes in self._indexes_tokens_deque: + if token == _token: + return _indexes + if token == ' ': # ' ' is special + token = '▁' + vocab = self.model.IdToPiece(list(range(self.vocab_size))) + indexes = [i for i, voc in enumerate(vocab) if token in voc] + if len(indexes) > self.max_indexes_num: + indexes = self.encode(token, add_bos=False)[-1:] + self.logger.warning( + f'There are too many(>{self.max_indexes_num}) possible ' + f'indexes may decoding {token}, we 
will use {indexes} only') + self._indexes_tokens_deque.append((token, indexes)) + return indexes + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + return self.model.Encode(s, add_bos=add_bos, **kwargs) + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + if isinstance(t, torch.Tensor): + t = t.tolist() + t = t[offset:] + out_string = self.model.Decode(t) + if offset: + out_string = self._maybe_add_prefix_space(t, out_string) + return out_string + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + import addict + add_bos = False + add_eos = False + + input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos) + return addict.Addict(input_ids=input_ids) + + +class HuggingFaceTokenizer: + """Tokenizer of sentencepiece. + + Args: + model_dir (str): the directory of the tokenizer model + """ + + def __init__(self, model_dir: str): + from transformers import AutoTokenizer + model_file = osp.join(model_dir, 'tokenizer.model') + backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json') + model_file_exists = osp.exists(model_file) + self.logger = get_logger('lmdeploy') + if not osp.exists(backend_tokenizer_file) and model_file_exists: + self.logger.warning( + 'Can not find tokenizer.json. ' + 'It may take long time to initialize the tokenizer.') + self.model = AutoTokenizer.from_pretrained(model_dir, + trust_remote_code=True) + self._prefix_space_tokens = None + # save tokenizer.json to reuse + if not osp.exists(backend_tokenizer_file) and model_file_exists: + if hasattr(self.model, 'backend_tokenizer'): + if os.access(model_dir, os.W_OK): + self.model.backend_tokenizer.save(backend_tokenizer_file) + + if self.model.eos_token_id is None: + generation_config_file = osp.join(model_dir, + 'generation_config.json') + if osp.exists(generation_config_file): + with open(generation_config_file, 'r') as f: + cfg = json.load(f) + self.model.eos_token_id = cfg['eos_token_id'] + elif hasattr(self.model, 'eod_id'): # Qwen remote + self.model.eos_token_id = self.model.eod_id + + # for stop words + self._vocab_size_with_added: int = None + self._maybe_decode_bytes: bool = None + # TODO maybe lack a constant.py + self._indexes_tokens_deque = deque(maxlen=10) + self.max_indexes_num = 5 + self.token2id = {} + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size + + @property + def vocab_size_with_added(self): + """vocabulary size with added vocab.""" + if self._vocab_size_with_added is not None: + return self._vocab_size_with_added + self._vocab_size_with_added = len(self.model.get_vocab()) + return self._vocab_size_with_added + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_token_id + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_token_id + + @property + def prefix_space_tokens(self): + """tokens without prefix space.""" + if self._prefix_space_tokens is None: + vocab = self.model.convert_ids_to_tokens( + list(range(self.vocab_size))) + self._prefix_space_tokens = { + i + for i, tok in enumerate(vocab) + if tok.startswith('▁' if 
isinstance(tok, str) else b' ') + } + return self._prefix_space_tokens + + def _maybe_add_prefix_space(self, tokens: List[int], decoded: str): + """maybe add prefix space for incremental decoding.""" + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + @property + def maybe_decode_bytes(self): + """Check if self.model.convert_ids_to_tokens return not a str value.""" + if self._maybe_decode_bytes is None: + self._maybe_decode_bytes = False + vocab = self.model.convert_ids_to_tokens( + list(range(self.vocab_size))) + for tok in vocab: + if not isinstance(tok, str): + self._maybe_decode_bytes = True + break + return self._maybe_decode_bytes + + def indexes_containing_token(self, token: str): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + # traversing vocab is time consuming, can not be accelerated with + # multi threads (computation) or multi process (can't pickle tokenizer) + # so, we maintain latest 10 stop words and return directly if matched + for _token, _indexes in self._indexes_tokens_deque: + if token == _token: + return _indexes + + if self.token2id == {}: + # decode is slower than convert_ids_to_tokens + if self.maybe_decode_bytes: + self.token2id = { + self.model.decode(i): i + for i in range(self.vocab_size) + } + else: + self.token2id = { + self.model.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } + if token == ' ': # ' ' is special + token = '▁' + indexes = [i for _token, i in self.token2id.items() if token in _token] + if len(indexes) > self.max_indexes_num: + indexes = self.encode(token, add_bos=False)[-1:] + self.logger.warning( + f'There are too many(>{self.max_indexes_num}) possible ' + f'indexes may decoding {token}, we will use {indexes} only') + # there might be token id that exceeds self.vocab_size + if len(indexes) == 0: + indexes = self.encode(token, False) + if len(indexes) != 1: + self.logger.warning( + f'The token {token}, its length of indexes {indexes} is ' + 'not 1. Currently, it can not be used as stop words') + indexes = [] + self._indexes_tokens_deque.append((token, indexes)) + return indexes + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + encoded = self.model.encode(s, **kwargs) + if not add_bos: + # in the middle of a session + if len(encoded) and encoded[0] == self.bos_token_id: + encoded = encoded[1:] + return encoded + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + skip_special_tokens = True + t = t[offset:] + out_string = self.model.decode(t, + skip_special_tokens=skip_special_tokens) + if offset: + out_string = self._maybe_add_prefix_space(t, out_string) + return out_string + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + add_special_tokens = False + return self.model(s, add_special_tokens=add_special_tokens) + + +class Tokenizer: + """Tokenize prompts or de-tokenize tokens into texts. 
+ + Args: + model_file (str): the path of the tokenizer model + """ + + def __init__(self, model_file: str): + if model_file.endswith('.model'): + model_folder = osp.split(model_file)[0] + else: + model_folder = model_file + model_file = osp.join(model_folder, 'tokenizer.model') + tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json') + + model_file_exists = osp.exists(model_file) + config_exists = osp.exists(tokenizer_config_file) + use_hf_model = config_exists or not model_file_exists + self.logger = get_logger('lmdeploy') + if not use_hf_model: + self.model = SentencePieceTokenizer(model_file) + else: + self.model = HuggingFaceTokenizer(model_folder) + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_token_id + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_token_id + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + return self.model.encode(s, add_bos, **kwargs) + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + return self.model.decode(t, offset) + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + return self.model(s) + + def indexes_containing_token(self, token): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + encoded = self.encode(token, add_bos=False) + if len(encoded) > 1: + self.logger.warning( + f'The token {token}, its length of indexes {encoded} is over ' + 'than 1. 
Currently, it can not be used as stop words') + return [] + return self.model.indexes_containing_token(token) diff --git a/triton_models/preprocessing/1/tokenizer/tokenizer_config.json b/triton_models/preprocessing/1/tokenizer/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f133449013be570f08fdf7c70f1a2c8ccb4724da --- /dev/null +++ b/triton_models/preprocessing/1/tokenizer/tokenizer_config.json @@ -0,0 +1,90 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92538": { + "content": "<|plugin|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92539": { + "content": "<|interpreter|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92540": { + "content": "<|action_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92541": { + "content": "<|action_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92542": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92543": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "auto_map": { + "AutoTokenizer": [ + "tokenization_internlm.InternLMTokenizer", + null + ] + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "tokenizer_class": "InternLMTokenizer", + "unk_token": "" +} diff --git a/triton_models/preprocessing/config.pbtxt b/triton_models/preprocessing/config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..997ba399ba04f1f521bdbf088815d1dd3c26f696 --- /dev/null +++ b/triton_models/preprocessing/config.pbtxt @@ -0,0 +1,37 @@ +name: "preprocessing" +backend: "python" +max_batch_size: 1 + +input [ + { + name: "QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + } +] +output [ + { + name: "INPUT_ID" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "REQUEST_INPUT_LEN" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 4 + kind: KIND_CPU + } +] + +parameters { + key: "tokenizer_path" + value: { + string_value: "tokenizer/tokenizer.model" + } +} diff --git a/triton_models/tokenizer/config.json b/triton_models/tokenizer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..84235b8a1a9618cc0ac265caf61ea4088780e3b1 --- /dev/null +++ b/triton_models/tokenizer/config.json @@ -0,0 +1,37 @@ +{ + "_name_or_path": "/root/psy/internlm2-7b/work_dirs/internlm2_chat_7b_qlora_oasst1_512_e3_copy/hf_2/merge", + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": 
"eager", + "auto_map": { + "AutoConfig": "configuration_internlm.InternLMConfig", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM" + }, + "bias": false, + "bos_token_id": 1, + "eos_token_id": 2, + "fp16": true, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "internlm", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 2, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.37.2", + "use_cache": false, + "vocab_size": 92544 +} diff --git a/triton_models/tokenizer/configuration_internlm.py b/triton_models/tokenizer/configuration_internlm.py new file mode 100644 index 0000000000000000000000000000000000000000..4d013582feaa1f9970a4256c4a0f77000fa645de --- /dev/null +++ b/triton_models/tokenizer/configuration_internlm.py @@ -0,0 +1,164 @@ +# coding=utf-8 +# Copyright (c) InternLM. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" InternLM model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class InternLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate + an InternLM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the InternLM-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`InternLMModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. 
+ num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + + ```python + >>> from transformers import InternLMModel, InternLMConfig + + >>> # Initializing a InternLM internlm-7b style configuration + >>> configuration = InternLMConfig() + + >>> # Initializing a model from the internlm-7b style configuration + >>> model = InternLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "internlm" + _auto_class = "AutoConfig" + + def __init__( # pylint: disable=W0102 + self, + vocab_size=103168, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + bias=True, + rope_theta=10000, + rope_scaling=None, + attn_implementation="eager", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.bias = bias + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + + self.attn_implementation = attn_implementation + if self.attn_implementation is None: + self.attn_implementation = "eager" + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` 
configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}") diff --git a/triton_models/tokenizer/generation_config.json b/triton_models/tokenizer/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5efeadd3bf2caa4462a3be79d580690f410668 --- /dev/null +++ b/triton_models/tokenizer/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2, + "transformers_version": "4.37.2" +} diff --git a/triton_models/tokenizer/modeling_internlm2.py b/triton_models/tokenizer/modeling_internlm2.py new file mode 100644 index 0000000000000000000000000000000000000000..39d6f71d2933385988ec05f845d3f6386c97f74b --- /dev/null +++ b/triton_models/tokenizer/modeling_internlm2.py @@ -0,0 +1,1385 @@ +# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on transformers/src/transformers/models/llama/modeling_llama.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
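The `_rope_scaling_validation` helper above only accepts a two-key dict whose `type` is "linear" or "dynamic" and whose `factor` is a float >= 1.0. A minimal sketch (not part of this diff) of instantiating the config with the dynamic-NTK setting that this checkpoint's config.json uses, assuming configuration_internlm.py is importable from the working directory:

# Illustrative sketch only: exercises _rope_scaling_validation with the values
# found in this repo's config.json (dynamic NTK scaling, factor 2.0).
from configuration_internlm import InternLMConfig

cfg = InternLMConfig(
    vocab_size=92544,
    num_attention_heads=32,
    num_key_value_heads=8,  # grouped-query attention: 4 query heads per KV head
    max_position_embeddings=32768,
    rope_theta=1000000,
    rope_scaling={"type": "dynamic", "factor": 2.0},  # validated in __init__
)
print(cfg.rope_scaling)  # {'type': 'dynamic', 'factor': 2.0}

An unrecognized type (e.g. "yarn") or a non-float factor would raise ValueError from the same check.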
+""" PyTorch InternLM2 model.""" +import math +import queue +import threading +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from einops import rearrange +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) + +try: + from transformers.generation.streamers import BaseStreamer +except: # noqa # pylint: disable=bare-except + BaseStreamer = None + +from .configuration_internlm import InternLMConfig as InternLM2Config + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "InternLM2Config" + +flash_attn_func, flash_attn_varlen_func = None, None +pad_input, index_first_axis, unpad_input = None, None, None +def _import_flash_attn(): + global flash_attn_func, flash_attn_varlen_func + global pad_input, index_first_axis, unpad_input + try: + from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func + from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input + flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func + pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input + except ImportError: + raise ImportError("flash_attn is not installed.") + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2 +class InternLM2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + InternLM2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2 +class InternLM2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with Dynamic NTK scaling. + Credits to the Reddit users /u/bloc97 and /u/emozilla. + """ + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors.""" + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class InternLM2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x)) + + return down_proj + + +# Copied from 
transformers.model.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Modified from transformers.model.llama.modeling_llama.LlamaAttention +class InternLM2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: InternLM2Config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.wqkv = nn.Linear( + self.hidden_size, + (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, + bias=config.bias, + ) + + self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = InternLM2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "dynamic": + self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + elif scaling_type == "linear": + self.rotary_emb = InternLM2LinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + else: + raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.") + return self.rotary_emb + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
" + "Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + "b q (h gs d) -> b q h gs d", + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d") + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Modified from transformers.model.llama.modeling_llama.InternLM2FlashAttention2 +class InternLM2FlashAttention2(InternLM2Attention): + """ + InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # InternLM2FlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + "b q (h gs d) -> b q h gs d", + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d") + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len + ) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) + """ + # Contains at least one padding token in the sequence + causal = self.is_causal and query_length != 1 + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
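+            # (with left padding the real tokens are right-aligned, so the last `query_length`
+            # mask columns are exactly the ones that belong to the query tokens)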
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q.to(torch.int64), + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + +INTERNLM2_ATTENTION_CLASSES = { + "eager": InternLM2Attention, + "flash_attention_2": InternLM2FlashAttention2, +} + +# Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer +class InternLM2DecoderLayer(nn.Module): + def __init__(self, config: InternLM2Config): + super().__init__() + self.hidden_size = config.hidden_size + + self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config) + + self.feed_forward = InternLM2MLP(config) + self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + + hidden_states = self.attention_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.ffn_norm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +InternLM2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`InternLM2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2 +@add_start_docstrings( + "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.", + InternLM2_START_DOCSTRING, +) +class InternLM2PreTrainedModel(PreTrainedModel): + config_class = InternLM2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["InternLM2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +InternLM2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or + when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. 
+ + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Modified from transformers.model.llama.modeling_llama.LlamaModel +@add_start_docstrings( + "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.", + InternLM2_START_DOCSTRING, +) +class InternLM2Model(InternLM2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`InternLM2DecoderLayer`] + + Args: + config: InternLM2Config + """ + + _auto_class = "AutoModel" + + def __init__(self, config: InternLM2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + + self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + + self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.tok_embeddings + + def set_input_embeddings(self, value): + self.tok_embeddings = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.attn_implementation == "flash_attention_2": + _import_flash_attn() + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, 
dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.tok_embeddings(input_ids) + + if self.config.attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +# Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM +class InternLM2ForCausalLM(InternLM2PreTrainedModel): + _auto_class = "AutoModelForCausalLM" + + _tied_weights_keys = ["output.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = InternLM2Model(config) + self.vocab_size = config.vocab_size + self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + def get_output_embeddings(self): + return self.output + + def set_output_embeddings(self, new_embeddings): + self.output = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + 
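+    # The `forward` below projects the final hidden states through the `output` head to get
+    # next-token logits (upcast to float32). When `labels` are given, the loss is the usual
+    # causal-LM cross entropy with a one-position shift: logits[..., :-1, :] are scored
+    # against labels[..., 1:], and label positions set to -100 are ignored.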
@add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, InternLM2ForCausalLM + + >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.output(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + past_length = 
past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""): + prompt = "" + if meta_instruction: + prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""" + else: + prompt += "" + for record in history: + prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n""" + prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""" + return tokenizer([prompt], return_tensors="pt") + + @torch.no_grad() + def chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + streamer: Optional[BaseStreamer] = None, + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n" + "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). 
It is designed to be helpful, honest, and harmless.\n" + "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.", + **kwargs, + ): + inputs = self.build_inputs(tokenizer, query, history, meta_instruction) + inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)} + # also add end-of-assistant token in eos token id to avoid unnecessary generation + eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]] + outputs = self.generate( + **inputs, + streamer=streamer, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + eos_token_id=eos_token_id, + **kwargs, + ) + outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :] + response = tokenizer.decode(outputs, skip_special_tokens=True) + response = response.split("<|im_end|>")[0] + history = history + [(query, response)] + return response, history + + @torch.no_grad() + def stream_chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + **kwargs, + ): + """ + Return a generator in format: (response, history) + Eg. + ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) + ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) + """ + if BaseStreamer is None: + raise ModuleNotFoundError( + "The version of `transformers` is too low. Please make sure " + "that you have installed `transformers>=4.28.0`." + ) + + response_queue = queue.Queue(maxsize=20) + + class ChatStreamer(BaseStreamer): + def __init__(self, tokenizer) -> None: + super().__init__() + self.tokenizer = tokenizer + self.queue = response_queue + self.query = query + self.history = history + self.response = "" + self.received_inputs = False + self.queue.put((self.response, history + [(self.query, self.response)])) + + def put(self, value): + if len(value.shape) > 1 and value.shape[0] > 1: + raise ValueError("ChatStreamer only supports batch size 1") + elif len(value.shape) > 1: + value = value[0] + + if not self.received_inputs: + # The first received value is input_ids, ignore here + self.received_inputs = True + return + + token = self.tokenizer.decode([value[-1]], skip_special_tokens=True) + if token.strip() != "<|im_end|>": + self.response = self.response + token + history = self.history + [(self.query, self.response)] + self.queue.put((self.response, history)) + + def end(self): + self.queue.put(None) + + def stream_producer(): + return self.chat( + tokenizer=tokenizer, + query=query, + streamer=ChatStreamer(tokenizer=tokenizer), + history=history, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + **kwargs, + ) + + def consumer(): + producer = threading.Thread(target=stream_producer) + producer.start() + while True: + res = response_queue.get() + if res is None: + return + yield res + + return consumer() + + +# Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2 +@add_start_docstrings( + """ + The InternLM2 Model transformer with a sequence classification head on top (linear layer). + + [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, + as other causal models (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. 
If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + InternLM2_START_DOCSTRING, +) +class InternLM2ForSequenceClassification(InternLM2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = InternLM2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to( + logits.device + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/triton_models/tokenizer/placeholder b/triton_models/tokenizer/placeholder new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/triton_models/tokenizer/pytorch_model.bin.index.json b/triton_models/tokenizer/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..7d95cf180df4c423e817c55f30f5ce93ac80e220 --- /dev/null +++ b/triton_models/tokenizer/pytorch_model.bin.index.json @@ -0,0 +1,554 @@ +{ + "metadata": { + "total_size": 5251801088 + }, + "weight_map": { + "model.layers.0.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.qweight": 
"pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.0.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.0.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.1.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.1.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.10.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.10.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.10.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.qzeros": 
"pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.11.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.11.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.12.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.12.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + 
"model.layers.13.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.13.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.13.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.14.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.14.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.15.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.15.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + 
"model.layers.16.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.16.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.16.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.17.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.17.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.18.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.18.ffn_norm.weight": 
"pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.19.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.19.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.2.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.2.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.2.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.20.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.qweight": 
"pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.20.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.20.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.21.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.21.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.22.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.22.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.qzeros": 
"pytorch_model-00002-of-00003.bin", + "model.layers.23.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.23.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.23.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.24.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.24.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.25.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + 
"model.layers.25.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.25.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.attention_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w2.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.26.feed_forward.w3.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.26.ffn_norm.weight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wo.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention.wqkv.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w1.qweight": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w1.qzeros": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w1.scales": "pytorch_model-00002-of-00003.bin", + "model.layers.27.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.27.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.27.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + 
"model.layers.28.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.28.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.28.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.29.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.29.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.3.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.3.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.3.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.30.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + 
"model.layers.30.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.30.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.30.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wo.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention.wqkv.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.attention_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w1.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w2.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.qweight": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.qzeros": "pytorch_model-00003-of-00003.bin", + "model.layers.31.feed_forward.w3.scales": "pytorch_model-00003-of-00003.bin", + "model.layers.31.ffn_norm.weight": "pytorch_model-00003-of-00003.bin", + "model.layers.4.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", 
+ "model.layers.4.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.4.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.4.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.5.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.5.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.6.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.6.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.qzeros": 
"pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.7.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.7.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.8.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.8.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wo.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention.wqkv.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.attention_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w1.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w2.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.qweight": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.qzeros": "pytorch_model-00001-of-00003.bin", + "model.layers.9.feed_forward.w3.scales": "pytorch_model-00001-of-00003.bin", + "model.layers.9.ffn_norm.weight": "pytorch_model-00001-of-00003.bin", + "model.norm.weight": "pytorch_model-00003-of-00003.bin", + "model.tok_embeddings.weight": "pytorch_model-00001-of-00003.bin", + "output.weight": "pytorch_model-00003-of-00003.bin" + } +} diff --git 
a/triton_models/tokenizer/special_tokens_map.json b/triton_models/tokenizer/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..492d4b2966a1763442d426d880dbc29f94906e4c --- /dev/null +++ b/triton_models/tokenizer/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/triton_models/tokenizer/tokenization_internlm.py b/triton_models/tokenizer/tokenization_internlm.py new file mode 100644 index 0000000000000000000000000000000000000000..9792349c7fed6fc64476eabdd9dad7a84640c3ee --- /dev/null +++ b/triton_models/tokenizer/tokenization_internlm.py @@ -0,0 +1,240 @@ +# coding=utf-8 +# Copyright (c) InternLM. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for InternLM.""" +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"} + +PRETRAINED_VOCAB_FILES_MAP = {} + + +class InternLMTokenizer(PreTrainedTokenizer): + """ + Construct an InternLM tokenizer. Based on byte-level Byte-Pair-Encoding. + + Args: + vocab_file (`str`): + Path to the vocabulary file.
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + model_input_names = ["input_ids", "attention_mask"] + _auto_class = "AutoTokenizer" + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + decode_with_prefix_space=False, + clean_up_tokenization_spaces=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.decode_with_prefix_space = decode_with_prefix_space + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + self._no_prefix_space_tokens = None + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + """ Initialization""" + + @property + def no_prefix_space_tokens(self): + if self._no_prefix_space_tokens is None: + vocab = self.convert_ids_to_tokens(list(range(self.vocab_size))) + self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")} + return self._no_prefix_space_tokens + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + @property + def bos_token_id(self) -> Optional[int]: + return self.sp_model.bos_id() + + @property + def eos_token_id(self) -> Optional[int]: + return self.sp_model.eos_id() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def _maybe_add_prefix_space(self, tokens, decoded): + if tokens and tokens[0] not in self.no_prefix_space_tokens: + return " " + decoded + else: + return decoded + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + out_string = self.clean_up_tokenization(out_string) + out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string) + return out_string[1:] + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is not None: + output = output + token_ids_1 + + if self.add_eos_token: + output = output + [self.eos_token_id] + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make + use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] diff --git a/triton_models/tokenizer/tokenizer.model b/triton_models/tokenizer/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408 --- /dev/null +++ b/triton_models/tokenizer/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b +size 1477754 diff --git a/triton_models/tokenizer/tokenizer.py b/triton_models/tokenizer/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..db936a5501cb07d33d56083656dbd734ba7431bf --- /dev/null +++ b/triton_models/tokenizer/tokenizer.py @@ -0,0 +1,400 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +import os +import os.path as osp +from collections import deque +from typing import List, Optional, Sequence, Union + +import torch + +from lmdeploy.utils import get_logger + +# this file will be copied to triton server, make sure all +# importing are starting from the package root lmdeploy + + +class SentencePieceTokenizer: + """Tokenizer of sentencepiece. + + Args: + model_file (str): the path of the tokenizer model + """ + + def __init__(self, model_file: str): + from sentencepiece import SentencePieceProcessor + self.model = SentencePieceProcessor(model_file=model_file) + self._prefix_space_tokens = None + # for stop words + self._maybe_decode_bytes: bool = None + # TODO maybe lack a constant.py + self._indexes_tokens_deque = deque(maxlen=10) + self.max_indexes_num = 5 + self.logger = get_logger('lmdeploy') + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size() + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_id() + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_id() + + @property + def prefix_space_tokens(self): + """tokens without prefix space.""" + if self._prefix_space_tokens is None: + vocab = self.model.IdToPiece(list(range(self.vocab_size))) + self._prefix_space_tokens = { + i + for i, tok in enumerate(vocab) if tok.startswith('▁') + } + return self._prefix_space_tokens + + def _maybe_add_prefix_space(self, tokens, decoded): + """maybe add prefix space for incremental decoding.""" + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + def indexes_containing_token(self, token: str): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + # traversing vocab is time consuming, can not be accelerated with + # multi threads (computation) or multi process (can't pickle tokenizer) + # so, we maintain latest 10 stop words and return directly if matched + for _token, _indexes in self._indexes_tokens_deque: + if token == _token: + return _indexes + if token == ' ': # ' ' is special + token = '▁' + vocab = self.model.IdToPiece(list(range(self.vocab_size))) + indexes = [i for i, voc in enumerate(vocab) if token in voc] + if len(indexes) > self.max_indexes_num: + indexes = self.encode(token, add_bos=False)[-1:] + self.logger.warning( + f'There are too many(>{self.max_indexes_num}) possible ' + f'indexes may decoding {token}, we will use {indexes} only') + self._indexes_tokens_deque.append((token, indexes)) + return indexes + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + return self.model.Encode(s, add_bos=add_bos, **kwargs) + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + if isinstance(t, torch.Tensor): + t = t.tolist() + t = t[offset:] + out_string = self.model.Decode(t) + if offset: + out_string = self._maybe_add_prefix_space(t, out_string) + return out_string + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. 
+ + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + import addict + add_bos = False + add_eos = False + + input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos) + return addict.Addict(input_ids=input_ids) + + +class HuggingFaceTokenizer: + """Tokenizer of sentencepiece. + + Args: + model_dir (str): the directory of the tokenizer model + """ + + def __init__(self, model_dir: str): + from transformers import AutoTokenizer + model_file = osp.join(model_dir, 'tokenizer.model') + backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json') + model_file_exists = osp.exists(model_file) + self.logger = get_logger('lmdeploy') + if not osp.exists(backend_tokenizer_file) and model_file_exists: + self.logger.warning( + 'Can not find tokenizer.json. ' + 'It may take long time to initialize the tokenizer.') + self.model = AutoTokenizer.from_pretrained(model_dir, + trust_remote_code=True) + self._prefix_space_tokens = None + # save tokenizer.json to reuse + if not osp.exists(backend_tokenizer_file) and model_file_exists: + if hasattr(self.model, 'backend_tokenizer'): + if os.access(model_dir, os.W_OK): + self.model.backend_tokenizer.save(backend_tokenizer_file) + + if self.model.eos_token_id is None: + generation_config_file = osp.join(model_dir, + 'generation_config.json') + if osp.exists(generation_config_file): + with open(generation_config_file, 'r') as f: + cfg = json.load(f) + self.model.eos_token_id = cfg['eos_token_id'] + elif hasattr(self.model, 'eod_id'): # Qwen remote + self.model.eos_token_id = self.model.eod_id + + # for stop words + self._vocab_size_with_added: int = None + self._maybe_decode_bytes: bool = None + # TODO maybe lack a constant.py + self._indexes_tokens_deque = deque(maxlen=10) + self.max_indexes_num = 5 + self.token2id = {} + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size + + @property + def vocab_size_with_added(self): + """vocabulary size with added vocab.""" + if self._vocab_size_with_added is not None: + return self._vocab_size_with_added + self._vocab_size_with_added = len(self.model.get_vocab()) + return self._vocab_size_with_added + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_token_id + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_token_id + + @property + def prefix_space_tokens(self): + """tokens without prefix space.""" + if self._prefix_space_tokens is None: + vocab = self.model.convert_ids_to_tokens( + list(range(self.vocab_size))) + self._prefix_space_tokens = { + i + for i, tok in enumerate(vocab) + if tok.startswith('▁' if isinstance(tok, str) else b' ') + } + return self._prefix_space_tokens + + def _maybe_add_prefix_space(self, tokens: List[int], decoded: str): + """maybe add prefix space for incremental decoding.""" + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + @property + def maybe_decode_bytes(self): + """Check if self.model.convert_ids_to_tokens return not a str value.""" + if self._maybe_decode_bytes is None: + self._maybe_decode_bytes = False + vocab = self.model.convert_ids_to_tokens( + list(range(self.vocab_size))) + for tok in vocab: + if not isinstance(tok, str): + self._maybe_decode_bytes = True + break + return self._maybe_decode_bytes + + def indexes_containing_token(self, token: str): + """Return all the possible indexes, whose decoding output may contain + 
the input token.""" + # traversing vocab is time consuming, can not be accelerated with + # multi threads (computation) or multi process (can't pickle tokenizer) + # so, we maintain latest 10 stop words and return directly if matched + for _token, _indexes in self._indexes_tokens_deque: + if token == _token: + return _indexes + + if self.token2id == {}: + # decode is slower than convert_ids_to_tokens + if self.maybe_decode_bytes: + self.token2id = { + self.model.decode(i): i + for i in range(self.vocab_size) + } + else: + self.token2id = { + self.model.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } + if token == ' ': # ' ' is special + token = '▁' + indexes = [i for _token, i in self.token2id.items() if token in _token] + if len(indexes) > self.max_indexes_num: + indexes = self.encode(token, add_bos=False)[-1:] + self.logger.warning( + f'There are too many(>{self.max_indexes_num}) possible ' + f'indexes may decoding {token}, we will use {indexes} only') + # there might be token id that exceeds self.vocab_size + if len(indexes) == 0: + indexes = self.encode(token, False) + if len(indexes) != 1: + self.logger.warning( + f'The token {token}, its length of indexes {indexes} is ' + 'not 1. Currently, it can not be used as stop words') + indexes = [] + self._indexes_tokens_deque.append((token, indexes)) + return indexes + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + encoded = self.model.encode(s, **kwargs) + if not add_bos: + # in the middle of a session + if len(encoded) and encoded[0] == self.bos_token_id: + encoded = encoded[1:] + return encoded + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + skip_special_tokens = True + t = t[offset:] + out_string = self.model.decode(t, + skip_special_tokens=skip_special_tokens) + if offset: + out_string = self._maybe_add_prefix_space(t, out_string) + return out_string + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + add_special_tokens = False + return self.model(s, add_special_tokens=add_special_tokens) + + +class Tokenizer: + """Tokenize prompts or de-tokenize tokens into texts. 
+ + Args: + model_file (str): the path of the tokenizer model + """ + + def __init__(self, model_file: str): + if model_file.endswith('.model'): + model_folder = osp.split(model_file)[0] + else: + model_folder = model_file + model_file = osp.join(model_folder, 'tokenizer.model') + tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json') + + model_file_exists = osp.exists(model_file) + config_exists = osp.exists(tokenizer_config_file) + use_hf_model = config_exists or not model_file_exists + self.logger = get_logger('lmdeploy') + if not use_hf_model: + self.model = SentencePieceTokenizer(model_file) + else: + self.model = HuggingFaceTokenizer(model_folder) + + @property + def vocab_size(self): + """vocabulary size.""" + return self.model.vocab_size + + @property + def bos_token_id(self): + """begine of the sentence token id.""" + return self.model.bos_token_id + + @property + def eos_token_id(self): + """end of the sentence token id.""" + return self.model.eos_token_id + + def encode(self, s: str, add_bos: bool = True, **kwargs): + """Tokenize a prompt. + + Args: + s (str): a prompt + Returns: + list[int]: token ids + """ + return self.model.encode(s, add_bos, **kwargs) + + def decode(self, t: Sequence[int], offset: Optional[int] = None): + """De-tokenize. + + Args: + t (List[int]): a list of token ids + offset (int): for incrementally decoding. Default to None, which + means not applied. + Returns: + str: text of decoding tokens + """ + return self.model.decode(t, offset) + + def __call__(self, s: Union[str, Sequence[str]]): + """Tokenize prompts. + + Args: + s (str): prompts + Returns: + list[int]: token ids + """ + return self.model(s) + + def indexes_containing_token(self, token): + """Return all the possible indexes, whose decoding output may contain + the input token.""" + encoded = self.encode(token, add_bos=False) + if len(encoded) > 1: + self.logger.warning( + f'The token {token}, its length of indexes {encoded} is over ' + 'than 1. 
Currently, it can not be used as stop words') + return [] + return self.model.indexes_containing_token(token) diff --git a/triton_models/tokenizer/tokenizer_config.json b/triton_models/tokenizer/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f133449013be570f08fdf7c70f1a2c8ccb4724da --- /dev/null +++ b/triton_models/tokenizer/tokenizer_config.json @@ -0,0 +1,90 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92538": { + "content": "<|plugin|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92539": { + "content": "<|interpreter|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92540": { + "content": "<|action_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92541": { + "content": "<|action_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92542": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92543": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "auto_map": { + "AutoTokenizer": [ + "tokenization_internlm.InternLMTokenizer", + null + ] + }, + "bos_token": "<s>", + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "</s>", + "tokenizer_class": "InternLMTokenizer", + "unk_token": "<unk>" +} diff --git a/triton_models/weights/config.ini b/triton_models/weights/config.ini new file mode 100644 index 0000000000000000000000000000000000000000..88f3d40970a1e663689736be546f8d3d64bb8734 --- /dev/null +++ b/triton_models/weights/config.ini @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c8358cd3fffcb86829f6b600bdd0ba77b6147eed572f88700ec4d914db070d6 +size 645 diff --git a/triton_models/weights/layers.0.attention.w_qkv.0.qweight b/triton_models/weights/layers.0.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..4f5435a75963ce7ce17b0536f500c8ebf8ca4220 --- /dev/null +++ b/triton_models/weights/layers.0.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1763929a6e7bbdafdb81d39ebfa08263351ccea12347aa68b292b1b7c458e45 +size 12582912 diff --git a/triton_models/weights/layers.0.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.0.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..52107ec494683ad0e0403e4189bcceed1ceabdcb --- /dev/null +++ b/triton_models/weights/layers.0.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1
+oid sha256:0ed40e83191f5304fd2df93ff5b90ae9a165bbe489af8020e06948fbbb289d7d +size 786432 diff --git a/triton_models/weights/layers.0.attention.wo.0.qweight b/triton_models/weights/layers.0.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6e21231bbe43b92e43a0d2600ed6969f6c00e767 --- /dev/null +++ b/triton_models/weights/layers.0.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6710235be94402052aaaae809e488f433d75d6d33acf546e2d0bf7aae4d8f0f +size 8388608 diff --git a/triton_models/weights/layers.0.attention.wo.0.scales_zeros b/triton_models/weights/layers.0.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4961bf6cfbf6ae7592675c56d719924794d8da68 --- /dev/null +++ b/triton_models/weights/layers.0.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c069c91ef3a796ac2e9e0230319fabb6bc8433c68284c6e5ca71baa477a3438 +size 524288 diff --git a/triton_models/weights/layers.0.attention_norm.weight b/triton_models/weights/layers.0.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..51dd734ab95204a4ce7fd026707a375f1a85219f --- /dev/null +++ b/triton_models/weights/layers.0.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dde3cfe82d02d87660f40c667186249cd17a5ee5924ab2a3ea0385919a2d0f3b +size 8192 diff --git a/triton_models/weights/layers.0.feed_forward.w13.0.qweight b/triton_models/weights/layers.0.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f3167a75e6defd59aa396437f58c797bb5cf1b2c --- /dev/null +++ b/triton_models/weights/layers.0.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26bc912102aa2b487baf312f3bfd8f97dc46ba6761c2328bfd3e45581bfbcfd4 +size 58720256 diff --git a/triton_models/weights/layers.0.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.0.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..68343cbdcbc17ec725af43c1a1d53b62bc5c32c0 --- /dev/null +++ b/triton_models/weights/layers.0.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309c93937a8778e4e4dce879efd1e0673f4bb7701644628abbaa8420e5b24cf0 +size 3670016 diff --git a/triton_models/weights/layers.0.feed_forward.w2.0.qweight b/triton_models/weights/layers.0.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3e82c77a6ba7b16d19d55f544f872223d33fba6d --- /dev/null +++ b/triton_models/weights/layers.0.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d096d08769d4b05f7483b4ed024224e0d4d35772231e757157e69c9c0dc1c6ef +size 29360128 diff --git a/triton_models/weights/layers.0.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.0.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..fee7031bc4703588c99d993aaf4e1c0f1d080e5b --- /dev/null +++ b/triton_models/weights/layers.0.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb73c0a0f614f1033850266d6ff4311374557a2653e0fa7857f8507ca87058e +size 1835008 diff --git a/triton_models/weights/layers.0.ffn_norm.weight b/triton_models/weights/layers.0.ffn_norm.weight new file mode 100644 index 
0000000000000000000000000000000000000000..e8f321d4e16161bcdf7f2b6979e9f90b8aa04ef3 --- /dev/null +++ b/triton_models/weights/layers.0.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b414270e0d50fbec62cdab6ecd217c2f688872d5ed7d9f91bb75dfff46651b +size 8192 diff --git a/triton_models/weights/layers.0.past_kv_scale.0.weight b/triton_models/weights/layers.0.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..e376c6acc6ad65b07267f834beda69a889c5f0b1 --- /dev/null +++ b/triton_models/weights/layers.0.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f7250671024d0129c45c3f3d8f57887921d219c280350697d41e9170925c77 +size 16 diff --git a/triton_models/weights/layers.1.attention.w_qkv.0.qweight b/triton_models/weights/layers.1.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bb3ebc7beaa1d925c4a14fbad6d2df2ec6bad94f --- /dev/null +++ b/triton_models/weights/layers.1.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a125e82d7ee989858902abca2bec9dc3f4ad74008f5307a1e7a635d148c53f3a +size 12582912 diff --git a/triton_models/weights/layers.1.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.1.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..bc0ed1f6f8ef00629e07ce4989e2ddde96723c08 --- /dev/null +++ b/triton_models/weights/layers.1.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f96d91127194d8a8404809f81602727e59903c86473ee27012bb303f83cdf77 +size 786432 diff --git a/triton_models/weights/layers.1.attention.wo.0.qweight b/triton_models/weights/layers.1.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2eaa43207863db980e17ed160bc4613b175baf27 --- /dev/null +++ b/triton_models/weights/layers.1.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4905342d79812e6bd9d6d993443ee6b30df2f80cef44176d1398dc884c458bad +size 8388608 diff --git a/triton_models/weights/layers.1.attention.wo.0.scales_zeros b/triton_models/weights/layers.1.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c136a82b25947dc950216cf643734a4a5ee81a36 --- /dev/null +++ b/triton_models/weights/layers.1.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c7971bdedd76bbe5630fd97b2badbdd26d22055ffe6fe0374fff051af9feb80 +size 524288 diff --git a/triton_models/weights/layers.1.attention_norm.weight b/triton_models/weights/layers.1.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..abe49b3b4fe282cbcf269cc92e4a1b03f8304d1b --- /dev/null +++ b/triton_models/weights/layers.1.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d589a6b27b707580d37c4b198dc952071bb1a34967ebd9175f9055ac012bc781 +size 8192 diff --git a/triton_models/weights/layers.1.feed_forward.w13.0.qweight b/triton_models/weights/layers.1.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..7d2bbd8d926a99dd1ba3adf0859660ace736b884 --- /dev/null +++ b/triton_models/weights/layers.1.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dd761cf75a1f95c5a55a245fbe1a8bca8967be0d7a03dd12108d0be835d7682 +size 58720256 
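Note on the tokenizer artifacts added earlier in this change (triton_models/tokenizer/tokenizer.py, tokenizer.model, tokenizer_config.json): the sketch below is a minimal, hypothetical round-trip check, not part of the diff. It assumes the converted workspace sits under ./triton_models and that lmdeploy, transformers, sentencepiece and torch are installed, since tokenizer.py imports lmdeploy.utils.get_logger.

# Hypothetical sanity check for the exported tokenizer (not in this diff).
import sys

# Assumed location of the files added by this change.
sys.path.insert(0, './triton_models/tokenizer')
from tokenizer import Tokenizer  # triton_models/tokenizer/tokenizer.py

# Passing the folder (not a .model file) makes Tokenizer look for
# tokenizer_config.json and pick its HuggingFace-backed branch.
tok = Tokenizer('./triton_models/tokenizer')

ids = tok.encode('hello, world', add_bos=True)   # token ids fed to the serving engine
print(ids)
print(tok.decode(ids))                           # decode back to text, one call per sequence

Because tokenizer_config.json is present in the workspace, the Tokenizer wrapper falls through to its HuggingFaceTokenizer branch (AutoTokenizer with trust_remote_code) rather than loading tokenizer.model through raw sentencepiece.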
diff --git a/triton_models/weights/layers.1.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.1.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..9fb67e07dca86f3c043855b520b84ed83c9b4930 --- /dev/null +++ b/triton_models/weights/layers.1.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4fdfeee03517f7896aadab5adec50c8449a2e1bda2f0cf5b8725b26057d1f6 +size 3670016 diff --git a/triton_models/weights/layers.1.feed_forward.w2.0.qweight b/triton_models/weights/layers.1.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..83348571bf69b92747b68f25d3755c7b2146e4c5 --- /dev/null +++ b/triton_models/weights/layers.1.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0c42be27fe2e9f48473b5cc4ec63cd06575ade857ea8699b4bd05eb4f801dc6 +size 29360128 diff --git a/triton_models/weights/layers.1.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.1.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7f8d31081aee57241eed23ae114dd5e39f9e6bbf --- /dev/null +++ b/triton_models/weights/layers.1.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe915a8697f98fe80270d235325b469219fac1c8a4529052fd15f6b1ee8f13e6 +size 1835008 diff --git a/triton_models/weights/layers.1.ffn_norm.weight b/triton_models/weights/layers.1.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6db47869baaf62ea10c904bb39ca2fd8dcb35aa5 --- /dev/null +++ b/triton_models/weights/layers.1.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90fa27f32ad04b368d7110fb689b24ea02904efb2f2b7a9f9be876c331fc7212 +size 8192 diff --git a/triton_models/weights/layers.1.past_kv_scale.0.weight b/triton_models/weights/layers.1.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..87ba80c2080cfc64bd645133d99c4fb0f602b920 --- /dev/null +++ b/triton_models/weights/layers.1.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08456e5241a0fbd14699cb889680261c9e0ca7d30051066d899e99be24e15d52 +size 16 diff --git a/triton_models/weights/layers.10.attention.w_qkv.0.qweight b/triton_models/weights/layers.10.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..35f6c98510eb157f0971d9d241b2ec765cd3c834 --- /dev/null +++ b/triton_models/weights/layers.10.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4d8d7ae69eea66730a10e906758105f2c99b16d082b9ea84d7e7cd8afcdbd4c +size 12582912 diff --git a/triton_models/weights/layers.10.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.10.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..77eb52490f504dbd5b089674f267142c27e7acc0 --- /dev/null +++ b/triton_models/weights/layers.10.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2885240377b91bd85bbe4ee6f67b8ca23233584c35ce71b752f9f3bbb66e266c +size 786432 diff --git a/triton_models/weights/layers.10.attention.wo.0.qweight b/triton_models/weights/layers.10.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..440d3e309d85cdfb81736fd024a2834f4d0ce308 --- /dev/null +++ 
b/triton_models/weights/layers.10.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae5115820467dcb2720eeb7abbdaf3ecd5edb56d9d7453fb0bf4f6b65323445a +size 8388608 diff --git a/triton_models/weights/layers.10.attention.wo.0.scales_zeros b/triton_models/weights/layers.10.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..18b5ecc65f6f8133a1821de0925d37622a67af48 --- /dev/null +++ b/triton_models/weights/layers.10.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4438217ed5de15cb91f4e30f0644b08952e981d25015dd4b75c4a0cae83517c2 +size 524288 diff --git a/triton_models/weights/layers.10.attention_norm.weight b/triton_models/weights/layers.10.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..4f0f39a02bb84010dd644e2fc96ef3b46d4c2820 --- /dev/null +++ b/triton_models/weights/layers.10.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd2c0d884542c0a881ef8fcfc9fbcc1feb67afbff0a8befc9bb741e2d8ea2af +size 8192 diff --git a/triton_models/weights/layers.10.feed_forward.w13.0.qweight b/triton_models/weights/layers.10.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bf50b623e7b1f4520d761286edd1db51a109c4c6 --- /dev/null +++ b/triton_models/weights/layers.10.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1258ea1e97e4c41db26a363eddedd3bd47c6d49f7bf738703c5746c54f4e37 +size 58720256 diff --git a/triton_models/weights/layers.10.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.10.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..ee36f684587a649d68d9579441ca3e90af8d7d6e --- /dev/null +++ b/triton_models/weights/layers.10.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e7492a7d4447980961b5891a0997f2568bdbe10ed15ba0998f8ca1bdaf0a4c +size 3670016 diff --git a/triton_models/weights/layers.10.feed_forward.w2.0.qweight b/triton_models/weights/layers.10.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b0cce8413321f6074dc61c7a28bc92377f4c7ab2 --- /dev/null +++ b/triton_models/weights/layers.10.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fb81b3c6a3f7b674506b003621b7e92925754e97d23ecb1209003f2232e33cb +size 29360128 diff --git a/triton_models/weights/layers.10.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.10.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..ce1603f2d10d9ae9ef7251cb66a02c3e0cba6b67 --- /dev/null +++ b/triton_models/weights/layers.10.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773b9c8eb4a3818b2667162b3169bd4fe813f2fcba5c708a49b79fa5c5053c61 +size 1835008 diff --git a/triton_models/weights/layers.10.ffn_norm.weight b/triton_models/weights/layers.10.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..bbe9a16316f0db34745e41ef00224f94b9237fee --- /dev/null +++ b/triton_models/weights/layers.10.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b576f4d059d0f37a4fd3e626e640dad540ff4758aa449bafe55a78046a01dc9b +size 8192 diff --git 
a/triton_models/weights/layers.10.past_kv_scale.0.weight b/triton_models/weights/layers.10.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..da0421db9e924c29c37c13c09376487aaa383c8d --- /dev/null +++ b/triton_models/weights/layers.10.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430d675f2f2e4512591d558ea6f29e42dd38c55ffcd8d21873a12e9ff90e15b2 +size 16 diff --git a/triton_models/weights/layers.11.attention.w_qkv.0.qweight b/triton_models/weights/layers.11.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d5058e0b21a7342d2379f3a9315e85ef9bbe7682 --- /dev/null +++ b/triton_models/weights/layers.11.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2871ddd112a88bb89a549de3bf1c53af525e962e118eb7ad0feac6a56599a26e +size 12582912 diff --git a/triton_models/weights/layers.11.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.11.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..92844164ec6f5b42e8222c577ce94bae5314a9c9 --- /dev/null +++ b/triton_models/weights/layers.11.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7017bdedc110df3a9f9fab19466968a5488b9ab3ad533f0908f2d368371adb +size 786432 diff --git a/triton_models/weights/layers.11.attention.wo.0.qweight b/triton_models/weights/layers.11.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c67e6d4b3e11faa456791b77155fef70589e246f --- /dev/null +++ b/triton_models/weights/layers.11.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:530e3110fadceb664c29ff9da577cf401128e93ae21601affd1c62137b04db35 +size 8388608 diff --git a/triton_models/weights/layers.11.attention.wo.0.scales_zeros b/triton_models/weights/layers.11.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4e0d310e48ae8ebd9b629872134eb3687a55e341 --- /dev/null +++ b/triton_models/weights/layers.11.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1725da8fac86700a95c4ee9d40cf9ebf0d1ebabb4b145c2d57c4a31c42299cb8 +size 524288 diff --git a/triton_models/weights/layers.11.attention_norm.weight b/triton_models/weights/layers.11.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..f57dfc1e256d2fca8f1c8d59982ea28fb2f209c8 --- /dev/null +++ b/triton_models/weights/layers.11.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cb24612b49347f84741d6daab9a90b828aab924fc9b21fd2d2ca6b67abf8ea8 +size 8192 diff --git a/triton_models/weights/layers.11.feed_forward.w13.0.qweight b/triton_models/weights/layers.11.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..27905dc8bb55b6305cefdf0135d72eda3e7e17d9 --- /dev/null +++ b/triton_models/weights/layers.11.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0af7f58d1e58e6610b5b56291bf697d79471c1eeaefdff9466fdc87996c3c86 +size 58720256 diff --git a/triton_models/weights/layers.11.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.11.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..474796975c206470856a63e5627806fdd1a9d0e4 --- /dev/null +++ 
b/triton_models/weights/layers.11.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46e2d6846839f995e9434c35519a1152c52285d29672febe66e9f07b0e7523e5 +size 3670016 diff --git a/triton_models/weights/layers.11.feed_forward.w2.0.qweight b/triton_models/weights/layers.11.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b8e4a4f967601a2151a7eb5da1c126599eea4743 --- /dev/null +++ b/triton_models/weights/layers.11.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ae182cb83af72cac11a76113fc5492ae4ccda1cd45df36facac10e65369d22c +size 29360128 diff --git a/triton_models/weights/layers.11.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.11.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..aac9a3ac0afb93d279461dacd82e1fd80dfb6161 --- /dev/null +++ b/triton_models/weights/layers.11.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54af6ef8d3b0aaa32183d5fb176a4d2097bd043e44ebea37ba43ac4021e18253 +size 1835008 diff --git a/triton_models/weights/layers.11.ffn_norm.weight b/triton_models/weights/layers.11.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6f958acb3e97bbc263ba99adb14ceb897dc7e573 --- /dev/null +++ b/triton_models/weights/layers.11.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ae646b4e03481a9e0eccf0a151deeae360012b79d455f413d6b4c8c05ead016 +size 8192 diff --git a/triton_models/weights/layers.11.past_kv_scale.0.weight b/triton_models/weights/layers.11.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..3bf7aed58e43958ad08d6b6e8beffe072f7e15e6 --- /dev/null +++ b/triton_models/weights/layers.11.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:114046d9b18a39823a18019529563163f191e5a74c65e959db74c96b77c9b4b9 +size 16 diff --git a/triton_models/weights/layers.12.attention.w_qkv.0.qweight b/triton_models/weights/layers.12.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b026bcfd8643c18461670a5a2980cf9a8539bb2b --- /dev/null +++ b/triton_models/weights/layers.12.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d30b7fa1db362abf3186072da75c305cd7e79f90f4b1eea6095014d9f7989da7 +size 12582912 diff --git a/triton_models/weights/layers.12.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.12.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..011903f321dd322447298b693e1eedb17f35c3ac --- /dev/null +++ b/triton_models/weights/layers.12.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:654fe994288ed138b388cb0e14a9c4e7124b601ac4efa404788e3267ed137307 +size 786432 diff --git a/triton_models/weights/layers.12.attention.wo.0.qweight b/triton_models/weights/layers.12.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fd89f748d1ea906c6617d240a4e123d243105b64 --- /dev/null +++ b/triton_models/weights/layers.12.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069d9e054d6cd0171b229e37a70b6a2fca364783cc8e80de9f81060931964e0b +size 8388608 diff --git a/triton_models/weights/layers.12.attention.wo.0.scales_zeros 
b/triton_models/weights/layers.12.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b46cd92e96aa0e40ba260aea37674bdb9fbf1fd6 --- /dev/null +++ b/triton_models/weights/layers.12.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:394968e46096fa0f50701fe0d09193561276359f023ea5dbc3a16bb3f1aff8b8 +size 524288 diff --git a/triton_models/weights/layers.12.attention_norm.weight b/triton_models/weights/layers.12.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..0020f8c429974d047571347728c95d5259c0da58 --- /dev/null +++ b/triton_models/weights/layers.12.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:020a5a9ed0a5065303d1079d24ce7252b639f6f76bf49c7b8fb5fac3bc93fc1b +size 8192 diff --git a/triton_models/weights/layers.12.feed_forward.w13.0.qweight b/triton_models/weights/layers.12.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f5cd9ca940d4417db1082cb6b445b56fc3ed304e --- /dev/null +++ b/triton_models/weights/layers.12.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9293f916e4009deb3dd715ac0fea08afe5be75548d2fe2e70a67fd5826664cea +size 58720256 diff --git a/triton_models/weights/layers.12.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.12.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..be6c9b7b29a56d2d3afaec63b36099fc29d1ba80 --- /dev/null +++ b/triton_models/weights/layers.12.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89899a4751211dda4328e2380ceec5d62d0d0b13fd164ccb7c9f5e189409a08f +size 3670016 diff --git a/triton_models/weights/layers.12.feed_forward.w2.0.qweight b/triton_models/weights/layers.12.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..53e4822e263ce179450dcfacefe7dd882447324d --- /dev/null +++ b/triton_models/weights/layers.12.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0f0481d3c7eeecc2717614f38dcd54163c287431e82da95a1e8d5fd182cc27 +size 29360128 diff --git a/triton_models/weights/layers.12.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.12.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..2f8d90a6c38370788887ee529f4ad8c7b4fd6593 --- /dev/null +++ b/triton_models/weights/layers.12.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690b11e4c0f825ec39db6b53fc1ccdd51d051c752199195f2cff8079ef3b980d +size 1835008 diff --git a/triton_models/weights/layers.12.ffn_norm.weight b/triton_models/weights/layers.12.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..35e00aeee302ec1726ef04c71f2a2f429fe0d23e --- /dev/null +++ b/triton_models/weights/layers.12.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6abd982c6b4b398f13a6113cfaefff0fe65190ff1b232c8b9a68acb30fbfdb +size 8192 diff --git a/triton_models/weights/layers.12.past_kv_scale.0.weight b/triton_models/weights/layers.12.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..8fb69a827363200f7cd82be1b4f35bab6e143bb7 --- /dev/null +++ b/triton_models/weights/layers.12.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:f3cee21f879722a16a454f6455c8d8c3aec77cbfdba6cbebac9c4762d1d03bb2 +size 16 diff --git a/triton_models/weights/layers.13.attention.w_qkv.0.qweight b/triton_models/weights/layers.13.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..63d098e6067e1aac3d4f6083c34f967abcfb40f4 --- /dev/null +++ b/triton_models/weights/layers.13.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:983fa35043fba20d8f39610fc859862486472388df708d85176e198b9493f194 +size 12582912 diff --git a/triton_models/weights/layers.13.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.13.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f78fb596aaf17a70c0fc17098a02d2fbd9f8b12e --- /dev/null +++ b/triton_models/weights/layers.13.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcfbdb8a6f2d86500e49d21e3d0cf88dda2e18b505be8459e46962f1a5403902 +size 786432 diff --git a/triton_models/weights/layers.13.attention.wo.0.qweight b/triton_models/weights/layers.13.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d0443fc30519b3ca74b5e3d4e0317af1dbe8b32d --- /dev/null +++ b/triton_models/weights/layers.13.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e76d5b55510b3111a4c8068f8bf2abe8372c9868a5346fd03831633817f49a3 +size 8388608 diff --git a/triton_models/weights/layers.13.attention.wo.0.scales_zeros b/triton_models/weights/layers.13.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..6cbcd17aed1ae804e9e87a936274b99c9ad81296 --- /dev/null +++ b/triton_models/weights/layers.13.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da85282928c5b1723c48e93cdadc416b400deb61bb90f28c4675989ab7d2f4f8 +size 524288 diff --git a/triton_models/weights/layers.13.attention_norm.weight b/triton_models/weights/layers.13.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..58edee2f8e729e06965c92f434900ae4f75e1a49 --- /dev/null +++ b/triton_models/weights/layers.13.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:592d7039e973372cadcf8b3f717c19ecbcb911e2f40140d617855643bf2bfa3f +size 8192 diff --git a/triton_models/weights/layers.13.feed_forward.w13.0.qweight b/triton_models/weights/layers.13.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..0f2f191246be551220b2b9df11e88d070f4b63c7 --- /dev/null +++ b/triton_models/weights/layers.13.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1cbe619508e858a2637045e1e07f9cb0ec4c6020d6041e40bc9558aaa9fd290 +size 58720256 diff --git a/triton_models/weights/layers.13.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.13.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8114a135ab96b7c28393bb44bad7050a71bd712c --- /dev/null +++ b/triton_models/weights/layers.13.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c555740ee91741c87411db09bc23b419caa191a4ac0ccf7e34b00fe64e614493 +size 3670016 diff --git a/triton_models/weights/layers.13.feed_forward.w2.0.qweight 
b/triton_models/weights/layers.13.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..efc53988aa0826924baa6153c20d1fb1abae3183 --- /dev/null +++ b/triton_models/weights/layers.13.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5434cecf17636b9bbdf1df6ae4b6d1eb6c06a611c93fe0291ad0d3892d850a81 +size 29360128 diff --git a/triton_models/weights/layers.13.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.13.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c927886fb77c90e7e2afb11bb38945c179e779cd --- /dev/null +++ b/triton_models/weights/layers.13.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89194f222aef9d0488e0677d654d9f4cc783cebad2ba76e9013ef99684a1c2c +size 1835008 diff --git a/triton_models/weights/layers.13.ffn_norm.weight b/triton_models/weights/layers.13.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..0044a510f007c3e66e363ee02bbc25f4c26cb6a6 --- /dev/null +++ b/triton_models/weights/layers.13.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75cc6d0e292ec019791db0f7ef63b0508d8a5d19404fadb09c1b06a8dcae7cdb +size 8192 diff --git a/triton_models/weights/layers.13.past_kv_scale.0.weight b/triton_models/weights/layers.13.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..313f047a7db61ca9b3fed45b948aad24958ec896 --- /dev/null +++ b/triton_models/weights/layers.13.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86a948027461837c94daa03c444ddaa2a484bdadcab47a89f78d0d332ba0370 +size 16 diff --git a/triton_models/weights/layers.14.attention.w_qkv.0.qweight b/triton_models/weights/layers.14.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d34a88071016d52838a914b177b787d6b7f5e989 --- /dev/null +++ b/triton_models/weights/layers.14.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd65317b8701a195eabe835058a9366309ad055eebd4354fe994187573dcfcb4 +size 12582912 diff --git a/triton_models/weights/layers.14.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.14.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..dbf55a9dd11b2bb29fb5f7a2ec180b89f6372195 --- /dev/null +++ b/triton_models/weights/layers.14.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a8b7af909bb0ee02940f92c80cde0a7a869e60bd4778c7eb5934ed7134b1e56 +size 786432 diff --git a/triton_models/weights/layers.14.attention.wo.0.qweight b/triton_models/weights/layers.14.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f2e7385fd3b0a6c38260980964dfd035abe25f95 --- /dev/null +++ b/triton_models/weights/layers.14.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f17aa0c464ae8e87100f9946574744e554c50847775d5e3cc888584c920b51bf +size 8388608 diff --git a/triton_models/weights/layers.14.attention.wo.0.scales_zeros b/triton_models/weights/layers.14.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cca81645ed7af2fd8f2039c751f0856ab6332929 --- /dev/null +++ b/triton_models/weights/layers.14.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:ac63fb5629b386babfc0cf09324e8388735c894def38688f57e5fa413a76a6b6 +size 524288 diff --git a/triton_models/weights/layers.14.attention_norm.weight b/triton_models/weights/layers.14.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..a2e5c82b9d622524d9390c76957ed9e8994aa2b8 --- /dev/null +++ b/triton_models/weights/layers.14.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d54e43cc40808a7a12fb34802e7e3fa239938943e4f247ea54556f65191e0e +size 8192 diff --git a/triton_models/weights/layers.14.feed_forward.w13.0.qweight b/triton_models/weights/layers.14.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..efb7ccb2234e6b179d310051c53ba547a39f7b6b --- /dev/null +++ b/triton_models/weights/layers.14.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f974af156ac932cd0619e0e86095071dccc8cd0608319df5c1042492b2002e9d +size 58720256 diff --git a/triton_models/weights/layers.14.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.14.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d916976c94c174148b04db334b907ec77c7d638 --- /dev/null +++ b/triton_models/weights/layers.14.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5be3c8f04a42c5e0c9de9d00508fbb981849cf188dba80cf6127d8f4b4b712d +size 3670016 diff --git a/triton_models/weights/layers.14.feed_forward.w2.0.qweight b/triton_models/weights/layers.14.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c926dcac71d930076be55189beacbb36cfb1a777 --- /dev/null +++ b/triton_models/weights/layers.14.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c33e3534172410d4656b1a244becc400d680dc19664a6fe5d2531f0733b24b1 +size 29360128 diff --git a/triton_models/weights/layers.14.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.14.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..78c574771e660fcfc3a237c9d56afe57b62f1ea0 --- /dev/null +++ b/triton_models/weights/layers.14.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3be2e077ef369c828ac8f31826249f327d120baaaf9d0141f67b9a814f95a57b +size 1835008 diff --git a/triton_models/weights/layers.14.ffn_norm.weight b/triton_models/weights/layers.14.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3094bf1d424cd5ba8300cb6dddb32e4bc9d78073 --- /dev/null +++ b/triton_models/weights/layers.14.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb3dd1a12abaf094e03a1d933aa4ab506d5c4c0cd21cf0802c04f4a0d5a85c7 +size 8192 diff --git a/triton_models/weights/layers.14.past_kv_scale.0.weight b/triton_models/weights/layers.14.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..a1ff0007bbe4e1f0abfdccce67158196a9b3ba13 --- /dev/null +++ b/triton_models/weights/layers.14.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39dfb751ce93881ea2c4e2f68155583024cfcf9e85b5705781348b079cc29b0d +size 16 diff --git a/triton_models/weights/layers.15.attention.w_qkv.0.qweight b/triton_models/weights/layers.15.attention.w_qkv.0.qweight new file mode 100644 index 
0000000000000000000000000000000000000000..8d981e2ef18ba6fa67894151d2e5d33aec76e769 --- /dev/null +++ b/triton_models/weights/layers.15.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f2d6afe6100ef0eb47d5b379ce3faa38ec1063ba36d47d9526647ea7fa4bda2 +size 12582912 diff --git a/triton_models/weights/layers.15.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.15.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..92d62c8db383b4e459224b1370a1d87eaa416096 --- /dev/null +++ b/triton_models/weights/layers.15.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8abb8c1bad2acba915885821b231c1884cd63fd978d62d23a25775671c97f9b +size 786432 diff --git a/triton_models/weights/layers.15.attention.wo.0.qweight b/triton_models/weights/layers.15.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..43781b59b7834c4758226fadd3757cd458eb9001 --- /dev/null +++ b/triton_models/weights/layers.15.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fca2dec7e83b35a6b582edfc05ddf49890b234aeba53a3d88384a436cc96c4c1 +size 8388608 diff --git a/triton_models/weights/layers.15.attention.wo.0.scales_zeros b/triton_models/weights/layers.15.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..51a58827bb1c84c5a11deab1134c99e4cd37f472 --- /dev/null +++ b/triton_models/weights/layers.15.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bb55b56df6d0d2c1f6f04d894e5d6e63d476b8fffe1dd0441a892eed850502 +size 524288 diff --git a/triton_models/weights/layers.15.attention_norm.weight b/triton_models/weights/layers.15.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..7e895dc7fffaa82cf585391595f009adf667e4cd --- /dev/null +++ b/triton_models/weights/layers.15.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06c4e4b6e08466593216c5fffe5bb16fbe296be7d83b8d67084a728b4f0d26d0 +size 8192 diff --git a/triton_models/weights/layers.15.feed_forward.w13.0.qweight b/triton_models/weights/layers.15.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..8dfc85e4b6b9e369447163acf76550539913fb5a --- /dev/null +++ b/triton_models/weights/layers.15.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b271e071ebc5f1e37284433f76d394ee2ba20920d64e64355f6c37672bd68f3 +size 58720256 diff --git a/triton_models/weights/layers.15.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.15.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c0f10138fba546a8c454600fd6a73289e0a7f8fd --- /dev/null +++ b/triton_models/weights/layers.15.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42f1cdd3b5b76e04cd4154950ade000eff8bfc44853c827ff351d00526201bc +size 3670016 diff --git a/triton_models/weights/layers.15.feed_forward.w2.0.qweight b/triton_models/weights/layers.15.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e0d0b67b1d9d4d9530690ac220e426dedaddb1fc --- /dev/null +++ b/triton_models/weights/layers.15.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7c44d9731ffc2bbd8a368f60064a8e8e85f50b04677d059c25fce70aae38dc81 +size 29360128 diff --git a/triton_models/weights/layers.15.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.15.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a99be30bc9c12257d3764ef09722a06f15ef0437 --- /dev/null +++ b/triton_models/weights/layers.15.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:287e909a7bd9bcc0b456c57c361a614c1898383785bccf9f57eee7f91599e3b3 +size 1835008 diff --git a/triton_models/weights/layers.15.ffn_norm.weight b/triton_models/weights/layers.15.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..630c4372de835971e521542c84649a00c3b2e403 --- /dev/null +++ b/triton_models/weights/layers.15.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8dafc8ea6132b5caec667dde3f6dda741e7ff23e40b8ff5f5ccc59232ca434b +size 8192 diff --git a/triton_models/weights/layers.15.past_kv_scale.0.weight b/triton_models/weights/layers.15.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..a47b7192fa2a190ceb02a526a527aed679e93740 --- /dev/null +++ b/triton_models/weights/layers.15.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c945e5779fcddbf5dff47a4c3502bce9ba0bace5158abc583e852d1418f9513a +size 16 diff --git a/triton_models/weights/layers.16.attention.w_qkv.0.qweight b/triton_models/weights/layers.16.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b17d911138bd69b5faa2b303479e7cca9c12b659 --- /dev/null +++ b/triton_models/weights/layers.16.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf8c2d841b0c3dfd0a4349bb4aa84c0d85141c14277e879c033484e225096715 +size 12582912 diff --git a/triton_models/weights/layers.16.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.16.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..bd4333af13bff4ad87c753e24461be8ab19102ab --- /dev/null +++ b/triton_models/weights/layers.16.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a54b05a6ce8083736ca7db382672bb83d215649338920308cf0edd2e4f1ae07 +size 786432 diff --git a/triton_models/weights/layers.16.attention.wo.0.qweight b/triton_models/weights/layers.16.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e09e8104c2418067fc961e4fa84dc074da5eaa81 --- /dev/null +++ b/triton_models/weights/layers.16.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b8f9b5eb6ea1827048eb48661af27f66fbf5f510055f7dfc813f28f79967c83 +size 8388608 diff --git a/triton_models/weights/layers.16.attention.wo.0.scales_zeros b/triton_models/weights/layers.16.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a056f4943ce26b8bb7e3c8d3d052feb2f324a4d8 --- /dev/null +++ b/triton_models/weights/layers.16.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3031c7a07ae7554fdc02af0112aaf4f343c164f1da7e65ac0926e0b33ec1daf +size 524288 diff --git a/triton_models/weights/layers.16.attention_norm.weight b/triton_models/weights/layers.16.attention_norm.weight new file mode 100644 index 
0000000000000000000000000000000000000000..710904f88b607829b98f69d31a704b5ccb2180d3 --- /dev/null +++ b/triton_models/weights/layers.16.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0996c709a45131cb25cd72865a06e38920f31941b25f83f2d78ed5751645c284 +size 8192 diff --git a/triton_models/weights/layers.16.feed_forward.w13.0.qweight b/triton_models/weights/layers.16.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..ea56d48779234f87b2b0a859e2cb110d0718e2b9 --- /dev/null +++ b/triton_models/weights/layers.16.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50fe105dfc87e7a2f06e12b9d1d92899b4b20106d29198eb7f8156c888b57620 +size 58720256 diff --git a/triton_models/weights/layers.16.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.16.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..5773631e90c5be54da0f5ca15e355b6bf855b4e3 --- /dev/null +++ b/triton_models/weights/layers.16.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8081c981a8cc02210f42ffa6b41e8f8a018cc273f18dd184e7a76ea6a14af908 +size 3670016 diff --git a/triton_models/weights/layers.16.feed_forward.w2.0.qweight b/triton_models/weights/layers.16.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..5a19b7dd919248c1d8f24d12508ffb36be409a0b --- /dev/null +++ b/triton_models/weights/layers.16.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b58ad7e7bd4aaf5109590b6f4b500643cea2e5ee7ecf3de2f2bafd931fecbba +size 29360128 diff --git a/triton_models/weights/layers.16.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.16.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..17e81af1aaa097a81bf4407a23e87dfb0810ba73 --- /dev/null +++ b/triton_models/weights/layers.16.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05659661021dfb93c23ca810756fba0afa33f7dc7103bb74e79a5b5cee0630c2 +size 1835008 diff --git a/triton_models/weights/layers.16.ffn_norm.weight b/triton_models/weights/layers.16.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..f45d501c72951cd1746375922f7e113162bef097 --- /dev/null +++ b/triton_models/weights/layers.16.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990398b91f28bd4d0ea10d21a8f911746291d93d353659c273a0d263f3f8b26f +size 8192 diff --git a/triton_models/weights/layers.16.past_kv_scale.0.weight b/triton_models/weights/layers.16.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..cc7a02ca2638e540d970eba9c8c2ca40c599f58e --- /dev/null +++ b/triton_models/weights/layers.16.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a46e5538c6531808ab35a4aa3f8acc92997393bf5778110738282e7d0b5a6253 +size 16 diff --git a/triton_models/weights/layers.17.attention.w_qkv.0.qweight b/triton_models/weights/layers.17.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b7d289a0a181f768648b3388209609a158c0d194 --- /dev/null +++ b/triton_models/weights/layers.17.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3a874ceb40f2cd87b1fbadffe4f336e766e4632d1486bae80a524aca3884a760 +size 12582912 diff --git a/triton_models/weights/layers.17.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.17.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..02676e7729a5ae2a782c7397622f5661a55ae306 --- /dev/null +++ b/triton_models/weights/layers.17.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e383f96fe0c11172a8eb7c833e16437243ddf5083fe742f2f5267c606bf46f +size 786432 diff --git a/triton_models/weights/layers.17.attention.wo.0.qweight b/triton_models/weights/layers.17.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f5d248ed5bb53bc83690b851c4850179affe3a1e --- /dev/null +++ b/triton_models/weights/layers.17.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ba47e294f57c2391d17559990d81c10b3febf1ac79cdaf9646ea4b5b1efe9ae +size 8388608 diff --git a/triton_models/weights/layers.17.attention.wo.0.scales_zeros b/triton_models/weights/layers.17.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cec2b0826f0458f462a1f155b2420afe3cade230 --- /dev/null +++ b/triton_models/weights/layers.17.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19655fc3273537cb5a737021f0914fcaba9f520ae85a241b6943a1e375859c5a +size 524288 diff --git a/triton_models/weights/layers.17.attention_norm.weight b/triton_models/weights/layers.17.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..493203ace8591c626f3ddd92a1d30a132fb91f7c --- /dev/null +++ b/triton_models/weights/layers.17.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f60382d336b8fe223742bf477d6e1d6b03a426c1397370821017d77560828a40 +size 8192 diff --git a/triton_models/weights/layers.17.feed_forward.w13.0.qweight b/triton_models/weights/layers.17.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fada103f386b9576504b44aad9effb7227b81161 --- /dev/null +++ b/triton_models/weights/layers.17.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6347e704f461d7d6ee0ae21b790cdd6180debf826b736f1862a27bc9ced0045 +size 58720256 diff --git a/triton_models/weights/layers.17.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.17.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e34de3f6584cca7245e62f91730286274c18de9f --- /dev/null +++ b/triton_models/weights/layers.17.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13d6a83305e5bb3038ce5829693b70573fbcbfd18ef9251f42334a92a864f2f2 +size 3670016 diff --git a/triton_models/weights/layers.17.feed_forward.w2.0.qweight b/triton_models/weights/layers.17.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..62706b91c086f1c95651471ed13767ce01618e08 --- /dev/null +++ b/triton_models/weights/layers.17.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62bbff754141a2d1cf72617d73f2522333bb2694a88e8a5b37c1aca6b22b17a0 +size 29360128 diff --git a/triton_models/weights/layers.17.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.17.feed_forward.w2.0.scales_zeros new file mode 
100644 index 0000000000000000000000000000000000000000..7d16b3f60264de0aab7805c342d890386aa3c7ec --- /dev/null +++ b/triton_models/weights/layers.17.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2aced42506d0f633676edf55b7de564b795eb6de86d8c0f6c0f1d1301233312 +size 1835008 diff --git a/triton_models/weights/layers.17.ffn_norm.weight b/triton_models/weights/layers.17.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..2115ea8bcc2774631a370c71a768d54242473864 --- /dev/null +++ b/triton_models/weights/layers.17.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7866c4443b210b814e1bcca660a34c2b78f21172253d2c53300be2c3e3d44fc +size 8192 diff --git a/triton_models/weights/layers.17.past_kv_scale.0.weight b/triton_models/weights/layers.17.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..945eb96703d8de2eef6085a642b1a27de7fb8cba --- /dev/null +++ b/triton_models/weights/layers.17.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8029ca34c285ba5e30b011338457cb6e1aa2bde375aa5bddeb10d5f735b827aa +size 16 diff --git a/triton_models/weights/layers.18.attention.w_qkv.0.qweight b/triton_models/weights/layers.18.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c8f8e2fdabca3f7c34468465c2a769b83df35ce8 --- /dev/null +++ b/triton_models/weights/layers.18.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:802bfc3126429a1c8f50bb8bc82a62b62b5e4fac66b2e5201d5ca3dadc76b2b0 +size 12582912 diff --git a/triton_models/weights/layers.18.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.18.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..15b491c33507c9aa77edc43db2d844a6f497fca7 --- /dev/null +++ b/triton_models/weights/layers.18.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b1e35a7c3f4353a260afd771398ed0e6f3fb0cfe2c9e57c9c6aa837187477b +size 786432 diff --git a/triton_models/weights/layers.18.attention.wo.0.qweight b/triton_models/weights/layers.18.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fda05fdf95a8e38dbba3ae8e857729fde60e6d1b --- /dev/null +++ b/triton_models/weights/layers.18.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d5e9b4b8ac11947e865c95a0ee01bea2b98bb4d8e186bc655980c0819220337 +size 8388608 diff --git a/triton_models/weights/layers.18.attention.wo.0.scales_zeros b/triton_models/weights/layers.18.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..56d79eb2481c7040c86fa26964ede1eeae1395e4 --- /dev/null +++ b/triton_models/weights/layers.18.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb7cefb270cbf64d8347c25b5d776be71d432c570ac277fc6dcb8160f358040 +size 524288 diff --git a/triton_models/weights/layers.18.attention_norm.weight b/triton_models/weights/layers.18.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3c20c25a40ad141d017b4cce8700f88ca3d8efca --- /dev/null +++ b/triton_models/weights/layers.18.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4dac1fd7000d40fa00eb19ec7e140c8fd08a7e2fba5ac80c0f15abf00fd9048e +size 8192 diff --git a/triton_models/weights/layers.18.feed_forward.w13.0.qweight b/triton_models/weights/layers.18.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3c1d6af45afa49731996db41ef7d18503411125c --- /dev/null +++ b/triton_models/weights/layers.18.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23dee44b6cb77a166863b69487459d9de5dfd4c3989306919d4c35dc20c884be +size 58720256 diff --git a/triton_models/weights/layers.18.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.18.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..54489f50388ea9154fce92dbadd4bf6a1a861f86 --- /dev/null +++ b/triton_models/weights/layers.18.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10a6c1e2ca46dac304c89690e837221b7cd15133dc1e7ccfb18f69187af51208 +size 3670016 diff --git a/triton_models/weights/layers.18.feed_forward.w2.0.qweight b/triton_models/weights/layers.18.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e90ed3787e1ac9da6ffed10588e004c09bf3b9b1 --- /dev/null +++ b/triton_models/weights/layers.18.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a35d9d5c12d752b160f51f53a49e9a763662605165cb85272e539b60a9f92055 +size 29360128 diff --git a/triton_models/weights/layers.18.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.18.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..17951129ba756efbad134062196862ef2b290c05 --- /dev/null +++ b/triton_models/weights/layers.18.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:845ca7749cf6829cc274de80528f41dbd289d125720a4f68417677871dd528c9 +size 1835008 diff --git a/triton_models/weights/layers.18.ffn_norm.weight b/triton_models/weights/layers.18.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3fdc07d36718c6a4fb843c7a0e547971f25bbe50 --- /dev/null +++ b/triton_models/weights/layers.18.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885808cbeec44e76e545008343da6029dce51d48908c85d61f4e3e5734a316a7 +size 8192 diff --git a/triton_models/weights/layers.18.past_kv_scale.0.weight b/triton_models/weights/layers.18.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..4b8d6bdb257005f9da0843e14b064394e5e12366 --- /dev/null +++ b/triton_models/weights/layers.18.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da3eda4da09ebaeb73ef447011ce0b9ef2ee982ab26d8d0408ad482f9b2b389e +size 16 diff --git a/triton_models/weights/layers.19.attention.w_qkv.0.qweight b/triton_models/weights/layers.19.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f58ac78fbf8480c4a875a904f3eca7296b9d1dc7 --- /dev/null +++ b/triton_models/weights/layers.19.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a697cc9e5c643856df75e5d40a4ddc810ad41c0ab9362ad6c7745862c000ccf +size 12582912 diff --git a/triton_models/weights/layers.19.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.19.attention.w_qkv.0.scales_zeros new file mode 100644 index 
0000000000000000000000000000000000000000..ff2f26342ca1663ff6c89e5015b02b41e976f9a9 --- /dev/null +++ b/triton_models/weights/layers.19.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5deb01a923b8c70c8adaa62c3b6128231899cb7c185908822279725696d1c819 +size 786432 diff --git a/triton_models/weights/layers.19.attention.wo.0.qweight b/triton_models/weights/layers.19.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f444fcc2661a285f914957b05cedde19a4954ace --- /dev/null +++ b/triton_models/weights/layers.19.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682754ebee51648ef7b0249fee7289fdf825e61916f97ec62087c8e39e9c14bb +size 8388608 diff --git a/triton_models/weights/layers.19.attention.wo.0.scales_zeros b/triton_models/weights/layers.19.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..41cb9a3fa2554343948079acebcb10fa2a940517 --- /dev/null +++ b/triton_models/weights/layers.19.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6d4a938a39924f222f02b460355a83ffb98a00ff19d05048c3bcb82c9e57edc +size 524288 diff --git a/triton_models/weights/layers.19.attention_norm.weight b/triton_models/weights/layers.19.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..5acd5f2587a22bc1a1e2870e9b4af8ea1eaeb505 --- /dev/null +++ b/triton_models/weights/layers.19.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63d26f2643a9aceebf2af38dbc611dc36da45a176257e478e62f85ddbc559f55 +size 8192 diff --git a/triton_models/weights/layers.19.feed_forward.w13.0.qweight b/triton_models/weights/layers.19.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..cc8dd8ef920737fc2e432adac1ce42303e7d7111 --- /dev/null +++ b/triton_models/weights/layers.19.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a54bcfb108f050cf4a7c7cb37114ceb35476b3f8bb6cf6c541e8df014fbf6133 +size 58720256 diff --git a/triton_models/weights/layers.19.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.19.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c378e9b9bed297468e52701cb4eea8586e317e8f --- /dev/null +++ b/triton_models/weights/layers.19.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11cb4b7bd0b53f894236952f72793d3d4e647e6d07fc37e1112b0c5ba392176c +size 3670016 diff --git a/triton_models/weights/layers.19.feed_forward.w2.0.qweight b/triton_models/weights/layers.19.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..131386a17e034a3ba0ce59be9c0351b35dfc20e1 --- /dev/null +++ b/triton_models/weights/layers.19.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f32b6e7bb6005ba215aa938a0b52300230f7008150b45a11916829314ef3494 +size 29360128 diff --git a/triton_models/weights/layers.19.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.19.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..af5383b2c8c39d1c54f5dea9298ea08f5cbe267b --- /dev/null +++ b/triton_models/weights/layers.19.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:84f83448a65d6bf12e5484bdf2805b2648a5ee6c0f71f592f1399a71f787a365 +size 1835008 diff --git a/triton_models/weights/layers.19.ffn_norm.weight b/triton_models/weights/layers.19.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6f5513a9af9eec5fbc82dd527339fb220156deb0 --- /dev/null +++ b/triton_models/weights/layers.19.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7e2f003c72088419d2608b060a98ab42356eeffed53510f1d468f4ccd3f1141 +size 8192 diff --git a/triton_models/weights/layers.19.past_kv_scale.0.weight b/triton_models/weights/layers.19.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..fd5be00138be7b2df59bf0b592a9bef86dc82eb8 --- /dev/null +++ b/triton_models/weights/layers.19.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c71b33b311eb0e23a8b2494a543ba1181fd72314b49cf78a9749b9cf4a00df4 +size 16 diff --git a/triton_models/weights/layers.2.attention.w_qkv.0.qweight b/triton_models/weights/layers.2.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2d9c45e71e2c0ab82208f4202b06c9b97f6ba148 --- /dev/null +++ b/triton_models/weights/layers.2.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fa15c6683fb8dd4f6a17b49bb0a989e462a984b2b1a62741c0261b0205e4d3a +size 12582912 diff --git a/triton_models/weights/layers.2.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.2.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cf230e2e4ec022b7dadc04504edd265c2736423a --- /dev/null +++ b/triton_models/weights/layers.2.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46a56b3063ca3e890569f20f0f9554bd4b8b3dce4dd28c6de2a2c8b018de692 +size 786432 diff --git a/triton_models/weights/layers.2.attention.wo.0.qweight b/triton_models/weights/layers.2.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2ec2d68e756cc1afd558415a1c748d3366f51240 --- /dev/null +++ b/triton_models/weights/layers.2.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:745bd18832a4be0427eecf06fbd16e5b4d9045d9bae02a538648bf061f1bcd31 +size 8388608 diff --git a/triton_models/weights/layers.2.attention.wo.0.scales_zeros b/triton_models/weights/layers.2.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..d1e959a3fa4ef4072ae44bb537bc108a99c3799e --- /dev/null +++ b/triton_models/weights/layers.2.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f165998aa89a2e93b82203e08444995edcdc00ed2dd2b3dc3171ed8c4aef68f +size 524288 diff --git a/triton_models/weights/layers.2.attention_norm.weight b/triton_models/weights/layers.2.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..775cfb53b3214e57d496df775c7f2e98df37a237 --- /dev/null +++ b/triton_models/weights/layers.2.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35db76352c3fef9616c14aefa7c0b05850df54a54e3e6c922df8876639c7048e +size 8192 diff --git a/triton_models/weights/layers.2.feed_forward.w13.0.qweight b/triton_models/weights/layers.2.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..1b19b3f633c84fa1134ae29f0bf9f119d9b25d42 --- 
/dev/null +++ b/triton_models/weights/layers.2.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5d14e61c9cc1a1874bbf7c1db7fb04e8b97f8d49e011bf0b5c2003a072083cf +size 58720256 diff --git a/triton_models/weights/layers.2.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.2.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e293bf94f00d2acb588e4a05e8b36c07adfd4cfe --- /dev/null +++ b/triton_models/weights/layers.2.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a79b8fb1590037f3bcbe91f25dbcb82b2b91fe0a109dca31de0493a089fcdd +size 3670016 diff --git a/triton_models/weights/layers.2.feed_forward.w2.0.qweight b/triton_models/weights/layers.2.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c43fcc94e533822deff81b234c66897d23c2a5aa --- /dev/null +++ b/triton_models/weights/layers.2.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbde66d92d3be35621cdb2171a2b9e5ab5448d229f07d7da65d25553adcce029 +size 29360128 diff --git a/triton_models/weights/layers.2.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.2.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c5beb7d2b7d8320386a5105a4a2618ceec4e4943 --- /dev/null +++ b/triton_models/weights/layers.2.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41bfc952713a7fd5409f909e9ab107d9ef734e730f7b00d97fc34ef24395e62e +size 1835008 diff --git a/triton_models/weights/layers.2.ffn_norm.weight b/triton_models/weights/layers.2.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..45e884fea486483f4689411e2b0f5841bb3e6317 --- /dev/null +++ b/triton_models/weights/layers.2.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f16599930e314f9a8ef2b760cc6773e75961152d32432b5fc3e411955dbdc227 +size 8192 diff --git a/triton_models/weights/layers.2.past_kv_scale.0.weight b/triton_models/weights/layers.2.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..70e74bf48eaad9dd65823e3d66a8d46c4452b13d --- /dev/null +++ b/triton_models/weights/layers.2.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7808c14f00dcb7b2b77edadc8852138f46802e013a3025e161a669adde20339 +size 16 diff --git a/triton_models/weights/layers.20.attention.w_qkv.0.qweight b/triton_models/weights/layers.20.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6053a83955560e1c2a84e72515c7672d70304835 --- /dev/null +++ b/triton_models/weights/layers.20.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45521551eeea8b702589fe7c6b19749333abf647f53f56713807dc38f58041ec +size 12582912 diff --git a/triton_models/weights/layers.20.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.20.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..0e188dc213c48bf55e4b2001a68e495c895187a7 --- /dev/null +++ b/triton_models/weights/layers.20.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d9740714493408c67acb934d26406c11421ab7efdabd743bd990103a90f701 +size 786432 diff --git 
a/triton_models/weights/layers.20.attention.wo.0.qweight b/triton_models/weights/layers.20.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..730a6aa484d4286f408baf8abf88ea73e0b5aa02 --- /dev/null +++ b/triton_models/weights/layers.20.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55586decc011d181feef941588d73d75de2ec8040bce7db734699a33a7bd6f42 +size 8388608 diff --git a/triton_models/weights/layers.20.attention.wo.0.scales_zeros b/triton_models/weights/layers.20.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..affb6ab65788c985dc6ccf43d5cb3fcc8f4e91f6 --- /dev/null +++ b/triton_models/weights/layers.20.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3dff92bdb0d4bd34ecf08c0c024d9aabfeb9dc6407b55b55d25835922bddb9c +size 524288 diff --git a/triton_models/weights/layers.20.attention_norm.weight b/triton_models/weights/layers.20.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..a4b06c9551477c77ebc9de6151cd219a9c13f63c --- /dev/null +++ b/triton_models/weights/layers.20.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dfd453a8ca7eaa0368df85c67b0c4520d044c50e21e3e9c642016e56425fe2c +size 8192 diff --git a/triton_models/weights/layers.20.feed_forward.w13.0.qweight b/triton_models/weights/layers.20.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e0aa342e545feda824e44af8745b7bf6714e3672 --- /dev/null +++ b/triton_models/weights/layers.20.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a12408ddaac163c3473e187a838044bf3c05b1a72758d6b77338da700a74f845 +size 58720256 diff --git a/triton_models/weights/layers.20.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.20.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..595f2605064e623b1acbbbb39aad1abe47d2b5fe --- /dev/null +++ b/triton_models/weights/layers.20.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a20c9c4a6621e851abb268c647e4f9459277dc53bc5f64a0504562c9e7736b61 +size 3670016 diff --git a/triton_models/weights/layers.20.feed_forward.w2.0.qweight b/triton_models/weights/layers.20.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3881b21e76f4c55a6f5a94d56794ece1d12912e8 --- /dev/null +++ b/triton_models/weights/layers.20.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e13a13177f50e58cd454dfef4083e8b8da065d25bd277aeabcbbd65d9c7ee2db +size 29360128 diff --git a/triton_models/weights/layers.20.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.20.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f0c038b596c5143988722e1d044fdba36b9f4c53 --- /dev/null +++ b/triton_models/weights/layers.20.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2bb55062eaf5f412bae85c9ac428ddc2e0e59d0e53ebd21abb1228cf4d1ea3c +size 1835008 diff --git a/triton_models/weights/layers.20.ffn_norm.weight b/triton_models/weights/layers.20.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..3cfe4cc50ce587ea9b564a20130b4fe2225d7d52 --- /dev/null +++ 
b/triton_models/weights/layers.20.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37c809eef52d6f683a42650531b04e14b95934556c2f3607466882fff2c7a049 +size 8192 diff --git a/triton_models/weights/layers.20.past_kv_scale.0.weight b/triton_models/weights/layers.20.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..3fe9d60389494bd97b6721514bbf76a4a2f4aeea --- /dev/null +++ b/triton_models/weights/layers.20.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97801b00a17ab91f1019edf80b667e915c772df1461e322cb8602d8bd831a8b1 +size 16 diff --git a/triton_models/weights/layers.21.attention.w_qkv.0.qweight b/triton_models/weights/layers.21.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..905d5eb82f1967282905cf3974e526f1e48e2b90 --- /dev/null +++ b/triton_models/weights/layers.21.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2792bae2516c6d5167b1efdd66141ddc18439be883865eee923aa0d64f3501f7 +size 12582912 diff --git a/triton_models/weights/layers.21.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.21.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..9a1f6b2beb40845a92a60a5b1ea44afefad5446c --- /dev/null +++ b/triton_models/weights/layers.21.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:953b7c49b7ba4bab3b5ab552b697d5be9184144ec4f8f6ea9815a0e12420a4c6 +size 786432 diff --git a/triton_models/weights/layers.21.attention.wo.0.qweight b/triton_models/weights/layers.21.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fbd8d63b76ae1f3a0394dfd4c09e724627ce656a --- /dev/null +++ b/triton_models/weights/layers.21.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f719914491c7941474c1b6efa5a79541ade54eff71a6d65a28dcff17baeacd89 +size 8388608 diff --git a/triton_models/weights/layers.21.attention.wo.0.scales_zeros b/triton_models/weights/layers.21.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3199f31825d84cf98169a9ac8361fd01195c513a --- /dev/null +++ b/triton_models/weights/layers.21.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21e70d0275306b0d766b533780955602dc9d5163028c509745120b4e9dd070d1 +size 524288 diff --git a/triton_models/weights/layers.21.attention_norm.weight b/triton_models/weights/layers.21.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..ace9b471c09970005b6d8dcb34406ac8671f3340 --- /dev/null +++ b/triton_models/weights/layers.21.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f5b37279d734e53f01e524b941104c4a2a0794819cb443255e46130190eb060 +size 8192 diff --git a/triton_models/weights/layers.21.feed_forward.w13.0.qweight b/triton_models/weights/layers.21.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..93ad736f2b44139c784864069aece4a59db96543 --- /dev/null +++ b/triton_models/weights/layers.21.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7298a7ea1a9a2f16bfcca14510dce8da6342ceaccf48354e63945a00c86a8887 +size 58720256 diff --git a/triton_models/weights/layers.21.feed_forward.w13.0.scales_zeros 
b/triton_models/weights/layers.21.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a7e502a74af20d234730806f84f0ee0fbec81a3d --- /dev/null +++ b/triton_models/weights/layers.21.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90e896e7361f2fde100ee9cbf4591ba2509c11ad2e06ff9150614c28f39f6cc7 +size 3670016 diff --git a/triton_models/weights/layers.21.feed_forward.w2.0.qweight b/triton_models/weights/layers.21.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e129776d2c3518130aa1688eefa5ce1d57e1f1cb --- /dev/null +++ b/triton_models/weights/layers.21.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0415c4da6fb2feb289a75e84a73c525272f0098ee5c14faf5544454178576f62 +size 29360128 diff --git a/triton_models/weights/layers.21.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.21.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..60435a424658f628b48358ed84954acb2782b727 --- /dev/null +++ b/triton_models/weights/layers.21.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ff5c969303a6b351d8bb80064aad2c92e8c5c32d85bff840317ca0739ced463 +size 1835008 diff --git a/triton_models/weights/layers.21.ffn_norm.weight b/triton_models/weights/layers.21.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6655336998857a70516ff902b71f61175fd1a6c3 --- /dev/null +++ b/triton_models/weights/layers.21.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8042770bf17c4b7520332fdeeef3decf2eb77871e6d80a2fcfe79e850827faae +size 8192 diff --git a/triton_models/weights/layers.21.past_kv_scale.0.weight b/triton_models/weights/layers.21.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..68bb063c7fe76ee11dc858fe2552eff20f89fc06 --- /dev/null +++ b/triton_models/weights/layers.21.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:babef4e3b7889042e89f865f3c8bb53f6191e2c9329e3eb418e0627256b4bbf7 +size 16 diff --git a/triton_models/weights/layers.22.attention.w_qkv.0.qweight b/triton_models/weights/layers.22.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..26e5e328af67eb6995b4eccd4f3f47e2a5572bbb --- /dev/null +++ b/triton_models/weights/layers.22.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3845fa57cee6ae1adc7c640c17820f11d196a86138e3ab1b26d1fcdb5a12d480 +size 12582912 diff --git a/triton_models/weights/layers.22.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.22.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..25e896649de6e4eebef3fb52b4695e66834ea627 --- /dev/null +++ b/triton_models/weights/layers.22.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a8fb6d26d3741fbf2dbd24d9e96a689ce0d8311349bc7b7d487a94ffae7309 +size 786432 diff --git a/triton_models/weights/layers.22.attention.wo.0.qweight b/triton_models/weights/layers.22.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..30d513ba9872686a172b2e5bb54d7dc19c89b18b --- /dev/null +++ b/triton_models/weights/layers.22.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:6e8c0a44652ccfbbb876d6c56c552653b788b14188b48f41b957d17036111f93 +size 8388608 diff --git a/triton_models/weights/layers.22.attention.wo.0.scales_zeros b/triton_models/weights/layers.22.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..63489132ff37547f3c5a7082e39f7d6e60d99e2f --- /dev/null +++ b/triton_models/weights/layers.22.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cf24c066812a6a36df8eec192b40520df7d10573d5a2bfd2327ddaecf6e938a +size 524288 diff --git a/triton_models/weights/layers.22.attention_norm.weight b/triton_models/weights/layers.22.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..67e9beee3472ac10efd53bef75c3678f86f0287a --- /dev/null +++ b/triton_models/weights/layers.22.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87603494aa61475dfc747464841436f303bcf654dc27b1a07564f53558ebc0e8 +size 8192 diff --git a/triton_models/weights/layers.22.feed_forward.w13.0.qweight b/triton_models/weights/layers.22.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a6f81f752873c957d60d333f567fcf45dc101888 --- /dev/null +++ b/triton_models/weights/layers.22.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37604a1d32f8001155e15ab4e13282b050da543ad0d0a25b759081246fdbdb15 +size 58720256 diff --git a/triton_models/weights/layers.22.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.22.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7fc132bdca2ee4128bec7e863686fdca2f7aebf4 --- /dev/null +++ b/triton_models/weights/layers.22.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06d1aced0b15076b9f26d4ea4f4f6b732368d7b373e7a588635da39cb9db5f39 +size 3670016 diff --git a/triton_models/weights/layers.22.feed_forward.w2.0.qweight b/triton_models/weights/layers.22.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2679586d03d73f48a045c13e8c8b19ad6eaa9b50 --- /dev/null +++ b/triton_models/weights/layers.22.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b2a9ac0ae91a96deefa360ba92e79339705410d925b2356b9815692ea31061 +size 29360128 diff --git a/triton_models/weights/layers.22.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.22.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..7216f3454da54e1117fd4e92befe84b4c8b46a1a --- /dev/null +++ b/triton_models/weights/layers.22.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a99b63ab8c94e4d8f81bc8cab1561f47e3c2bac9f6e13f0b23d9438e02d7d1e +size 1835008 diff --git a/triton_models/weights/layers.22.ffn_norm.weight b/triton_models/weights/layers.22.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..4d71b5ceacf9dcc9afaaf1adf8978c2911ea951f --- /dev/null +++ b/triton_models/weights/layers.22.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309c8793e4e6d01a426ded64878ab5bb81fc897a4369e2e12e180067d9e2f97f +size 8192 diff --git a/triton_models/weights/layers.22.past_kv_scale.0.weight b/triton_models/weights/layers.22.past_kv_scale.0.weight new file mode 
100644 index 0000000000000000000000000000000000000000..265569647dc54011c0c7aa312cda60679eddf224 --- /dev/null +++ b/triton_models/weights/layers.22.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a792b8d14741661477851bbe77b6f5dc4fecf7ce07009fb7d6bd25090b2ad2b +size 16 diff --git a/triton_models/weights/layers.23.attention.w_qkv.0.qweight b/triton_models/weights/layers.23.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3c4b6c3a2d7fa4c456839afe2c5df63b4801cf29 --- /dev/null +++ b/triton_models/weights/layers.23.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a664f7c9133d9a3d3f013ae68b7c826124f0ce8ee3e2a8b7a3d412fc4ce18c +size 12582912 diff --git a/triton_models/weights/layers.23.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.23.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..6980321a22d78892613c341246abfd4fa6a6ec1b --- /dev/null +++ b/triton_models/weights/layers.23.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d1caf7d6d040d5052d79ec08aa4282d486d3fd63e54ce73293b62776d97cc01 +size 786432 diff --git a/triton_models/weights/layers.23.attention.wo.0.qweight b/triton_models/weights/layers.23.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a959f9c51c2010dee1865544214aa31aca8e384b --- /dev/null +++ b/triton_models/weights/layers.23.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019ccc843a3257c4a7b36900f96de821382e2847851af142ae89a9238b434b20 +size 8388608 diff --git a/triton_models/weights/layers.23.attention.wo.0.scales_zeros b/triton_models/weights/layers.23.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..63ad5cf1b74567dc10825bf3797cef1aeaf45b20 --- /dev/null +++ b/triton_models/weights/layers.23.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80a82f597426b697fe58ed646f41dd9a6f4514d8d93e7f2791fac932dac100ca +size 524288 diff --git a/triton_models/weights/layers.23.attention_norm.weight b/triton_models/weights/layers.23.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..95ac563b56807e330af49708f5e09a5b5d763971 --- /dev/null +++ b/triton_models/weights/layers.23.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d621b52a30d8a04c1866972255522c844eebd9f0b57ee2b90fd4f8e5e7ba07a +size 8192 diff --git a/triton_models/weights/layers.23.feed_forward.w13.0.qweight b/triton_models/weights/layers.23.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..070dac5924104453edc840b81f83c3af7c79534c --- /dev/null +++ b/triton_models/weights/layers.23.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95a18e90a00cd47b6fce45cb8c1eeedb6ec2b8fed6f0cd8de85f36cfd5dedee +size 58720256 diff --git a/triton_models/weights/layers.23.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.23.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..53c5e980f8815c039d907e5466820c61f9d1076c --- /dev/null +++ b/triton_models/weights/layers.23.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ae6d90f0468717c0bf1b22ab4914319697011c4ee53f13241c0ca1970acc3331 +size 3670016 diff --git a/triton_models/weights/layers.23.feed_forward.w2.0.qweight b/triton_models/weights/layers.23.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..3dbd1908961ec50661072cfe35a0e65123ee0522 --- /dev/null +++ b/triton_models/weights/layers.23.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1036d81bd9d055c59bed34241ec3328c1035676dbcd78a0186946147c58af98b +size 29360128 diff --git a/triton_models/weights/layers.23.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.23.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..377898876f13249c94c85b69c632e4edbf89ca0d --- /dev/null +++ b/triton_models/weights/layers.23.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f354eef95b3a2007598e99428488351bc81e825cc08c8a22beea2a74432f0e91 +size 1835008 diff --git a/triton_models/weights/layers.23.ffn_norm.weight b/triton_models/weights/layers.23.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6034309e63a873c266790385d8a50379dff8c851 --- /dev/null +++ b/triton_models/weights/layers.23.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36a712b30e1f4b920e2bf0e553bf62898650a968b94cb544d4c0cb45dd9724ba +size 8192 diff --git a/triton_models/weights/layers.23.past_kv_scale.0.weight b/triton_models/weights/layers.23.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..2054dd9b5bac4cc5f3947a6a29b0a00ee9c8f9c6 --- /dev/null +++ b/triton_models/weights/layers.23.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:362bc48a1da392c1d9c1404743b87e700f048e91e2236c0f23136126cbd17a42 +size 16 diff --git a/triton_models/weights/layers.24.attention.w_qkv.0.qweight b/triton_models/weights/layers.24.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..90ca332aa05b52f6a6c1174451a057235aeec1f3 --- /dev/null +++ b/triton_models/weights/layers.24.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5cb069457b3e48f9401929077bc5a44b988b7741941ed8157cf23fc0af8fa2 +size 12582912 diff --git a/triton_models/weights/layers.24.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.24.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..c424c3a6af59cdb2e6cd3d2acdd6fa6b8585e46b --- /dev/null +++ b/triton_models/weights/layers.24.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47c34802342bd2a02dc98d311924169d7abdc703e43279cffdcf1422243038d +size 786432 diff --git a/triton_models/weights/layers.24.attention.wo.0.qweight b/triton_models/weights/layers.24.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..183cbc95eb079e344c88e1fa4774f568a66dbbd9 --- /dev/null +++ b/triton_models/weights/layers.24.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6468f6b524dabe33d4487522c605b92a5c91eaaa9d6b39433dd31588bfd09215 +size 8388608 diff --git a/triton_models/weights/layers.24.attention.wo.0.scales_zeros b/triton_models/weights/layers.24.attention.wo.0.scales_zeros new file mode 100644 index 
0000000000000000000000000000000000000000..c435ad2044cc72cc87bf58ea590aea7b6e463349 --- /dev/null +++ b/triton_models/weights/layers.24.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59fa63a2023ffc20a936686267ae08fe6c793889ca330e0fb0a44ab2b5fe8041 +size 524288 diff --git a/triton_models/weights/layers.24.attention_norm.weight b/triton_models/weights/layers.24.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..dccff49fb462091aab55a0c4eb163652123ff7d5 --- /dev/null +++ b/triton_models/weights/layers.24.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d38dd18c9fe84631f30cb2b7cb92efc25473d4ba1c438a7817690ed3bbaabd8 +size 8192 diff --git a/triton_models/weights/layers.24.feed_forward.w13.0.qweight b/triton_models/weights/layers.24.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f0bea0526b3fe332953eeee191fd4d279f3a8286 --- /dev/null +++ b/triton_models/weights/layers.24.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db478db4b91a673763d0252f233423fa31c7a562f80cbc6c106931886d56e253 +size 58720256 diff --git a/triton_models/weights/layers.24.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.24.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d233c239c539161b7c5f0b5f890f196d9c544c2 --- /dev/null +++ b/triton_models/weights/layers.24.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5329cd85fc6390d7fc596abdb5907e3c2576c2fb6fc87d7c0dc2dbae326a826 +size 3670016 diff --git a/triton_models/weights/layers.24.feed_forward.w2.0.qweight b/triton_models/weights/layers.24.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d4c99dfed4f5fd009c04c0693ddd1253dadfb80e --- /dev/null +++ b/triton_models/weights/layers.24.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78e4b556d2c58615b1f3bcbfe8780a1217bc0420383b55afbf6767315ca09e66 +size 29360128 diff --git a/triton_models/weights/layers.24.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.24.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8d61abbf087e7f17d99482529ceb6649e5f98e4b --- /dev/null +++ b/triton_models/weights/layers.24.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9861b1f0dcf30259bc7a9d1c02969f271b805981c696d49b1dcdd939a7ff504b +size 1835008 diff --git a/triton_models/weights/layers.24.ffn_norm.weight b/triton_models/weights/layers.24.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..a5247850bcab46ee044a136c8ca64f1223e6f1a7 --- /dev/null +++ b/triton_models/weights/layers.24.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64ff3faab2a3c58cde1f351d57bef281660b552a9dbb9c0aa49bff00dcd6719 +size 8192 diff --git a/triton_models/weights/layers.24.past_kv_scale.0.weight b/triton_models/weights/layers.24.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..3a9a25a5c3ba55692571909bb40b460b6ed82ade --- /dev/null +++ b/triton_models/weights/layers.24.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1d2ab419befc2e7b0391b3b7e7bfa13bf728db0d6cba53136aedc0802a4fcc8c +size 16 diff --git a/triton_models/weights/layers.25.attention.w_qkv.0.qweight b/triton_models/weights/layers.25.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..41c3344f95ab3594af8a3648d644979c8b8a3e84 --- /dev/null +++ b/triton_models/weights/layers.25.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0971d51d3ac5fa3cb80bf7adb2616878c3921d6810a7b8c312f2c5edfc20ba2b +size 12582912 diff --git a/triton_models/weights/layers.25.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.25.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..380f67b6fde572f2eecd73076b154bb56c631ceb --- /dev/null +++ b/triton_models/weights/layers.25.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd9d2322fc1ac860eeeb0ae4f57b15011ca5728cab0c2de14ad0734c813b1070 +size 786432 diff --git a/triton_models/weights/layers.25.attention.wo.0.qweight b/triton_models/weights/layers.25.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..58a080a5403fbc6975a8c92d3d8890d106c41f32 --- /dev/null +++ b/triton_models/weights/layers.25.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42757d1b84d12da08d617496b557df5dc43260ad03444559342e57effdeff897 +size 8388608 diff --git a/triton_models/weights/layers.25.attention.wo.0.scales_zeros b/triton_models/weights/layers.25.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a623dfbef7759c22ba42888f23b6af5e7c88703c --- /dev/null +++ b/triton_models/weights/layers.25.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc49597aa705026d30a172bcee0421ded59135ee57d2d1a38d511274fd00db51 +size 524288 diff --git a/triton_models/weights/layers.25.attention_norm.weight b/triton_models/weights/layers.25.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..e330398be316b3c7d2b4e8091847c876352631d0 --- /dev/null +++ b/triton_models/weights/layers.25.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f978aa26bb24bbd527a1e949719d548e1c7bf7d30f04b02f0f28d1343053132 +size 8192 diff --git a/triton_models/weights/layers.25.feed_forward.w13.0.qweight b/triton_models/weights/layers.25.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..941b657818aee3d6c553e08ef74566cd98e55321 --- /dev/null +++ b/triton_models/weights/layers.25.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:063a4b6c0bb854f67986762bafa9651778da009fd725fe723fa47306a99a845f +size 58720256 diff --git a/triton_models/weights/layers.25.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.25.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4df2b6e64935f05f8ec6ea3db6b9723c6ca0a7bd --- /dev/null +++ b/triton_models/weights/layers.25.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4a77dbd2274b6de3cfb89254d1cb2c0af54b304bb9134a280cbe9b620a361a9 +size 3670016 diff --git a/triton_models/weights/layers.25.feed_forward.w2.0.qweight b/triton_models/weights/layers.25.feed_forward.w2.0.qweight new file mode 100644 index 
0000000000000000000000000000000000000000..a2a36f211eb8cebc2e1ce26bbd4bcd9a806cee31 --- /dev/null +++ b/triton_models/weights/layers.25.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1626e0d17ba4f05b0f1e65537f46ada22bef2d00deb136c30dd6bb481b617d58 +size 29360128 diff --git a/triton_models/weights/layers.25.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.25.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..09e7a5b567087d78bfcd3614b11b21106f5f8f59 --- /dev/null +++ b/triton_models/weights/layers.25.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d9b0e50a31c6c29d57500a64edf731ea04db50967219bfdcb0853730c574333 +size 1835008 diff --git a/triton_models/weights/layers.25.ffn_norm.weight b/triton_models/weights/layers.25.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..026c4beed926345148e983d57a1eb89a25c4fd1c --- /dev/null +++ b/triton_models/weights/layers.25.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0eea4a26418b7a503c71abf443da9d784c2adca6551e4f1b998f94d6145d696 +size 8192 diff --git a/triton_models/weights/layers.25.past_kv_scale.0.weight b/triton_models/weights/layers.25.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..67871afaf8d1df47fbde1f4a65674ded07d4a864 --- /dev/null +++ b/triton_models/weights/layers.25.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cad249894548c60911d6d65a7d5846938c1e479698b4466d4cc6e03d2444922 +size 16 diff --git a/triton_models/weights/layers.26.attention.w_qkv.0.qweight b/triton_models/weights/layers.26.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..8e3258b77728a5579d15c2a374b61be41a2afa09 --- /dev/null +++ b/triton_models/weights/layers.26.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3b88ded4b32bf8ff5ab7fa3616ab98f1bfea6fd86f37b729ad69ffe89d33e97 +size 12582912 diff --git a/triton_models/weights/layers.26.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.26.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..cb16882090f73a8651b55899be0c7b66b7d89aef --- /dev/null +++ b/triton_models/weights/layers.26.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1303373a67371e1e2f3ed25bc8cd8e559b9503bc5b4fdc37bfaf758cd26acfb3 +size 786432 diff --git a/triton_models/weights/layers.26.attention.wo.0.qweight b/triton_models/weights/layers.26.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f65b33bea38f966cd6cd26980998df21898fad28 --- /dev/null +++ b/triton_models/weights/layers.26.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da07e11c5ce840df7eaa7de1ddff66356a2995b93b6d1cdefe1d96f6d4eb62a6 +size 8388608 diff --git a/triton_models/weights/layers.26.attention.wo.0.scales_zeros b/triton_models/weights/layers.26.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e34f9fbc1e33e117eb223353e64a0d03c3a1ce09 --- /dev/null +++ b/triton_models/weights/layers.26.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ec446a339a8b88e9d35b0feb0dc82c82f64420cc45aa67b0730bc6fdfeb33b24 +size 524288 diff --git a/triton_models/weights/layers.26.attention_norm.weight b/triton_models/weights/layers.26.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..bd89d7d2bb2a10e4537def6bc6550ddf681db645 --- /dev/null +++ b/triton_models/weights/layers.26.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:452e37de79706d39a7fddbbd901e8353363bb41bb1178eebb42b0a9aad1998fc +size 8192 diff --git a/triton_models/weights/layers.26.feed_forward.w13.0.qweight b/triton_models/weights/layers.26.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..ef1f200bdb37b79404804e211dddd09441a90cfb --- /dev/null +++ b/triton_models/weights/layers.26.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fac2317afed02f28c9f68eae5e04821f1fea2d7553bd4ce30b68b9a7e896be65 +size 58720256 diff --git a/triton_models/weights/layers.26.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.26.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3613b7754b7de11bd7146b2f99bbb2aabad43346 --- /dev/null +++ b/triton_models/weights/layers.26.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e892079f260d62e05e5169a508c1b50c3beffc1e568e189b358850a9596863ac +size 3670016 diff --git a/triton_models/weights/layers.26.feed_forward.w2.0.qweight b/triton_models/weights/layers.26.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..42508b0d05c03cfe54875df80e5848f92e3a2148 --- /dev/null +++ b/triton_models/weights/layers.26.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b2ab3bee38aee899c1454a69dc424ae61b6d14d67438c307369be02f6460085 +size 29360128 diff --git a/triton_models/weights/layers.26.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.26.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..6078af07ebbfebda87b1016fd58cdcffbb0b4c73 --- /dev/null +++ b/triton_models/weights/layers.26.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:552933cb4c5ad88c47fcfc8c8982e8a9d6c2bcf4975d0a1ff17f85a0de9a72a0 +size 1835008 diff --git a/triton_models/weights/layers.26.ffn_norm.weight b/triton_models/weights/layers.26.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..659727ca29164c591b4db04c441375c79e981fce --- /dev/null +++ b/triton_models/weights/layers.26.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a474d6dce328dea51c94d84fde68d4472d68dbbf19ce347181b5956b98d41847 +size 8192 diff --git a/triton_models/weights/layers.26.past_kv_scale.0.weight b/triton_models/weights/layers.26.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..db316b10f011519fdc39c70e40706bb6499001f4 --- /dev/null +++ b/triton_models/weights/layers.26.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d995b27407d7307c6a5b4a4fa7f6247eac5d8c1cc62c066c9bd4395d0455a939 +size 16 diff --git a/triton_models/weights/layers.27.attention.w_qkv.0.qweight b/triton_models/weights/layers.27.attention.w_qkv.0.qweight new file mode 100644 index 
0000000000000000000000000000000000000000..2b398a0b63fe43f5bd6467e9001673b60b3d8b76 --- /dev/null +++ b/triton_models/weights/layers.27.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb11cc9d2229d99f45200d53d2430007eca65a120d988a8ace070a0e3754128 +size 12582912 diff --git a/triton_models/weights/layers.27.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.27.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..36269d2bb210deac5bfb20fc68c3a3c0ba2430d9 --- /dev/null +++ b/triton_models/weights/layers.27.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b885790c722268908e56129344337198b0c0e4b3bf5e21a7f091d0846a5d30 +size 786432 diff --git a/triton_models/weights/layers.27.attention.wo.0.qweight b/triton_models/weights/layers.27.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..75c54cf768728053f1051c6d1260296c943bc2cd --- /dev/null +++ b/triton_models/weights/layers.27.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46493db19a5dc9a8d01151f769f22f10733969cad257ff2372fe9ef169efdc7 +size 8388608 diff --git a/triton_models/weights/layers.27.attention.wo.0.scales_zeros b/triton_models/weights/layers.27.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..91523912e1e6240ee472d551a8422724c7f9396f --- /dev/null +++ b/triton_models/weights/layers.27.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f80605e605d11e0f5a9e470c80c72859f9651f99f3db043b9eab3989fffd647 +size 524288 diff --git a/triton_models/weights/layers.27.attention_norm.weight b/triton_models/weights/layers.27.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..92e464dfb802dd2cde189e137b6e908acaec5c38 --- /dev/null +++ b/triton_models/weights/layers.27.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b48e7db8fe774bd46f4eecc92ef7f6bde3cb8e3ba66836e6cae00572ea0e14e +size 8192 diff --git a/triton_models/weights/layers.27.feed_forward.w13.0.qweight b/triton_models/weights/layers.27.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e7392da13e07a3f00396eb1965e2c22daece98a8 --- /dev/null +++ b/triton_models/weights/layers.27.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a634ce6c3f2743a5e0fa245a0adf32df70a41dc7c969d40b1a3197f0436cdf5 +size 58720256 diff --git a/triton_models/weights/layers.27.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.27.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4fadfc7e45425848c37d17c3f39ffbbb822a8c78 --- /dev/null +++ b/triton_models/weights/layers.27.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc138f3c7e31e1be2b6e2a57d7d5a2ffab4fa52343122dd272e41ac4bfd9096e +size 3670016 diff --git a/triton_models/weights/layers.27.feed_forward.w2.0.qweight b/triton_models/weights/layers.27.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..aae88c0abda360c16b47ef75abda1c4077edf25e --- /dev/null +++ b/triton_models/weights/layers.27.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c9052da467e48c0c4138fd3769e456cb753464bb30a03a4942846a5b3877131f +size 29360128 diff --git a/triton_models/weights/layers.27.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.27.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3b2fa2b516a8c83d6eed1702e517e005ac19f281 --- /dev/null +++ b/triton_models/weights/layers.27.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1f67441bf5d4f5ca51f1f289e07a3c59907d324265741f76ad966bf1755749 +size 1835008 diff --git a/triton_models/weights/layers.27.ffn_norm.weight b/triton_models/weights/layers.27.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..c15c40329868b970cca611aff6e2bbe13d48abf0 --- /dev/null +++ b/triton_models/weights/layers.27.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fda3309eb353c9341280ab8f2a516011494cba8b769560e91cd0c9d27fc6561 +size 8192 diff --git a/triton_models/weights/layers.27.past_kv_scale.0.weight b/triton_models/weights/layers.27.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..d8710f2aebc08c7c65db4a66ef9daeba362df5ce --- /dev/null +++ b/triton_models/weights/layers.27.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2367dba495b15a673a5e8f907f19e98254caa8845195d88897b3ecc36d7c794 +size 16 diff --git a/triton_models/weights/layers.28.attention.w_qkv.0.qweight b/triton_models/weights/layers.28.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..11c1eafa7f15149287cd144977ef8e5a42645397 --- /dev/null +++ b/triton_models/weights/layers.28.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1f9e7857882c7a56236572f8a03d72222b257c8d9ed6e2efa1d66c6b5e21fb1 +size 12582912 diff --git a/triton_models/weights/layers.28.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.28.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f725cdf5914a0af48485baa5a948fb90c3030913 --- /dev/null +++ b/triton_models/weights/layers.28.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da00a72b006477cacf5f86157b6206faefb0b9a1945fed4e5f2a2f9fc9846f55 +size 786432 diff --git a/triton_models/weights/layers.28.attention.wo.0.qweight b/triton_models/weights/layers.28.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..294eeaef86a93508f7f8b171fb8a303bcfb5602c --- /dev/null +++ b/triton_models/weights/layers.28.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626eff3b0dc5215c6954f774fc8116aa989824ab9c971a3782d8bce5ad31d0a8 +size 8388608 diff --git a/triton_models/weights/layers.28.attention.wo.0.scales_zeros b/triton_models/weights/layers.28.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..90a1002de820fee0fabb5d5081cde6d434fa08dc --- /dev/null +++ b/triton_models/weights/layers.28.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5faf82a3313ab0b53237e677fa72b3b44137a47ab5f26d401a3bf43f5beb1bd8 +size 524288 diff --git a/triton_models/weights/layers.28.attention_norm.weight b/triton_models/weights/layers.28.attention_norm.weight new file mode 100644 index 
0000000000000000000000000000000000000000..1ec94894ca9c51e452e351065e83a91a22a1d264 --- /dev/null +++ b/triton_models/weights/layers.28.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac4a8732ba2c28970db1dc7e821bd6c8b0e4de12f8de1b6bc6692840154562a4 +size 8192 diff --git a/triton_models/weights/layers.28.feed_forward.w13.0.qweight b/triton_models/weights/layers.28.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2ad5905fe8ebd68dafedb5c0bbe70d34f3f8c71d --- /dev/null +++ b/triton_models/weights/layers.28.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f375cdf0cd1a60d7c9d00319853242606c44be5322598f91dbff37284f0ab67 +size 58720256 diff --git a/triton_models/weights/layers.28.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.28.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f8676ba3b145e257dc1c75c1f9d9dd86413bc37d --- /dev/null +++ b/triton_models/weights/layers.28.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f57f5b0745ad5281aa67d83c0da6f1ebc7539dff487ae1345761bf995aedb1c +size 3670016 diff --git a/triton_models/weights/layers.28.feed_forward.w2.0.qweight b/triton_models/weights/layers.28.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e3532b664b06cd727ceb44f27462084bddb160c3 --- /dev/null +++ b/triton_models/weights/layers.28.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:393b972c36770d253df01db59d0c889a018a26ec7a18cf1e69617828344e2ed4 +size 29360128 diff --git a/triton_models/weights/layers.28.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.28.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..9cba65bef1506cf3787aac95439d21334e5424fa --- /dev/null +++ b/triton_models/weights/layers.28.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f4650f45c05fbd9d52eade717d47d32b1127ad57db10133ba490f5af3843551 +size 1835008 diff --git a/triton_models/weights/layers.28.ffn_norm.weight b/triton_models/weights/layers.28.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..0a50537a8d1863c6ea2bf1177d91c15f67d42dec --- /dev/null +++ b/triton_models/weights/layers.28.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ab58696d625c79d618dd907bbeefb29dcb441a358411ed99c0f88e8649e74b +size 8192 diff --git a/triton_models/weights/layers.28.past_kv_scale.0.weight b/triton_models/weights/layers.28.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..335aa2710f889028753142ad7c1c770b5aaece8c --- /dev/null +++ b/triton_models/weights/layers.28.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be67c63310802e47b331969149928657a52d9caadc4dcd0599f0ed63fa8fe4c3 +size 16 diff --git a/triton_models/weights/layers.29.attention.w_qkv.0.qweight b/triton_models/weights/layers.29.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f7fb2a0c283d5309b0acac81e3f78bf535e119e0 --- /dev/null +++ b/triton_models/weights/layers.29.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:964846927bb91f85e501fe1626e8958dba12656845d1c2963d6f0d31ba0e6fe9 +size 12582912 diff --git a/triton_models/weights/layers.29.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.29.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..e4616ace3831b1353261ce821a222788574a6a7e --- /dev/null +++ b/triton_models/weights/layers.29.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59389b1002ea4286ef68d6a28a48de0070a8fe63bb33881a4ea5b4d4824b586a +size 786432 diff --git a/triton_models/weights/layers.29.attention.wo.0.qweight b/triton_models/weights/layers.29.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c339b504ad1ca7893a586fe0fbab27e0414733d4 --- /dev/null +++ b/triton_models/weights/layers.29.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a9f306da7ef17418be8aa9f47f97e653aeab2c155aaf1f32ea93c6e3e424c19 +size 8388608 diff --git a/triton_models/weights/layers.29.attention.wo.0.scales_zeros b/triton_models/weights/layers.29.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..880d7d9c3c95158609d1215b2f6bba14a3a6c655 --- /dev/null +++ b/triton_models/weights/layers.29.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1370f068209c9ab1f42b6657508b06a3511d1d2d8d2c5b5988f4d58591d40279 +size 524288 diff --git a/triton_models/weights/layers.29.attention_norm.weight b/triton_models/weights/layers.29.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..dc3408e864d2f349f03d2ea9f976241c0dd4ae19 --- /dev/null +++ b/triton_models/weights/layers.29.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0136d8df649cc27c395128240a43f899929866414704347f851202cc638b9ec0 +size 8192 diff --git a/triton_models/weights/layers.29.feed_forward.w13.0.qweight b/triton_models/weights/layers.29.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..12bd5dfc4141909486de6f81eb5de2cd0541f243 --- /dev/null +++ b/triton_models/weights/layers.29.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90f34915975f77f41c0057ec1ddc7e83098a74c6efe44d5cfcbd6252f7483773 +size 58720256 diff --git a/triton_models/weights/layers.29.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.29.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..92ba76313e8ccbbbbf563a230bc24e60c122fbbb --- /dev/null +++ b/triton_models/weights/layers.29.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56814e27f2fc6ea900d3623c77d1df558ea69fe154c99fe57fd45b6567a62186 +size 3670016 diff --git a/triton_models/weights/layers.29.feed_forward.w2.0.qweight b/triton_models/weights/layers.29.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..850b76dcf051ec7876aa7626f2aee3c02df70a73 --- /dev/null +++ b/triton_models/weights/layers.29.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e520a4a76d63d5f4cfad6bb9577ab1343c24d563ee6491b0120e8b8f605a24 +size 29360128 diff --git a/triton_models/weights/layers.29.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.29.feed_forward.w2.0.scales_zeros new file mode 
100644 index 0000000000000000000000000000000000000000..8d8434eea29d62735d93ec7d3ed91e73a56773a5 --- /dev/null +++ b/triton_models/weights/layers.29.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a45ecef0ec7bb53ccdd1499338dfc1590c5b4d4e64ca01119d8e2eac40c5249 +size 1835008 diff --git a/triton_models/weights/layers.29.ffn_norm.weight b/triton_models/weights/layers.29.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..cf3ccd85ec2a836282f95d8ffa96f001a6c78bfb --- /dev/null +++ b/triton_models/weights/layers.29.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80000d50b78aad7b0076bc159838fbc0e679d1b07aa00f374142e40c5fcbba01 +size 8192 diff --git a/triton_models/weights/layers.29.past_kv_scale.0.weight b/triton_models/weights/layers.29.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..135fea41df0db406183c0c705ee1bf4e15b3d938 --- /dev/null +++ b/triton_models/weights/layers.29.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2334dc6b4e2acee8b2c60625419023d8b5cb9692341970a8cb0cb0950658940d +size 16 diff --git a/triton_models/weights/layers.3.attention.w_qkv.0.qweight b/triton_models/weights/layers.3.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..741f2dbe9906898116ac1c0bcf6b6f1305ac0c7d --- /dev/null +++ b/triton_models/weights/layers.3.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b023e843f1b897e2768f8aa9d1f18e1a2fcb8a17ee904981117c3822cafda263 +size 12582912 diff --git a/triton_models/weights/layers.3.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.3.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..58882890a176f4e5d124ddfbdce381fc920d5b9d --- /dev/null +++ b/triton_models/weights/layers.3.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02c5a27de7ab84dc800a722021cefc12233818ba708f7ef20abed96d1efa3b29 +size 786432 diff --git a/triton_models/weights/layers.3.attention.wo.0.qweight b/triton_models/weights/layers.3.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..28835af03e975d2a253d1b43e9094dcef5665859 --- /dev/null +++ b/triton_models/weights/layers.3.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:961c0e6293f13ca0eb880f274fcf96b1394f554b645856d99f898ae03ba05ab1 +size 8388608 diff --git a/triton_models/weights/layers.3.attention.wo.0.scales_zeros b/triton_models/weights/layers.3.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4941d02a83a0dab878ad6795511df8e08e216ce0 --- /dev/null +++ b/triton_models/weights/layers.3.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6a94458f402b8342d3936d5c436bcc1125e642d5216c1cf70ad7850d134dbdf +size 524288 diff --git a/triton_models/weights/layers.3.attention_norm.weight b/triton_models/weights/layers.3.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..fee571b50c58b11c6d17e7daaf1a1796af101e8a --- /dev/null +++ b/triton_models/weights/layers.3.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e702523cc2696abf9ea5f86ca0c3b8110cbc92f9074f3573cd0935519da7f326 +size 8192 
diff --git a/triton_models/weights/layers.3.feed_forward.w13.0.qweight b/triton_models/weights/layers.3.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6576fcc897f882a63b4376d2366b8a16b75529b2 --- /dev/null +++ b/triton_models/weights/layers.3.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec59414d327ec0ca8adf200f8593102b1cbef09d5a97e88f7e6f3d1d941e32d7 +size 58720256 diff --git a/triton_models/weights/layers.3.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.3.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..39bfc8b9158d17ace10985a0aefa5ed9b27c830f --- /dev/null +++ b/triton_models/weights/layers.3.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:592014759039919238673a2d601e2d397b3eb60f2b684d06201310dc35e6f870 +size 3670016 diff --git a/triton_models/weights/layers.3.feed_forward.w2.0.qweight b/triton_models/weights/layers.3.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..a2dc182c2e093651d77ac65087453506558cc6df --- /dev/null +++ b/triton_models/weights/layers.3.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c67555a8eae4e6cc55420ec37ea21933418f802190fc809bb33855011f8ec82a +size 29360128 diff --git a/triton_models/weights/layers.3.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.3.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b12f9eae6cb382f2ef562f1e7dad7d8f2c7f4f48 --- /dev/null +++ b/triton_models/weights/layers.3.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8d6409835e70b1c0fdf81979b61995fb90f43381277f9e457070df5a91229c +size 1835008 diff --git a/triton_models/weights/layers.3.ffn_norm.weight b/triton_models/weights/layers.3.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..1ac16014018db6a631b37da0836ea438c9d2fdaa --- /dev/null +++ b/triton_models/weights/layers.3.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b485c2892ea53a76f21e84c2ed42436b05a41f5dab146fab77f25d2b506ae53 +size 8192 diff --git a/triton_models/weights/layers.3.past_kv_scale.0.weight b/triton_models/weights/layers.3.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..86f8adc521ad298ee51185ebf02afa53325facc9 --- /dev/null +++ b/triton_models/weights/layers.3.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76bf77db19b1d0234ee2da545c98ee3d5921030e6deaa8b2742d4e9d400d7207 +size 16 diff --git a/triton_models/weights/layers.30.attention.w_qkv.0.qweight b/triton_models/weights/layers.30.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..087b322573894903eb8e5cf81dc0e4962ccbb4bb --- /dev/null +++ b/triton_models/weights/layers.30.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b085323586c5f61228e43ec3cf935799c983d169abd417a55a6c3f82cd255a1 +size 12582912 diff --git a/triton_models/weights/layers.30.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.30.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..de17498ac115e410694314f9e590322ecc3140ef --- /dev/null +++ 
b/triton_models/weights/layers.30.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:602a6e94ab5a7bda70167414ea1e71c46be0e7b46a69689d093f991dc6930079 +size 786432 diff --git a/triton_models/weights/layers.30.attention.wo.0.qweight b/triton_models/weights/layers.30.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..e9eddf6db391e55430e3ca4f04fc6966cdb3bc10 --- /dev/null +++ b/triton_models/weights/layers.30.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5864869bc2f57778cafb236ed45dbcacce36836e1c8b3dd94fd1375829174baa +size 8388608 diff --git a/triton_models/weights/layers.30.attention.wo.0.scales_zeros b/triton_models/weights/layers.30.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f810acf8fcee1cdadd5b34adde32f9c37b177343 --- /dev/null +++ b/triton_models/weights/layers.30.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c899fc162f4dbec0809e3059f9ed0ba9d3004a75d31841ade9aaf16df93493e +size 524288 diff --git a/triton_models/weights/layers.30.attention_norm.weight b/triton_models/weights/layers.30.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..ad23a4893d3cffe2d398058b89dc78f528c91053 --- /dev/null +++ b/triton_models/weights/layers.30.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683f799d6ecb59ef5b47ee78d4d1653b6a49da4dc6c6865734f2832457ad888e +size 8192 diff --git a/triton_models/weights/layers.30.feed_forward.w13.0.qweight b/triton_models/weights/layers.30.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..b61119e589e6b7759f74e927ba8c5a5286eb965f --- /dev/null +++ b/triton_models/weights/layers.30.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb97c170f0415eeb563dfaab343a6b7c736fb302b605cf65ac29e190d485f03a +size 58720256 diff --git a/triton_models/weights/layers.30.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.30.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3f892216a36905289e63b4b93c0eaf050e7acc02 --- /dev/null +++ b/triton_models/weights/layers.30.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:debf89602b57cf687b1f434d484beefd647c3ea0e8305484658248c8238a347f +size 3670016 diff --git a/triton_models/weights/layers.30.feed_forward.w2.0.qweight b/triton_models/weights/layers.30.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..d0743b7b13a262d47d3c95ff5f00bcf70dca3937 --- /dev/null +++ b/triton_models/weights/layers.30.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00143d530f528cfdded636568772b1ac564990d10d52c943463e8198b0f45b22 +size 29360128 diff --git a/triton_models/weights/layers.30.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.30.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..649ffe4f3c74051e77a62d2bd111b1c8956635a4 --- /dev/null +++ b/triton_models/weights/layers.30.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6881934dda1754f8b7bdb5619bed9e9ec7cd819080a5080d36c545274e7563bd +size 1835008 diff --git 
a/triton_models/weights/layers.30.ffn_norm.weight b/triton_models/weights/layers.30.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..19611f78c82d05c2fa778fc4099462db96768018 --- /dev/null +++ b/triton_models/weights/layers.30.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c07830c7b5e53981d0d97e28af650885ba42b1395e88e2a8b553c080258be805 +size 8192 diff --git a/triton_models/weights/layers.30.past_kv_scale.0.weight b/triton_models/weights/layers.30.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..ebf0f2ce5ad46a9897b292cf74ea4074253d9e00 --- /dev/null +++ b/triton_models/weights/layers.30.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a7079eaefe501289467f67ff3ec35deb358c17022eff2a2d77c011d87a7485 +size 16 diff --git a/triton_models/weights/layers.31.attention.w_qkv.0.qweight b/triton_models/weights/layers.31.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..33f1f7e919ab93f0f093697cc6564c8041cf7c9a --- /dev/null +++ b/triton_models/weights/layers.31.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42e8c9373e34e9f38c5aa5b7f9e7282f283dd138fa488699361a998289d4f0b8 +size 12582912 diff --git a/triton_models/weights/layers.31.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.31.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..51b423248b2e8762a232cb9f6524cc2d2882e6a1 --- /dev/null +++ b/triton_models/weights/layers.31.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e74870d817de1f15c0b372de19d9049754192d574290aa47cc2da4114e02fbe3 +size 786432 diff --git a/triton_models/weights/layers.31.attention.wo.0.qweight b/triton_models/weights/layers.31.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..7976fa7add831d946d9634761ff8db4d07f69a6b --- /dev/null +++ b/triton_models/weights/layers.31.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:882c11872607c376a08d0e7ab4025ebae8050ca0a958b4678fa7c5f5fe34af8c +size 8388608 diff --git a/triton_models/weights/layers.31.attention.wo.0.scales_zeros b/triton_models/weights/layers.31.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..87b74517a018f5d65e974fc575140a80f0cf2f63 --- /dev/null +++ b/triton_models/weights/layers.31.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:780d8a3fc0d41d7e42ab7524e0e8eb3a5044627584cb749954a08d74e8889cc2 +size 524288 diff --git a/triton_models/weights/layers.31.attention_norm.weight b/triton_models/weights/layers.31.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..9e1759f5a7b8ce3bcbdf54ac4a167aa2a3836eeb --- /dev/null +++ b/triton_models/weights/layers.31.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13b79fca3496315c35d45be930b96ac34c0616ae9bb69018d41d4fe7d77fa1c3 +size 8192 diff --git a/triton_models/weights/layers.31.feed_forward.w13.0.qweight b/triton_models/weights/layers.31.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..fa724a72baf441d9817165d242ae54e77b819e7d --- /dev/null +++ b/triton_models/weights/layers.31.feed_forward.w13.0.qweight @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d6490623b97868d9d81417ecbbc40bbcf24f872882ca23b74a76f6f384082cd +size 58720256 diff --git a/triton_models/weights/layers.31.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.31.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..4e046750532412be4588ab28e7285c8f68bccf2f --- /dev/null +++ b/triton_models/weights/layers.31.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b03dd848d3c92adda40904bb369f812d1a2de1d72e53600bdf89cf3002aa5e4 +size 3670016 diff --git a/triton_models/weights/layers.31.feed_forward.w2.0.qweight b/triton_models/weights/layers.31.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..7954c17e1c4aac980fc31bc92786998b66007879 --- /dev/null +++ b/triton_models/weights/layers.31.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f491d3ff06bae3646c8cabbf8c8b6e14963e909e5a3f2cadd84931bb1acc076 +size 29360128 diff --git a/triton_models/weights/layers.31.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.31.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..1f95fe4038958211cbda9224b4161cae99e0c2e5 --- /dev/null +++ b/triton_models/weights/layers.31.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7895c436da989422f207c0631685485aada8b0cf45d0db3bbf0cb18b8573d8f4 +size 1835008 diff --git a/triton_models/weights/layers.31.ffn_norm.weight b/triton_models/weights/layers.31.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..491eadebff5c76dbdda444c927fd0bb153d54dbd --- /dev/null +++ b/triton_models/weights/layers.31.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b010068e8df791fcfd32ddefe46198f72adc5cb104f59512820541ed232ed52 +size 8192 diff --git a/triton_models/weights/layers.31.past_kv_scale.0.weight b/triton_models/weights/layers.31.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..9ed6ce58e195ff81f658649f8fbf99311dad0183 --- /dev/null +++ b/triton_models/weights/layers.31.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcd30ad8a1a6ae548b3b6cdbe2b3693c1d260fcf73e63e4cb201f4ff3a9216e8 +size 16 diff --git a/triton_models/weights/layers.4.attention.w_qkv.0.qweight b/triton_models/weights/layers.4.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9efa7ae8526ee807be03ca3903436c1c4e096b2a --- /dev/null +++ b/triton_models/weights/layers.4.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd04897e691fff067678bfb5826f8c0dae0914c4a822266312a9fd08f9c8dfb9 +size 12582912 diff --git a/triton_models/weights/layers.4.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.4.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b717a0bccf881f43c4dd4849aa9abac991f829b7 --- /dev/null +++ b/triton_models/weights/layers.4.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a4e0a9b4313f6f28361952f5e1c00250e0bc8d8e348238f634679cc9983d4b0 +size 786432 diff --git a/triton_models/weights/layers.4.attention.wo.0.qweight 
b/triton_models/weights/layers.4.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bbc885705f67c282413e4e10b430177fa24c64d1 --- /dev/null +++ b/triton_models/weights/layers.4.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83ef42f037338f04aa63a71554b631e20e2cc1f4c44d0498061891de5d46dfec +size 8388608 diff --git a/triton_models/weights/layers.4.attention.wo.0.scales_zeros b/triton_models/weights/layers.4.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..0dea56a4d1087a93efcf6c1d4c45d4eddcffd41d --- /dev/null +++ b/triton_models/weights/layers.4.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92669ba1e130035258630c4bb58a6ae23088baa4c818edb89d18126368fdd2b1 +size 524288 diff --git a/triton_models/weights/layers.4.attention_norm.weight b/triton_models/weights/layers.4.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..85901d7d4381bcdd1d25c69d8652668e9e82e4d7 --- /dev/null +++ b/triton_models/weights/layers.4.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4392ba124c790351e1e804e3f6954b04df59cabe55918fb2ab208b9fcb1a25d4 +size 8192 diff --git a/triton_models/weights/layers.4.feed_forward.w13.0.qweight b/triton_models/weights/layers.4.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2eecef389220ebcbbb1b399d81d28d5c7123895d --- /dev/null +++ b/triton_models/weights/layers.4.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efab7d32785919b64059b2e20f610eae03ee8a2ba95bcd5c2d786e3074f66875 +size 58720256 diff --git a/triton_models/weights/layers.4.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.4.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..203aad693c83911b91ea533a372c2414914f0c33 --- /dev/null +++ b/triton_models/weights/layers.4.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:624fd673a1cb8d5eed0814f7d0ebcfa6de1f0933f2c808a43fe9915863d06992 +size 3670016 diff --git a/triton_models/weights/layers.4.feed_forward.w2.0.qweight b/triton_models/weights/layers.4.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..22624a1646b9f3bc812053a3e4eccd3aa066e8cc --- /dev/null +++ b/triton_models/weights/layers.4.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a9bc1f9a857eb51f12e913af082a9d065232ad278a46bf3312fee70b57c929 +size 29360128 diff --git a/triton_models/weights/layers.4.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.4.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..ba1d032b1632c72d516bf607d69ef9d858ec3f69 --- /dev/null +++ b/triton_models/weights/layers.4.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f5a160ff8d293e97b6037541c207caf6ea4b15e625bd94dba7be81f1aa3052f +size 1835008 diff --git a/triton_models/weights/layers.4.ffn_norm.weight b/triton_models/weights/layers.4.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..10fdc6cff9055cfb29be992fd58fec67e3a1e156 --- /dev/null +++ b/triton_models/weights/layers.4.ffn_norm.weight @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b7584bdc2460f81e60ad3db90f314b1c3c0bb458b724ad5a8ef2f6b87991871f +size 8192 diff --git a/triton_models/weights/layers.4.past_kv_scale.0.weight b/triton_models/weights/layers.4.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..8ab0548585972c0f9a19539e4f0246ed192f0042 --- /dev/null +++ b/triton_models/weights/layers.4.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:734c894776290dd532cb25f542e38b56c9151c45fb751e1d58f5aba3c1cf86ce +size 16 diff --git a/triton_models/weights/layers.5.attention.w_qkv.0.qweight b/triton_models/weights/layers.5.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..04ab0a16f4f6b5b500d30b4b27152a073d6efffb --- /dev/null +++ b/triton_models/weights/layers.5.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f7240f7f94715ffc2e22da1e1986a7738b3a81d2803a89fa8d467ab37d52f3 +size 12582912 diff --git a/triton_models/weights/layers.5.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.5.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..35b017f6b8442ef2ed28b4f1d7f2aab7e6c8f3d4 --- /dev/null +++ b/triton_models/weights/layers.5.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f30a98755d5e88115a8343930c20bbfd34ef8095694f4c0709b299e0ee587b25 +size 786432 diff --git a/triton_models/weights/layers.5.attention.wo.0.qweight b/triton_models/weights/layers.5.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..4b270cc9d0768c5834bf5dee3db2ae53b9d1a2db --- /dev/null +++ b/triton_models/weights/layers.5.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c2c8b87162bc3f8d4c6044cbbba5bff1a0b4d484418966d683cd8edd5ffe289 +size 8388608 diff --git a/triton_models/weights/layers.5.attention.wo.0.scales_zeros b/triton_models/weights/layers.5.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..2170f6316f894a43c57df7c6f3b6435d6d290e59 --- /dev/null +++ b/triton_models/weights/layers.5.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a0bc293e079e00c8fb29ea166613fb81fc7a51dfae01bda404298bd3541858 +size 524288 diff --git a/triton_models/weights/layers.5.attention_norm.weight b/triton_models/weights/layers.5.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..e56c76ec2f895f4ab09e315bcb026a0cd110898e --- /dev/null +++ b/triton_models/weights/layers.5.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e322bf9e96c707a007b6cf18e95291034a7b4acc28cc9c868ba72a2067f42a4a +size 8192 diff --git a/triton_models/weights/layers.5.feed_forward.w13.0.qweight b/triton_models/weights/layers.5.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c0603e429404aebb532d112009658a498d6a25d2 --- /dev/null +++ b/triton_models/weights/layers.5.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b111a37c3e4700a7ac8bcc755e22baf0cdd205a4f64cce28587b12e6bf542fa5 +size 58720256 diff --git a/triton_models/weights/layers.5.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.5.feed_forward.w13.0.scales_zeros new file mode 100644 index 
0000000000000000000000000000000000000000..54720e241e1c6574c937ac39760a84933da14ee8 --- /dev/null +++ b/triton_models/weights/layers.5.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccbdd88d473982cb63c5daa191f2956e0826feff876c6303ad46054ce474a9f3 +size 3670016 diff --git a/triton_models/weights/layers.5.feed_forward.w2.0.qweight b/triton_models/weights/layers.5.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f69f281b519e24e86576e49e914a3f29b9833837 --- /dev/null +++ b/triton_models/weights/layers.5.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d055b75469902bb480fb2470766fc359100caf6f512e030d846c895cb23501e +size 29360128 diff --git a/triton_models/weights/layers.5.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.5.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..39d27ba627be29fdb76869d39b5a02b38030a6a9 --- /dev/null +++ b/triton_models/weights/layers.5.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf2b8068885689ca049003d3dff4bc8e68b47ddb9be7d7fdd56b39582b7fd61e +size 1835008 diff --git a/triton_models/weights/layers.5.ffn_norm.weight b/triton_models/weights/layers.5.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..8f90bb2bd06c0ff2405bb8ca61c65441dc384653 --- /dev/null +++ b/triton_models/weights/layers.5.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c886bfe39172273f70831164b7b87f48054c0da65cd1724be839673c817009b9 +size 8192 diff --git a/triton_models/weights/layers.5.past_kv_scale.0.weight b/triton_models/weights/layers.5.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..0032439aec9359a437391315477b7201d232b7ba --- /dev/null +++ b/triton_models/weights/layers.5.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b479855806803e6c485764401a2ed76b362ac09f2606a6d58fbba9b134ee186 +size 16 diff --git a/triton_models/weights/layers.6.attention.w_qkv.0.qweight b/triton_models/weights/layers.6.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..08c09cae235117db0cf2be801f075c4236bd6ba2 --- /dev/null +++ b/triton_models/weights/layers.6.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebf9ddd2465c02a1a37bafe82e009127d6cbbcf0bec3b323eece36934bb6eeff +size 12582912 diff --git a/triton_models/weights/layers.6.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.6.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..78b67e25716cf86de09b47dc537db6ec420fd21a --- /dev/null +++ b/triton_models/weights/layers.6.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b39acb9cc4de067c3ef5b0128c253ad0b646756445766d91f2421ca30ab6e272 +size 786432 diff --git a/triton_models/weights/layers.6.attention.wo.0.qweight b/triton_models/weights/layers.6.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..2d2cd5ddae6f67b08f6610fd6bfd8fe17ff43ad7 --- /dev/null +++ b/triton_models/weights/layers.6.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81ad5a0787961305a05ec9b7c0fb89cc2aa70589a36efea39557a8ff33be93c9 +size 
8388608 diff --git a/triton_models/weights/layers.6.attention.wo.0.scales_zeros b/triton_models/weights/layers.6.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..380b6dedbd40afe6240e0271cfd0000ef9f17b01 --- /dev/null +++ b/triton_models/weights/layers.6.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edadc4493b3568ab5ebe758a1aedc2ef5fefcd688f5a78eb1866379967ca1cd6 +size 524288 diff --git a/triton_models/weights/layers.6.attention_norm.weight b/triton_models/weights/layers.6.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..68cf1e82a5f3d60ef2c37bde39437efe411c0263 --- /dev/null +++ b/triton_models/weights/layers.6.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dcd4367593812ecec39d8b1ff7cd21912c1283686db24be488384fd2453162c +size 8192 diff --git a/triton_models/weights/layers.6.feed_forward.w13.0.qweight b/triton_models/weights/layers.6.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..f66c0c431c68905f3cc431d2b266b628bcc1f9b1 --- /dev/null +++ b/triton_models/weights/layers.6.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3cc20446684f9b809fd52c40bda9d32c115789c650575c0e54f5ab030b7ceed +size 58720256 diff --git a/triton_models/weights/layers.6.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.6.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..d158d234d215899f80ded95207cff364e20e0c1d --- /dev/null +++ b/triton_models/weights/layers.6.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f01f13b1cd0cd8080d7c4906d71e44200b8053aa605a37069f1a9e1034a81f93 +size 3670016 diff --git a/triton_models/weights/layers.6.feed_forward.w2.0.qweight b/triton_models/weights/layers.6.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..0bee7d213091341bc193cd21b808a3776987b7dd --- /dev/null +++ b/triton_models/weights/layers.6.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95865a00e74b9d37ba9c21241922979b4f26eb06b78b84b25be12bcfba617657 +size 29360128 diff --git a/triton_models/weights/layers.6.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.6.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..80f3f7257450ba5de9d4dabaa61b516c7c807046 --- /dev/null +++ b/triton_models/weights/layers.6.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0dcaefa2acb86a25aedc25d60558af179bbf8968f1fd023b20343dad73b0184 +size 1835008 diff --git a/triton_models/weights/layers.6.ffn_norm.weight b/triton_models/weights/layers.6.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..b56799656e38d049d14d02b2d7e4ab1e470bac6d --- /dev/null +++ b/triton_models/weights/layers.6.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e458ef7058c9d7734737447072dc2908dea9ebf64a2ebcef932e4d6832057f5b +size 8192 diff --git a/triton_models/weights/layers.6.past_kv_scale.0.weight b/triton_models/weights/layers.6.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..70c460d32701c69c43ce43977e55d4c5e407b1c8 --- /dev/null +++ 
b/triton_models/weights/layers.6.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa3e886e06b35057d676139206ed116fafd8c8dd29244eff07cf1221837e8807 +size 16 diff --git a/triton_models/weights/layers.7.attention.w_qkv.0.qweight b/triton_models/weights/layers.7.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..4bd1b6da8292c5b10b20dbee8e2ee7e95a46637d --- /dev/null +++ b/triton_models/weights/layers.7.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0c4ca025a4e163c0dc2da98d463549125001a9cc93654f37907cce2a9882d52 +size 12582912 diff --git a/triton_models/weights/layers.7.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.7.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..8846088f9a04128c3626ebdde6d6747d1d663587 --- /dev/null +++ b/triton_models/weights/layers.7.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c086c5de28164657905ed6eaed423d6244ae0368c6180aa26fc0a6eb89724a83 +size 786432 diff --git a/triton_models/weights/layers.7.attention.wo.0.qweight b/triton_models/weights/layers.7.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c4891059c086711d0200456b57dc31f93418ba81 --- /dev/null +++ b/triton_models/weights/layers.7.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efcb8926a09d3f78acbff4e19e2e5bafad04172d17321a6af2b4fe7974c40fe1 +size 8388608 diff --git a/triton_models/weights/layers.7.attention.wo.0.scales_zeros b/triton_models/weights/layers.7.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..a08abb8652ecda43c661807290bbefa793fb0160 --- /dev/null +++ b/triton_models/weights/layers.7.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c0cdf8402670c6998b317082c140f0eb51c4bb0b41ca4e6386c6f1648f56a76 +size 524288 diff --git a/triton_models/weights/layers.7.attention_norm.weight b/triton_models/weights/layers.7.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..37c18cd18f7054a248d6352d4d5a25ac9a4175e5 --- /dev/null +++ b/triton_models/weights/layers.7.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28cf5e25d536f7d9180c2eb1d7dcfd7d4bb749816849f75c5e09f0210cdbc417 +size 8192 diff --git a/triton_models/weights/layers.7.feed_forward.w13.0.qweight b/triton_models/weights/layers.7.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9b50669a9dc81bf91e567a299ee57d333907a007 --- /dev/null +++ b/triton_models/weights/layers.7.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0581fd7f812265f9b47b8eab7621664a046c4c6f98279676df767aaf339eee7 +size 58720256 diff --git a/triton_models/weights/layers.7.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.7.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..93d6f40d2e5bcd8b2a2da3d12418121279963070 --- /dev/null +++ b/triton_models/weights/layers.7.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f86e5d5f8bd7d8eded5bf5a5cbefc9b1b3242cdb2b486f6b1b0289d75f4df828 +size 3670016 diff --git 
a/triton_models/weights/layers.7.feed_forward.w2.0.qweight b/triton_models/weights/layers.7.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9d07164c18362f5b0879cc88dbb43ef395f284f2 --- /dev/null +++ b/triton_models/weights/layers.7.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b02b881d979d0fb77a4d705ed4bc68ca58e7cfa84a504d90b9e816ddd99a6b0 +size 29360128 diff --git a/triton_models/weights/layers.7.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.7.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b95f34d475e6c10781aca4639fbcadc9e706fc5a --- /dev/null +++ b/triton_models/weights/layers.7.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0c7e60168198f2ac9347ac8eb4fc59ea42fe0380e24550cd4fa2e989a2d90b4 +size 1835008 diff --git a/triton_models/weights/layers.7.ffn_norm.weight b/triton_models/weights/layers.7.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..7669f396fbea22312892ecc7e69f5847e3e3d0f7 --- /dev/null +++ b/triton_models/weights/layers.7.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bce0233aef9e8401ea7eaddce5b44f2a28b6fd1018023ec3f2cae495f4d205b6 +size 8192 diff --git a/triton_models/weights/layers.7.past_kv_scale.0.weight b/triton_models/weights/layers.7.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..d2b299db6620c0abf87b67b228dd03b696854499 --- /dev/null +++ b/triton_models/weights/layers.7.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae08ed15fa296e998f7e93b866fb5536103b357ca8fd0e8ee44423c4fe3ea4d3 +size 16 diff --git a/triton_models/weights/layers.8.attention.w_qkv.0.qweight b/triton_models/weights/layers.8.attention.w_qkv.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..9a071d9e1c24a362c04a0f4335000d1eeeadbfea --- /dev/null +++ b/triton_models/weights/layers.8.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:312a5231076c36e023c30c18761d4793c7aaf2d1658f740a4ed6fe3ab9fb9532 +size 12582912 diff --git a/triton_models/weights/layers.8.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.8.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..b756258fc2694a8580c1d6d55d73c1aae4f88737 --- /dev/null +++ b/triton_models/weights/layers.8.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:045eb164e9d18487951013b4a69dab786f034139e232a0c079e6c6de0b84d445 +size 786432 diff --git a/triton_models/weights/layers.8.attention.wo.0.qweight b/triton_models/weights/layers.8.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..79dcacb0bc5ed37629a105bb0afdc20c383e1736 --- /dev/null +++ b/triton_models/weights/layers.8.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:917ac6b4102a88cb5fe47a13834f30fb45329e8234e6bf4a6d5def09acfca138 +size 8388608 diff --git a/triton_models/weights/layers.8.attention.wo.0.scales_zeros b/triton_models/weights/layers.8.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3f21f5d05d73002cb0251350fce183ec3b6f82cc --- /dev/null +++ b/triton_models/weights/layers.8.attention.wo.0.scales_zeros 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:075ca25071e36779993618787bcad51f47a6210b5c7efb13836b9f0c39113c7b +size 524288 diff --git a/triton_models/weights/layers.8.attention_norm.weight b/triton_models/weights/layers.8.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..6441edc914d86ab07b46c530e63df5e212499fbf --- /dev/null +++ b/triton_models/weights/layers.8.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7645c5cc08248a97031708e37a8869793e72e86be7d529ee2d38214aa125f326 +size 8192 diff --git a/triton_models/weights/layers.8.feed_forward.w13.0.qweight b/triton_models/weights/layers.8.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..6b623d7f4ebef4670369d48905c1f66aa9b3fd94 --- /dev/null +++ b/triton_models/weights/layers.8.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0a76bb17ba96c365a1bf660f901c21c3fc1d15165b0532e97c7ad86158513f0 +size 58720256 diff --git a/triton_models/weights/layers.8.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.8.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..f7b56f5fefdb81227823903289604a2f9e33cbf6 --- /dev/null +++ b/triton_models/weights/layers.8.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f6cc9bf35da7c08e89248a2d1151ca84f97e0d44fda2f474fbe090fa2b71bc6 +size 3670016 diff --git a/triton_models/weights/layers.8.feed_forward.w2.0.qweight b/triton_models/weights/layers.8.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..c837700cdf510ee1df94f861174695bb0e1ccfc8 --- /dev/null +++ b/triton_models/weights/layers.8.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d6a461146ce6fca245beab647f837c7718f50c1ae6d48f852becd4b88ecd68 +size 29360128 diff --git a/triton_models/weights/layers.8.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.8.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..63ba13362b7c68d37224b01f241452a27cf8717a --- /dev/null +++ b/triton_models/weights/layers.8.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22f763f7c06275a5821c55ab0428986c7982da93d02ec561c4c1cf0bc83cb82a +size 1835008 diff --git a/triton_models/weights/layers.8.ffn_norm.weight b/triton_models/weights/layers.8.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..c4ec482ee099d1dd8d7b2633b38f9546642f8c04 --- /dev/null +++ b/triton_models/weights/layers.8.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f607d08fdcc7d4a7048194e994afa25c34242bddec4d56534a779484534dec +size 8192 diff --git a/triton_models/weights/layers.8.past_kv_scale.0.weight b/triton_models/weights/layers.8.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..dae30d205782945d230c044159736e88b8c261e0 --- /dev/null +++ b/triton_models/weights/layers.8.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55e7e6e9663622f872cb332c414eac32a102e97ffdf3f5a2b6afa6f8371e1a5f +size 16 diff --git a/triton_models/weights/layers.9.attention.w_qkv.0.qweight b/triton_models/weights/layers.9.attention.w_qkv.0.qweight new file mode 100644 index 
0000000000000000000000000000000000000000..502cfce88cfb73bd839f1fb667fba672259c4294 --- /dev/null +++ b/triton_models/weights/layers.9.attention.w_qkv.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad1c9bfda707333f5860de8512ec7db789721d5f17e96ec0c1f79f98533c42c +size 12582912 diff --git a/triton_models/weights/layers.9.attention.w_qkv.0.scales_zeros b/triton_models/weights/layers.9.attention.w_qkv.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..47605d66d4acddffb2885150c9d68d184f94a9c6 --- /dev/null +++ b/triton_models/weights/layers.9.attention.w_qkv.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b5179dc3fba3abadb58abf409bfef33b382dc7373a002c3c43da9785c86f614 +size 786432 diff --git a/triton_models/weights/layers.9.attention.wo.0.qweight b/triton_models/weights/layers.9.attention.wo.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..0c3613bd080dd0fe0abbe07c8a567bf85e48e33d --- /dev/null +++ b/triton_models/weights/layers.9.attention.wo.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:535eb0ed2a008590448c38ddcfcf990219dd0c1752e28d11fe3310cdf4039d57 +size 8388608 diff --git a/triton_models/weights/layers.9.attention.wo.0.scales_zeros b/triton_models/weights/layers.9.attention.wo.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..bc68d0462949d41fb22495d6fc4d8a2c6c21b6a6 --- /dev/null +++ b/triton_models/weights/layers.9.attention.wo.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee2d02d009e36ca78d86a48ea408c2017c21903b64400397a77f437f495d936c +size 524288 diff --git a/triton_models/weights/layers.9.attention_norm.weight b/triton_models/weights/layers.9.attention_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..8493ee9741dd897107d9fe3cea7c2d01fdd4dee5 --- /dev/null +++ b/triton_models/weights/layers.9.attention_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcacb811b4cf62144e1ac2d3eadbafab30083e3420c46a92df1ab21840b29fe5 +size 8192 diff --git a/triton_models/weights/layers.9.feed_forward.w13.0.qweight b/triton_models/weights/layers.9.feed_forward.w13.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..bcb62122ef3b2bf1d13099eb7e64cd4f6266f02c --- /dev/null +++ b/triton_models/weights/layers.9.feed_forward.w13.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aca67258bcd3c39f17fb15a14b72cfe8ca597aeb30e0f4f298efa5eb093abcf3 +size 58720256 diff --git a/triton_models/weights/layers.9.feed_forward.w13.0.scales_zeros b/triton_models/weights/layers.9.feed_forward.w13.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..3e0e6af0add56eeb2e1cf7bc0142e52be7a5ae29 --- /dev/null +++ b/triton_models/weights/layers.9.feed_forward.w13.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4b60ceaccc0af57c36de7cd69acf05d8c307f2d6d27a7e765e0f132ae95d17a +size 3670016 diff --git a/triton_models/weights/layers.9.feed_forward.w2.0.qweight b/triton_models/weights/layers.9.feed_forward.w2.0.qweight new file mode 100644 index 0000000000000000000000000000000000000000..399c1fc8d6cc43a27e802ca067c88fc4f9a3bc73 --- /dev/null +++ b/triton_models/weights/layers.9.feed_forward.w2.0.qweight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0e07e422f44ddda11dc7404b257cacd675b2b7f44491941e6754155df3a31d2e +size 29360128 diff --git a/triton_models/weights/layers.9.feed_forward.w2.0.scales_zeros b/triton_models/weights/layers.9.feed_forward.w2.0.scales_zeros new file mode 100644 index 0000000000000000000000000000000000000000..9509fd872d04e11bf53f07f99129e785b2056187 --- /dev/null +++ b/triton_models/weights/layers.9.feed_forward.w2.0.scales_zeros @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc346804097116087236c77f2e2c018922efba4f2e32d8a71ddf8b026c9d34d +size 1835008 diff --git a/triton_models/weights/layers.9.ffn_norm.weight b/triton_models/weights/layers.9.ffn_norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..185031880012c613c2cf8937d4aa159e1c93a4c0 --- /dev/null +++ b/triton_models/weights/layers.9.ffn_norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98119ccde8c54eacba56311e43a7c74e62e30e0d7302b011202dea6a6348ba66 +size 8192 diff --git a/triton_models/weights/layers.9.past_kv_scale.0.weight b/triton_models/weights/layers.9.past_kv_scale.0.weight new file mode 100644 index 0000000000000000000000000000000000000000..0ec9f90c9c5be11398b7b1bdba1df5b0975ab0d4 --- /dev/null +++ b/triton_models/weights/layers.9.past_kv_scale.0.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62cf0a7960b56038dd17b81e2a1c38a016c2b78bd7272299dee18ae8e53e5c92 +size 16 diff --git a/triton_models/weights/norm.weight b/triton_models/weights/norm.weight new file mode 100644 index 0000000000000000000000000000000000000000..906361178f72cf7bd1f01447accc35bf0e1b633a --- /dev/null +++ b/triton_models/weights/norm.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efcd3fb0c1c5225c17e0eeb5b46068bb7311f716a4908d5a39d79b37985b58e7 +size 8192 diff --git a/triton_models/weights/output.weight b/triton_models/weights/output.weight new file mode 100644 index 0000000000000000000000000000000000000000..04e8f86f0b46051b3db62d5eefcbebda87641472 --- /dev/null +++ b/triton_models/weights/output.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0ed41b4df8f91647fc8bdd2aa61f55c39e09b6e063c8bd509b591797293919 +size 758120448 diff --git a/triton_models/weights/tok_embeddings.weight b/triton_models/weights/tok_embeddings.weight new file mode 100644 index 0000000000000000000000000000000000000000..0b3edbd16fbb690f7c781043ea905fd4380e5f04 --- /dev/null +++ b/triton_models/weights/tok_embeddings.weight @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8986115ad7e59813a41c88c0d601235fa36138d6c15e5657a050cf4ec40fb037 +size 758120448