diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..50d927dc68b4eaa40bd4812b7417b3f2bd61f599 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm3-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 8192, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/configuration_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/configuration_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..35600185f5a26951081de0f3a41a913eaf06af99 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/configuration_chatglm.py @@ -0,0 +1,61 @@ +from transformers import PretrainedConfig + + +class ChatGLMConfig(PretrainedConfig): + model_type = "chatglm" + def __init__( + self, + num_layers=28, + padded_vocab_size=65024, + hidden_size=4096, + ffn_hidden_size=13696, + kv_channels=128, + num_attention_heads=32, + seq_length=2048, + hidden_dropout=0.0, + classifier_dropout=None, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + rmsnorm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, + add_bias_linear=False, + add_qkv_bias=False, + bias_dropout_fusion=True, + multi_query_attention=False, + multi_query_group_num=1, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=True, + fp32_residual_connection=False, + quantization_bit=0, + pre_seq_len=None, + prefix_projection=False, + **kwargs + ): + self.num_layers = num_layers + self.vocab_size = padded_vocab_size + self.padded_vocab_size = padded_vocab_size + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.kv_channels = kv_channels + self.num_attention_heads = num_attention_heads + self.seq_length = seq_length + self.hidden_dropout = hidden_dropout + self.classifier_dropout = classifier_dropout + self.attention_dropout = attention_dropout + self.layernorm_epsilon = layernorm_epsilon + self.rmsnorm = rmsnorm + self.apply_residual_connection_post_layernorm = 
apply_residual_connection_post_layernorm + self.post_layer_norm = post_layer_norm + self.add_bias_linear = add_bias_linear + self.add_qkv_bias = add_qkv_bias + self.bias_dropout_fusion = bias_dropout_fusion + self.multi_query_attention = multi_query_attention + self.multi_query_group_num = multi_query_group_num + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.fp32_residual_connection = fp32_residual_connection + self.quantization_bit = quantization_bit + self.pre_seq_len = pre_seq_len + self.prefix_projection = prefix_projection + super().__init__(**kwargs) \ No newline at end of file diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/generation_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31d22ad9418a1daba6b2bbf472ac3762cd5ce643 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.34.0" +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/modeling_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/modeling_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b5027587016090a377f25289284b6e4f829cb4 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/modeling_chatglm.py @@ -0,0 +1,1293 @@ +""" PyTorch ChatGLM model. """ + +import math +import copy +import warnings +import re +import sys + +import torch +import torch.utils.checkpoint +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss +from torch.nn.utils import skip_init +from typing import Optional, Tuple, Union, List, Callable, Dict, Any +from copy import deepcopy + +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.generation.logits_process import LogitsProcessor +from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput + +from .configuration_chatglm import ChatGLMConfig + +# flags required to enable jit fusion kernels + +if sys.platform != 'darwin': + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" +_CONFIG_FOR_DOC = "ChatGLMConfig" + +CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "THUDM/chatglm3-6b", + # See all ChatGLM models at https://huggingface.co/models?filter=chatglm +] + + +def default_init(cls, *args, **kwargs): + return cls(*args, **kwargs) + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class PrefixEncoder(torch.nn.Module): + """ + The torch.nn model to encode the prefix + Input shape: (batch-size, prefix-length) + Output shape: (batch-size, prefix-length, 2*layers*hidden) + """ + + def __init__(self, config: 
ChatGLMConfig): + super().__init__() + self.prefix_projection = config.prefix_projection + if self.prefix_projection: + # Use a two-layer MLP to encode the prefix + kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 + self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) + self.trans = torch.nn.Sequential( + torch.nn.Linear(kv_size, config.hidden_size), + torch.nn.Tanh(), + torch.nn.Linear(config.hidden_size, kv_size) + ) + else: + self.embedding = torch.nn.Embedding(config.pre_seq_len, + config.num_layers * config.kv_channels * config.multi_query_group_num * 2) + + def forward(self, prefix: torch.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.trans(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, original_impl=False, device=None, dtype=None): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + + def forward_impl( + self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 + ): + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
+ """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).float() + + cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if dtype in (torch.float16, torch.bfloat16, torch.int8): + cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl( + max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [sq, b, np, hn] + sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:sq] + xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) + rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +class RMSNorm(torch.nn.Module): + def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + return (self.weight * hidden_states).to(input_dtype) + + +class CoreAttention(torch.nn.Module): + def __init__(self, config: ChatGLMConfig, layer_number): + super(CoreAttention, self).__init__() + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. 
+ self.hidden_size_per_partition = projection_size + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + self.coeff = coeff + + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + pytorch_major_version = int(torch.__version__.split('.')[0]) + if pytorch_major_version >= 2: + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + is_causal=True) + else: + if attention_mask is not None: + attention_mask = ~attention_mask + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + attention_mask) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + else: + # Raw attention scores + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = torch.empty( + output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, + device=query_layer.device + ) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + if self.attention_softmax_in_fp32: + attention_scores = attention_scores.float() + if self.coeff is not None: + attention_scores = attention_scores * self.coeff + if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: + attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], + device=attention_scores.device, dtype=torch.bool) + attention_mask.tril_() + attention_mask = ~attention_mask + if attention_mask is not None: + attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = attention_probs.type_as(value_layer) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. 
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class SelfAttention(torch.nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + + self.projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + self.multi_query_attention = config.multi_query_attention + self.qkv_hidden_size = 3 * self.projection_size + if self.multi_query_attention: + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.qkv_hidden_size = ( + self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num + ) + self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, + bias=config.add_bias_linear or config.add_qkv_bias, + device=device, **_config_to_kwargs(config) + ) + + self.core_attention = CoreAttention(config, self.layer_number) + + # Output. + self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, + device=device, **_config_to_kwargs(config) + ) + + def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): + if self.multi_query_attention: + num_attention_heads = self.num_multi_query_groups_per_partition + else: + num_attention_heads = self.num_attention_heads_per_partition + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=dtype, + device=device, + ) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True + ): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. 
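+ # (No buffer is actually pre-allocated in this forward pass; the cache
+ #  grows by concatenating new key/value layers onto kv_cache further below.)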
+ # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = torch.cat((cache_k, key_layer), dim=0) + value_layer = torch.cat((cache_v, value_layer), dim=0) + if use_cache: + kv_cache = (key_layer, value_layer) + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = key_layer.unsqueeze(-2) + key_layer = key_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.unsqueeze(-2) + value_layer = value_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def _config_to_kwargs(args): + common_kwargs = { + "dtype": args.torch_dtype, + } + return common_kwargs + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMConfig, device=None): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear( + config.hidden_size, + config.ffn_hidden_size * 2, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.activation_func = swiglu + + # Project back to h. + self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(torch.nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # Self attention. + self.self_attention = SelfAttention(config, layer_number, device=device) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # MLP + self.mlp = MLP(config, device=device) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, + attention_mask, + rotary_pos_emb, + kv_cache=kv_cache, + use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + + return output, kv_cache + + +class GLMTransformer(torch.nn.Module): + """Transformer class.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(GLMTransformer, self).__init__() + + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. 
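+ # Layer numbers are 1-based (build_layer(i + 1) below), so CoreAttention's
+ # query-key layer scaling never scales scores by a zero layer_number.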
+ def build_layer(layer_number): + return GLMBlock(config, layer_number, device=device) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + self.gradient_checkpointing = False + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_layers)] + presents = () if use_cache else None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + for index in range(self.num_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + if self.gradient_checkpointing and self.training: + layer_ret = torch.utils.checkpoint.checkpoint( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches[index], + use_cache + ) + else: + layer_ret = layer( + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache + ) + hidden_states, kv_cache = layer_ret + if use_cache: + presents = presents + (kv_cache,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. 
+ """ + + is_parallelizable = False + supports_gradient_checkpointing = True + config_class = ChatGLMConfig + base_model_prefix = "transformer" + _no_split_modules = ["GLMBlock"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + return + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) + full_attention_mask.tril_() + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, + device=input_ids.device), full_attention_mask), dim=-1) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) + if not past_length and padding_mask is not None: + full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + def get_position_ids(self, input_ids, device): + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + return position_ids + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GLMTransformer): + module.gradient_checkpointing = value + + +class Embedding(torch.nn.Module): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + # Word embeddings (parallel). + self.word_embeddings = nn.Embedding( + config.padded_vocab_size, + self.hidden_size, + dtype=config.torch_dtype, + device=device + ) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + # If the input flag for fp32 residual connection is set, convert for float. 
+ if self.fp32_residual_connection: + embeddings = embeddings.float() + return embeddings + + +class ChatGLMModel(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): + super().__init__(config) + if empty_init: + init_method = skip_init + else: + init_method = default_init + init_kwargs = {} + if device is not None: + init_kwargs["device"] = device + self.embedding = init_method(Embedding, config, **init_kwargs) + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + + # Rotary positional embeddings + self.seq_length = config.seq_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, + dtype=config.torch_dtype) + self.encoder = init_method(GLMTransformer, config, **init_kwargs) + self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, + dtype=config.torch_dtype, **init_kwargs) + self.pre_seq_len = config.pre_seq_len + self.prefix_projection = config.prefix_projection + if self.pre_seq_len is not None: + for param in self.parameters(): + param.requires_grad = False + self.prefix_tokens = torch.arange(self.pre_seq_len).long() + self.prefix_encoder = PrefixEncoder(config) + self.dropout = torch.nn.Dropout(0.1) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def get_prompt(self, batch_size, device, dtype=torch.half): + prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) + past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) + past_key_values = past_key_values.view( + batch_size, + self.pre_seq_len, + self.num_layers * 2, + self.multi_query_group_num, + self.kv_channels + ) + # seq_len, b, nh, hidden_size + past_key_values = self.dropout(past_key_values) + past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) + return past_key_values + + def forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if self.pre_seq_len is not None: + if past_key_values is None: + past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, + dtype=inputs_embeds.dtype) + if attention_mask is not None: + attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), + attention_mask], dim=-1) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary 
positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() + + # Run encoder. + hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def quantize(self, weight_bit_width: int): + from .quantization import quantize + quantize(self.encoder, weight_bit_width) + return self + + +class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.max_sequence_length = config.max_length + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + self.config = config + self.quantized = False + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat( + [position_ids, new_position_id], dim=-1 + ) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids, device=input_ids.device) + if not is_first_forward: + if past_key_values is not None: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": use_cache + } + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: 
Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + if return_last_logit: + hidden_states = hidden_states[-1:] + lm_logits = self.transformer.output_layer(hidden_states) + lm_logits = lm_logits.transpose(0, 1).contiguous() + + loss = None + if labels is not None: + lm_logits = lm_logits.to(torch.float32) + + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + lm_logits = lm_logits.to(hidden_states.dtype) + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache( + past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. 
+ """ + return tuple( + ( + layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), + layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), + ) + for layer_past in past + ) + + def process_response(self, output, history): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content, history + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + inputs = tokenizer.build_chat_input(query, history=history, role=role) + inputs = inputs.to(self.device) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + history.append({"role": role, "content": query}) + response, history = self.process_response(response, history) + return response, history + + @torch.inference_mode() + def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, + logits_processor=None, return_past_key_values=False, **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + if past_key_values is None: + inputs = tokenizer.build_chat_input(query, history=history, role=role) + else: + inputs = tokenizer.build_chat_input(query, role=role) + inputs = inputs.to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[0] + if self.transformer.pre_seq_len is not None: + past_length -= self.transformer.pre_seq_len + inputs.position_ids += past_length + attention_mask = inputs.attention_mask + attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) + inputs['attention_mask'] = attention_mask + history.append({"role": role, "content": query}) + for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **gen_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + if response and response[-1] != "�": + response, new_history = self.process_response(response, history) + if return_past_key_values: + yield response, new_history, past_key_values + else: + yield response, new_history + + @torch.inference_mode() + def stream_generate( + self, + input_ids, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values=False, + **kwargs, + ): + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + + if generation_config is None: + generation_config = self.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + model_kwargs["use_cache"] = generation_config.use_cache + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if has_default_max_length and generation_config.max_new_tokens is None: + warnings.warn( + f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " + "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + " recommend using `max_new_tokens` to control the maximum length of the generation.", + UserWarning, + ) + elif generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + if not has_default_max_length: + logger.warn( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", + UserWarning, + ) + + if input_ids_seq_length >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + + # 2. 
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + logits_warper = self._get_logits_warper(generation_config) + + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + while True: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break + + def quantize(self, bits: int, empty_init=False, device=None, **kwargs): + if bits == 0: + return + + from .quantization import quantize + + if self.quantized: + logger.info("Already quantized.") + return self + + self.quantized = True + + self.config.quantization_bit = bits + + self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, + **kwargs) + return self + + +class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.num_labels = config.num_labels + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + + self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) + if config.classifier_dropout is not None: + self.dropout = nn.Dropout(config.classifier_dropout) + else: + self.dropout = None + self.config = config + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: 
Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + pooled_hidden_states = hidden_states[-1] + if self.dropout is not None: + pooled_hidden_states = self.dropout(pooled_hidden_states) + logits = self.classifier_head(pooled_hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze().float(), labels.squeeze()) + else: + loss = loss_fct(logits.float(), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/optimizer.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..537a0c51543a9a8284ca138a0b0bac68293ea7d5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1536e16847acaf3fcab9fe3cbd51c33a222333a9b1fa9bc163ccff4761e8e877 +size 14682210 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/pytorch_model.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7bc1511008eca1551f96ae55b62e99413c42561a --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc22e1b75d012bb3ba2c92368d8b6364584c2af351af3dba685befac8e853db +size 7341306 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/quantization.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..cb95bfe82b203ff6a2aa962326d2c7a438d6a52f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/quantization.py @@ -0,0 
+1,188 @@ +from torch.nn import Linear +from torch.nn.parameter import Parameter + +import bz2 +import torch +import base64 +import ctypes +from transformers.utils import logging + +from typing import List +from functools import partial + +logger = logging.get_logger(__name__) + +try: + from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up + + class Kernel: + def __init__(self, code: bytes, function_names: List[str]): + self.code = code + self._function_names = function_names + self._cmodule = LazyKernelCModule(self.code) + + for name in self._function_names: + setattr(self, name, KernelFunction(self._cmodule, name)) + + quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl
4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzf
ZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ" + + kernels = Kernel( + bz2.decompress(base64.b64decode(quantization_code)), + [ + "int4WeightCompression", + "int4WeightExtractionFloat", + "int4WeightExtractionHalf", + "int8WeightExtractionFloat", + "int8WeightExtractionHalf", + ], + ) +except Exception as exception: + kernels = None + logger.warning("Failed to load cpm_kernels:" + str(exception)) + + +class W8A16Linear(torch.autograd.Function): + @staticmethod + def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): + ctx.inp_shape = inp.size() + ctx.weight_bit_width = weight_bit_width + out_features = quant_w.size(0) + inp = inp.contiguous().view(-1, inp.size(-1)) + weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) + ctx.weight_shape = weight.size() + output = inp.mm(weight.t()) + ctx.save_for_backward(inp, quant_w, scale_w) + return output.view(*(ctx.inp_shape[:-1] + (out_features,))) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + inp, quant_w, scale_w = ctx.saved_tensors + weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) + grad_output = grad_output.contiguous().view(-1, weight.size(0)) + grad_input = grad_output.mm(weight) + grad_weight = grad_output.t().mm(inp) + return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None + + +def compress_int4_weight(weight: torch.Tensor): # (n, m) + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + assert m % 2 == 0 + m = m // 2 + out = torch.empty(n, m, dtype=torch.int8, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + kernels.int4WeightCompression( + gridDim, + blockDim, + 0, + stream, + [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], + ) + return out + + +def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): + assert scale_list.dtype in [torch.half, torch.bfloat16] + assert weight.dtype in [torch.int8] + if source_bit_width == 8: + return weight.to(scale_list.dtype) * scale_list[:, None] + elif source_bit_width == 4: + func = ( + kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16 + ) + else: + assert False, "Unsupported bit-width" + + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = 
(min(round_up(m, 32), 1024), 1, 1) + + func( + gridDim, + blockDim, + 0, + stream, + [ + ctypes.c_void_p(weight.data_ptr()), + ctypes.c_void_p(scale_list.data_ptr()), + ctypes.c_void_p(out.data_ptr()), + ctypes.c_int32(n), + ctypes.c_int32(m), + ], + ) + return out + + +class QuantizedLinear(torch.nn.Module): + def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args, + **kwargs): + super().__init__() + self.weight_bit_width = weight_bit_width + + shape = weight.shape + + if weight is None or empty_init: + self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device) + self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device) + else: + self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1) + self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) + if weight_bit_width == 4: + self.weight = compress_int4_weight(self.weight) + + self.weight = Parameter(self.weight.to(device), requires_grad=False) + self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False) + self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None + + def forward(self, input): + output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width) + if self.bias is not None: + output = output + self.bias + return output + + +def quantize(model, weight_bit_width, empty_init=False, device=None): + """Replace fp16 linear with quantized linear""" + for layer in model.layers: + layer.self_attention.query_key_value = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()), + bias=layer.self_attention.query_key_value.bias, + dtype=layer.self_attention.query_key_value.weight.dtype, + device=layer.self_attention.query_key_value.weight.device if device is None else device, + empty_init=empty_init + ) + layer.self_attention.dense = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()), + bias=layer.self_attention.dense.bias, + dtype=layer.self_attention.dense.weight.dtype, + device=layer.self_attention.dense.weight.device if device is None else device, + empty_init=empty_init + ) + layer.mlp.dense_h_to_4h = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()), + bias=layer.mlp.dense_h_to_4h.bias, + dtype=layer.mlp.dense_h_to_4h.weight.dtype, + device=layer.mlp.dense_h_to_4h.weight.device if device is None else device, + empty_init=empty_init + ) + layer.mlp.dense_4h_to_h = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()), + bias=layer.mlp.dense_4h_to_h.bias, + dtype=layer.mlp.dense_4h_to_h.weight.dtype, + device=layer.mlp.dense_4h_to_h.weight.device if device is None else device, + empty_init=empty_init + ) + + return model diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/rng_state.pth b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..dea498e970c6dc2b029df3494a4952092a985a0f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:770fc5b3f081eaa40d1dc87b17544797be34efa6309211006ba5f9a46a02dfc0 +size 
14244 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/scheduler.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..68047aa06cd43cf8389655f1a7aebf9a795ee470 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfeb060c25a48c544e101802c9544e2b821808664074249db3b359d15595a31d +size 1064 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/special_tokens_map.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenization_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += 
self.sp_model.decode(buffer)
+        return text
+
+    def decode_tokens(self, tokens: List[str]) -> str:
+        text = self.sp_model.DecodePieces(tokens)
+        return text
+
+    def convert_token_to_id(self, token):
+        """ Converts a token (str) to an id using the vocab. """
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        return self.sp_model.PieceToId(token)
+
+    def convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        if index in self.index_special_tokens:
+            return self.index_special_tokens[index]
+        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
+            return ""
+        return self.sp_model.IdToPiece(index)
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "tokenizer.model"}
+
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
+                 **kwargs):
+        self.name = "GLMTokenizer"
+
+        self.vocab_file = vocab_file
+        self.tokenizer = SPTokenizer(vocab_file)
+        self.special_tokens = {
+            "<bos>": self.tokenizer.bos_id,
+            "<eos>": self.tokenizer.eos_id,
+            "<pad>": self.tokenizer.pad_id
+        }
+        self.encode_special_tokens = encode_special_tokens
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                         encode_special_tokens=encode_special_tokens,
+                         **kwargs)
+
+    def get_command(self, token):
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
+        return self.tokenizer.special_tokens[token]
+
+    @property
+    def unk_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token_id(self):
+        return self.get_command("<pad>")
+
+    @property
+    def eos_token(self) -> str:
+        return "</s>"
+
+    @property
+    def eos_token_id(self):
+        return self.get_command("<eos>")
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_words
+
+    def get_vocab(self):
+        """ Returns vocab as a dict """
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text, **kwargs):
+        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) to an id using the vocab. """
+        return self.tokenizer.convert_token_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        return self.tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return self.tokenizer.decode_tokens(tokens)
+
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*):
+                An optional prefix to add to the names of the saved files.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
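+
+        Example (editor's illustration, assuming a local tokenizer.model file
+        and an existing directory ./export):
+
+            >>> tok = ChatGLMTokenizer("tokenizer.model")
+            >>> tok.save_vocabulary("./export")
+            ('./export/tokenizer.model',)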
+        """
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, self.vocab_files_names["vocab_file"]
+            )
+        else:
+            vocab_file = save_directory
+
+        with open(self.vocab_file, 'rb') as fin:
+            proto_str = fin.read()
+
+        with open(vocab_file, "wb") as writer:
+            writer.write(proto_str)
+
+        return (vocab_file,)
+
+    def get_prefix_tokens(self):
+        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
+        return prefix_tokens
+
+    def build_single_message(self, role, metadata, message):
+        assert role in ["system", "user", "assistant", "observation"], role
+        role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
+        message_tokens = self.tokenizer.encode(message)
+        tokens = role_tokens + message_tokens
+        return tokens
+
+    def build_chat_input(self, query, history=None, role="user"):
+        if history is None:
+            history = []
+        input_ids = []
+        for item in history:
+            content = item["content"]
+            if item["role"] == "system" and "tools" in item:
+                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
+            input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
+        input_ids.extend(self.build_single_message(role, "", query))
+        input_ids.extend([self.get_command("<|assistant|>")])
+        return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)
+
+    def build_inputs_with_special_tokens(
+            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A ChatGLM sequence has the following format:
+
+        - single sequence: `[gMASK] sop X`
+        - pair of sequences: `[gMASK] sop A B <eos>`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        prefix_tokens = self.get_prefix_tokens()
+        token_ids_0 = prefix_tokens + token_ids_0
+        if token_ids_1 is not None:
+            token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
+        return token_ids_0
+
+    def _pad(
+            self,
+            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+            max_length: Optional[int] = None,
+            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+            pad_to_multiple_of: Optional[int] = None,
+            return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer.model b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5faafd38f2e2934804feb0e7d71ebf08b0839bf5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer_config.json @@ -0,0 +1,18 @@ +{ + "added_tokens_decoder": {}, + "additional_special_tokens": [], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/trainer_state.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fa720c7b50086a8a29af2806f21a94be85cc04a0 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/trainer_state.json @@ -0,0 +1,619 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.411764705882353, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "learning_rate": 0.009985714285714285, + "loss": 2.6971, + "step": 1 + }, + { + "epoch": 
0.19, + "learning_rate": 0.009971428571428572, + "loss": 2.3927, + "step": 2 + }, + { + "epoch": 0.28, + "learning_rate": 0.009957142857142857, + "loss": 2.2539, + "step": 3 + }, + { + "epoch": 0.38, + "learning_rate": 0.009942857142857144, + "loss": 2.1408, + "step": 4 + }, + { + "epoch": 0.47, + "learning_rate": 0.009928571428571429, + "loss": 2.2672, + "step": 5 + }, + { + "epoch": 0.56, + "learning_rate": 0.009914285714285714, + "loss": 1.6433, + "step": 6 + }, + { + "epoch": 0.66, + "learning_rate": 0.0099, + "loss": 2.1405, + "step": 7 + }, + { + "epoch": 0.75, + "learning_rate": 0.009885714285714286, + "loss": 2.1464, + "step": 8 + }, + { + "epoch": 0.85, + "learning_rate": 0.009871428571428571, + "loss": 1.8498, + "step": 9 + }, + { + "epoch": 0.94, + "learning_rate": 0.009857142857142858, + "loss": 1.6896, + "step": 10 + }, + { + "epoch": 1.04, + "learning_rate": 0.009842857142857143, + "loss": 2.1932, + "step": 11 + }, + { + "epoch": 1.13, + "learning_rate": 0.00982857142857143, + "loss": 1.8236, + "step": 12 + }, + { + "epoch": 1.22, + "learning_rate": 0.009814285714285715, + "loss": 1.735, + "step": 13 + }, + { + "epoch": 1.32, + "learning_rate": 0.0098, + "loss": 1.7488, + "step": 14 + }, + { + "epoch": 1.41, + "learning_rate": 0.009785714285714285, + "loss": 1.8336, + "step": 15 + }, + { + "epoch": 1.51, + "learning_rate": 0.009771428571428572, + "loss": 1.9438, + "step": 16 + }, + { + "epoch": 1.6, + "learning_rate": 0.009757142857142858, + "loss": 1.7178, + "step": 17 + }, + { + "epoch": 1.69, + "learning_rate": 0.009742857142857143, + "loss": 1.5714, + "step": 18 + }, + { + "epoch": 1.79, + "learning_rate": 0.009728571428571428, + "loss": 1.537, + "step": 19 + }, + { + "epoch": 1.88, + "learning_rate": 0.009714285714285715, + "loss": 1.6764, + "step": 20 + }, + { + "epoch": 1.98, + "learning_rate": 0.0097, + "loss": 1.8919, + "step": 21 + }, + { + "epoch": 2.07, + "learning_rate": 0.009685714285714285, + "loss": 1.346, + "step": 22 + }, + { + "epoch": 2.16, + "learning_rate": 0.009671428571428572, + "loss": 1.5036, + "step": 23 + }, + { + "epoch": 2.26, + "learning_rate": 0.009657142857142857, + "loss": 1.6788, + "step": 24 + }, + { + "epoch": 2.35, + "learning_rate": 0.009642857142857144, + "loss": 1.6667, + "step": 25 + }, + { + "epoch": 2.45, + "learning_rate": 0.009628571428571429, + "loss": 1.7153, + "step": 26 + }, + { + "epoch": 2.54, + "learning_rate": 0.009614285714285714, + "loss": 1.601, + "step": 27 + }, + { + "epoch": 2.64, + "learning_rate": 0.0096, + "loss": 1.3002, + "step": 28 + }, + { + "epoch": 2.73, + "learning_rate": 0.009585714285714286, + "loss": 1.3294, + "step": 29 + }, + { + "epoch": 2.82, + "learning_rate": 0.009571428571428573, + "loss": 1.7477, + "step": 30 + }, + { + "epoch": 2.92, + "learning_rate": 0.009557142857142858, + "loss": 1.7961, + "step": 31 + }, + { + "epoch": 3.01, + "learning_rate": 0.009542857142857143, + "loss": 1.4954, + "step": 32 + }, + { + "epoch": 3.11, + "learning_rate": 0.009528571428571428, + "loss": 1.6452, + "step": 33 + }, + { + "epoch": 3.2, + "learning_rate": 0.009514285714285715, + "loss": 1.3528, + "step": 34 + }, + { + "epoch": 3.29, + "learning_rate": 0.0095, + "loss": 1.4811, + "step": 35 + }, + { + "epoch": 3.39, + "learning_rate": 0.009485714285714287, + "loss": 1.4738, + "step": 36 + }, + { + "epoch": 3.48, + "learning_rate": 0.009471428571428572, + "loss": 1.174, + "step": 37 + }, + { + "epoch": 3.58, + "learning_rate": 0.009457142857142857, + "loss": 1.2346, + "step": 38 + }, + { + "epoch": 3.67, + 
"learning_rate": 0.009442857142857143, + "loss": 1.5327, + "step": 39 + }, + { + "epoch": 3.76, + "learning_rate": 0.009428571428571429, + "loss": 1.5249, + "step": 40 + }, + { + "epoch": 3.86, + "learning_rate": 0.009414285714285714, + "loss": 1.5086, + "step": 41 + }, + { + "epoch": 3.95, + "learning_rate": 0.0094, + "loss": 1.8425, + "step": 42 + }, + { + "epoch": 4.05, + "learning_rate": 0.009385714285714287, + "loss": 1.1943, + "step": 43 + }, + { + "epoch": 4.14, + "learning_rate": 0.009371428571428572, + "loss": 1.6835, + "step": 44 + }, + { + "epoch": 4.24, + "learning_rate": 0.009357142857142857, + "loss": 1.75, + "step": 45 + }, + { + "epoch": 4.33, + "learning_rate": 0.009342857142857142, + "loss": 1.2561, + "step": 46 + }, + { + "epoch": 4.42, + "learning_rate": 0.009328571428571429, + "loss": 1.3784, + "step": 47 + }, + { + "epoch": 4.52, + "learning_rate": 0.009314285714285714, + "loss": 1.2538, + "step": 48 + }, + { + "epoch": 4.61, + "learning_rate": 0.009300000000000001, + "loss": 1.4429, + "step": 49 + }, + { + "epoch": 4.71, + "learning_rate": 0.009285714285714286, + "loss": 1.3687, + "step": 50 + }, + { + "epoch": 4.8, + "learning_rate": 0.009271428571428571, + "loss": 1.1511, + "step": 51 + }, + { + "epoch": 4.89, + "learning_rate": 0.009257142857142858, + "loss": 1.181, + "step": 52 + }, + { + "epoch": 4.99, + "learning_rate": 0.009242857142857143, + "loss": 1.1753, + "step": 53 + }, + { + "epoch": 5.08, + "learning_rate": 0.009228571428571428, + "loss": 1.1562, + "step": 54 + }, + { + "epoch": 5.18, + "learning_rate": 0.009214285714285715, + "loss": 1.2936, + "step": 55 + }, + { + "epoch": 5.27, + "learning_rate": 0.0092, + "loss": 1.3591, + "step": 56 + }, + { + "epoch": 5.36, + "learning_rate": 0.009185714285714287, + "loss": 1.1376, + "step": 57 + }, + { + "epoch": 5.46, + "learning_rate": 0.009171428571428572, + "loss": 1.372, + "step": 58 + }, + { + "epoch": 5.55, + "learning_rate": 0.009157142857142857, + "loss": 1.5141, + "step": 59 + }, + { + "epoch": 5.65, + "learning_rate": 0.009142857142857144, + "loss": 1.2087, + "step": 60 + }, + { + "epoch": 5.74, + "learning_rate": 0.009128571428571429, + "loss": 1.136, + "step": 61 + }, + { + "epoch": 5.84, + "learning_rate": 0.009114285714285715, + "loss": 1.2948, + "step": 62 + }, + { + "epoch": 5.93, + "learning_rate": 0.0091, + "loss": 1.0592, + "step": 63 + }, + { + "epoch": 6.02, + "learning_rate": 0.009085714285714286, + "loss": 1.2321, + "step": 64 + }, + { + "epoch": 6.12, + "learning_rate": 0.009071428571428572, + "loss": 1.0827, + "step": 65 + }, + { + "epoch": 6.21, + "learning_rate": 0.009057142857142857, + "loss": 1.1136, + "step": 66 + }, + { + "epoch": 6.31, + "learning_rate": 0.009042857142857142, + "loss": 1.475, + "step": 67 + }, + { + "epoch": 6.4, + "learning_rate": 0.009028571428571427, + "loss": 1.1316, + "step": 68 + }, + { + "epoch": 6.49, + "learning_rate": 0.009014285714285714, + "loss": 1.1688, + "step": 69 + }, + { + "epoch": 6.59, + "learning_rate": 0.009000000000000001, + "loss": 1.0882, + "step": 70 + }, + { + "epoch": 6.68, + "learning_rate": 0.008985714285714286, + "loss": 1.1085, + "step": 71 + }, + { + "epoch": 6.78, + "learning_rate": 0.008971428571428571, + "loss": 1.2029, + "step": 72 + }, + { + "epoch": 6.87, + "learning_rate": 0.008957142857142856, + "loss": 1.098, + "step": 73 + }, + { + "epoch": 6.96, + "learning_rate": 0.008942857142857143, + "loss": 1.219, + "step": 74 + }, + { + "epoch": 7.06, + "learning_rate": 0.00892857142857143, + "loss": 1.0092, + "step": 75 + }, + { 
+ "epoch": 7.15, + "learning_rate": 0.008914285714285715, + "loss": 1.0112, + "step": 76 + }, + { + "epoch": 7.25, + "learning_rate": 0.0089, + "loss": 1.1481, + "step": 77 + }, + { + "epoch": 7.34, + "learning_rate": 0.008885714285714287, + "loss": 0.9873, + "step": 78 + }, + { + "epoch": 7.44, + "learning_rate": 0.008871428571428572, + "loss": 1.0586, + "step": 79 + }, + { + "epoch": 7.53, + "learning_rate": 0.008857142857142857, + "loss": 1.1177, + "step": 80 + }, + { + "epoch": 7.62, + "learning_rate": 0.008842857142857142, + "loss": 0.7814, + "step": 81 + }, + { + "epoch": 7.72, + "learning_rate": 0.008828571428571429, + "loss": 1.2043, + "step": 82 + }, + { + "epoch": 7.81, + "learning_rate": 0.008814285714285715, + "loss": 1.0062, + "step": 83 + }, + { + "epoch": 7.91, + "learning_rate": 0.0088, + "loss": 1.0831, + "step": 84 + }, + { + "epoch": 8.0, + "learning_rate": 0.008785714285714286, + "loss": 0.9554, + "step": 85 + }, + { + "epoch": 8.09, + "learning_rate": 0.00877142857142857, + "loss": 1.1674, + "step": 86 + }, + { + "epoch": 8.19, + "learning_rate": 0.008757142857142857, + "loss": 0.8226, + "step": 87 + }, + { + "epoch": 8.28, + "learning_rate": 0.008742857142857144, + "loss": 0.9166, + "step": 88 + }, + { + "epoch": 8.38, + "learning_rate": 0.00872857142857143, + "loss": 0.734, + "step": 89 + }, + { + "epoch": 8.47, + "learning_rate": 0.008714285714285714, + "loss": 0.8641, + "step": 90 + }, + { + "epoch": 8.56, + "learning_rate": 0.0087, + "loss": 0.9517, + "step": 91 + }, + { + "epoch": 8.66, + "learning_rate": 0.008685714285714286, + "loss": 0.9995, + "step": 92 + }, + { + "epoch": 8.75, + "learning_rate": 0.008671428571428571, + "loss": 0.763, + "step": 93 + }, + { + "epoch": 8.85, + "learning_rate": 0.008657142857142858, + "loss": 1.0712, + "step": 94 + }, + { + "epoch": 8.94, + "learning_rate": 0.008642857142857143, + "loss": 1.1111, + "step": 95 + }, + { + "epoch": 9.04, + "learning_rate": 0.008628571428571428, + "loss": 0.9626, + "step": 96 + }, + { + "epoch": 9.13, + "learning_rate": 0.008614285714285715, + "loss": 0.6385, + "step": 97 + }, + { + "epoch": 9.22, + "learning_rate": 0.0086, + "loss": 0.8147, + "step": 98 + }, + { + "epoch": 9.32, + "learning_rate": 0.008585714285714285, + "loss": 0.8109, + "step": 99 + }, + { + "epoch": 9.41, + "learning_rate": 0.008571428571428572, + "loss": 1.0953, + "step": 100 + } + ], + "logging_steps": 1.0, + "max_steps": 700, + "num_train_epochs": 70, + "save_steps": 100, + "total_flos": 1.175174321799168e+17, + "trial_name": null, + "trial_params": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/training_args.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17f9bfbf1a7cdd9e0e808e0672d55ad9ad4efb5f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00669a32a6ddac0a3243bbc04d3f1f70ffc8f89f2626c1fdafa93ce68c311aa0 +size 4664 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..50d927dc68b4eaa40bd4812b7417b3f2bd61f599 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm3-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + 
"apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 8192, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/configuration_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/configuration_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..35600185f5a26951081de0f3a41a913eaf06af99 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/configuration_chatglm.py @@ -0,0 +1,61 @@ +from transformers import PretrainedConfig + + +class ChatGLMConfig(PretrainedConfig): + model_type = "chatglm" + def __init__( + self, + num_layers=28, + padded_vocab_size=65024, + hidden_size=4096, + ffn_hidden_size=13696, + kv_channels=128, + num_attention_heads=32, + seq_length=2048, + hidden_dropout=0.0, + classifier_dropout=None, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + rmsnorm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, + add_bias_linear=False, + add_qkv_bias=False, + bias_dropout_fusion=True, + multi_query_attention=False, + multi_query_group_num=1, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=True, + fp32_residual_connection=False, + quantization_bit=0, + pre_seq_len=None, + prefix_projection=False, + **kwargs + ): + self.num_layers = num_layers + self.vocab_size = padded_vocab_size + self.padded_vocab_size = padded_vocab_size + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.kv_channels = kv_channels + self.num_attention_heads = num_attention_heads + self.seq_length = seq_length + self.hidden_dropout = hidden_dropout + self.classifier_dropout = classifier_dropout + self.attention_dropout = attention_dropout + self.layernorm_epsilon = layernorm_epsilon + self.rmsnorm = rmsnorm + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.post_layer_norm = post_layer_norm + self.add_bias_linear = add_bias_linear + self.add_qkv_bias = add_qkv_bias + self.bias_dropout_fusion = bias_dropout_fusion + self.multi_query_attention = multi_query_attention + self.multi_query_group_num = multi_query_group_num + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.fp32_residual_connection = 
fp32_residual_connection + self.quantization_bit = quantization_bit + self.pre_seq_len = pre_seq_len + self.prefix_projection = prefix_projection + super().__init__(**kwargs) \ No newline at end of file diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/generation_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31d22ad9418a1daba6b2bbf472ac3762cd5ce643 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.34.0" +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/modeling_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/modeling_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b5027587016090a377f25289284b6e4f829cb4 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/modeling_chatglm.py @@ -0,0 +1,1293 @@ +""" PyTorch ChatGLM model. """ + +import math +import copy +import warnings +import re +import sys + +import torch +import torch.utils.checkpoint +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss +from torch.nn.utils import skip_init +from typing import Optional, Tuple, Union, List, Callable, Dict, Any +from copy import deepcopy + +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.generation.logits_process import LogitsProcessor +from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput + +from .configuration_chatglm import ChatGLMConfig + +# flags required to enable jit fusion kernels + +if sys.platform != 'darwin': + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" +_CONFIG_FOR_DOC = "ChatGLMConfig" + +CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "THUDM/chatglm3-6b", + # See all ChatGLM models at https://huggingface.co/models?filter=chatglm +] + + +def default_init(cls, *args, **kwargs): + return cls(*args, **kwargs) + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class PrefixEncoder(torch.nn.Module): + """ + The torch.nn model to encode the prefix + Input shape: (batch-size, prefix-length) + Output shape: (batch-size, prefix-length, 2*layers*hidden) + """ + + def __init__(self, config: ChatGLMConfig): + super().__init__() + self.prefix_projection = config.prefix_projection + if self.prefix_projection: + # Use a two-layer MLP to encode the prefix + kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 + self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) + self.trans = torch.nn.Sequential( + torch.nn.Linear(kv_size, config.hidden_size), + torch.nn.Tanh(), + torch.nn.Linear(config.hidden_size, kv_size) + ) + else: + 
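+            # Editor's note: without the projection, P-Tuning v2 learns the
+            # stacked past key/values directly as one embedding table of shape
+            # [pre_seq_len, num_layers * kv_channels * multi_query_group_num * 2],
+            # i.e. [128, 28 * 128 * 2 * 2] = [128, 14336] for this checkpoint.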
self.embedding = torch.nn.Embedding(config.pre_seq_len, + config.num_layers * config.kv_channels * config.multi_query_group_num * 2) + + def forward(self, prefix: torch.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.trans(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, original_impl=False, device=None, dtype=None): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + + def forward_impl( + self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 + ): + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
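+
+        Editor's summary of the code below: the returned cache has shape
+        [seq_len, n_elem // 2, 2] and stores a (cos, sin) pair for every
+        position/frequency combination.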
+ """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).float() + + cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if dtype in (torch.float16, torch.bfloat16, torch.int8): + cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl( + max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [sq, b, np, hn] + sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:sq] + xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) + rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +class RMSNorm(torch.nn.Module): + def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + return (self.weight * hidden_states).to(input_dtype) + + +class CoreAttention(torch.nn.Module): + def __init__(self, config: ChatGLMConfig, layer_number): + super(CoreAttention, self).__init__() + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. 
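+        # Editor's check against this checkpoint's config.json: with
+        # kv_channels=128 and num_attention_heads=32, projection_size is
+        # 128 * 32 = 4096, and each head covers 4096 // 32 = 128 channels.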
+        self.hidden_size_per_partition = projection_size
+        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
+        self.num_attention_heads_per_partition = config.num_attention_heads
+
+        coeff = None
+        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+        if self.apply_query_key_layer_scaling:
+            coeff = self.layer_number
+            self.norm_factor *= coeff
+        self.coeff = coeff
+
+        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
+
+    def forward(self, query_layer, key_layer, value_layer, attention_mask):
+        pytorch_major_version = int(torch.__version__.split('.')[0])
+        if pytorch_major_version >= 2:
+            query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
+            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
+                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                                 is_causal=True)
+            else:
+                if attention_mask is not None:
+                    attention_mask = ~attention_mask
+                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                                 attention_mask)
+            context_layer = context_layer.permute(2, 0, 1, 3)
+            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+            context_layer = context_layer.reshape(*new_context_layer_shape)
+        else:
+            # Editor's note: fallback path for torch < 2, which lacks
+            # scaled_dot_product_attention.
+
+            # Raw attention scores
+
+            # [b, np, sq, sk]
+            output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
+
+            # [sq, b, np, hn] -> [sq, b * np, hn]
+            query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
+            # [sk, b, np, hn] -> [sk, b * np, hn]
+            key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+
+            # preallocating input tensor: [b * np, sq, sk]
+            matmul_input_buffer = torch.empty(
+                output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
+                device=query_layer.device
+            )
+
+            # Raw attention scores. [b * np, sq, sk]
+            matmul_result = torch.baddbmm(
+                matmul_input_buffer,
+                query_layer.transpose(0, 1),  # [b * np, sq, hn]
+                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+                beta=0.0,
+                alpha=(1.0 / self.norm_factor),
+            )
+
+            # change view to [b, np, sq, sk]
+            attention_scores = matmul_result.view(*output_size)
+
+            # ===========================
+            # Attention probs and dropout
+            # ===========================
+
+            # attention scores and attention mask [b, np, sq, sk]
+            if self.attention_softmax_in_fp32:
+                attention_scores = attention_scores.float()
+            if self.coeff is not None:
+                attention_scores = attention_scores * self.coeff
+            if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
+                attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
+                                            device=attention_scores.device, dtype=torch.bool)
+                attention_mask.tril_()
+                attention_mask = ~attention_mask
+            if attention_mask is not None:
+                attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
+            attention_probs = F.softmax(attention_scores, dim=-1)
+            attention_probs = attention_probs.type_as(value_layer)
+
+            # This is actually dropping out entire tokens to attend to, which might
+            # seem a bit unusual, but is taken from the original Transformer paper.
+            attention_probs = self.attention_dropout(attention_probs)
+            # =========================
+            # Context layer. [sq, b, hp]
+            # =========================
+
+            # value_layer -> context layer.
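+            # Editor's note: the bmm below contracts the source length sk,
+            # [b * np, sq, sk] @ [b * np, sk, hn] -> [b * np, sq, hn].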
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class SelfAttention(torch.nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + + self.projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + self.multi_query_attention = config.multi_query_attention + self.qkv_hidden_size = 3 * self.projection_size + if self.multi_query_attention: + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.qkv_hidden_size = ( + self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num + ) + self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, + bias=config.add_bias_linear or config.add_qkv_bias, + device=device, **_config_to_kwargs(config) + ) + + self.core_attention = CoreAttention(config, self.layer_number) + + # Output. + self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, + device=device, **_config_to_kwargs(config) + ) + + def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): + if self.multi_query_attention: + num_attention_heads = self.num_multi_query_groups_per_partition + else: + num_attention_heads = self.num_attention_heads_per_partition + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=dtype, + device=device, + ) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True + ): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. 
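+        # (Editor's note: despite this header, forward() below never calls
+        #  _allocate_memory; the kv cache is instead grown step by step with
+        #  torch.cat when kv_cache is passed in.)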
+ # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = torch.cat((cache_k, key_layer), dim=0) + value_layer = torch.cat((cache_v, value_layer), dim=0) + if use_cache: + kv_cache = (key_layer, value_layer) + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = key_layer.unsqueeze(-2) + key_layer = key_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.unsqueeze(-2) + value_layer = value_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def _config_to_kwargs(args): + common_kwargs = { + "dtype": args.torch_dtype, + } + return common_kwargs + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMConfig, device=None): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear( + config.hidden_size, + config.ffn_hidden_size * 2, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.activation_func = swiglu + + # Project back to h. + self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(torch.nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # Self attention. + self.self_attention = SelfAttention(config, layer_number, device=device) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # MLP + self.mlp = MLP(config, device=device) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, + attention_mask, + rotary_pos_emb, + kv_cache=kv_cache, + use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + + return output, kv_cache + + +class GLMTransformer(torch.nn.Module): + """Transformer class.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(GLMTransformer, self).__init__() + + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. 
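+        # Editor's note: layers are numbered from 1 (build_layer(i + 1) below),
+        # so CoreAttention can use the layer index as its query-key scaling
+        # coefficient when apply_query_key_layer_scaling is enabled.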
+ def build_layer(layer_number): + return GLMBlock(config, layer_number, device=device) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + self.gradient_checkpointing = False + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_layers)] + presents = () if use_cache else None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + for index in range(self.num_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + if self.gradient_checkpointing and self.training: + layer_ret = torch.utils.checkpoint.checkpoint( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches[index], + use_cache + ) + else: + layer_ret = layer( + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache + ) + hidden_states, kv_cache = layer_ret + if use_cache: + presents = presents + (kv_cache,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. 
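+
+    Editor's note: subclasses inherit the causal-mask helper get_masks and
+    the position-id helper get_position_ids defined below.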
+ """ + + is_parallelizable = False + supports_gradient_checkpointing = True + config_class = ChatGLMConfig + base_model_prefix = "transformer" + _no_split_modules = ["GLMBlock"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + return + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) + full_attention_mask.tril_() + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, + device=input_ids.device), full_attention_mask), dim=-1) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) + if not past_length and padding_mask is not None: + full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + def get_position_ids(self, input_ids, device): + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + return position_ids + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GLMTransformer): + module.gradient_checkpointing = value + + +class Embedding(torch.nn.Module): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + # Word embeddings (parallel). + self.word_embeddings = nn.Embedding( + config.padded_vocab_size, + self.hidden_size, + dtype=config.torch_dtype, + device=device + ) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + # If the input flag for fp32 residual connection is set, convert for float. 
+ if self.fp32_residual_connection: + embeddings = embeddings.float() + return embeddings + + +class ChatGLMModel(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): + super().__init__(config) + if empty_init: + init_method = skip_init + else: + init_method = default_init + init_kwargs = {} + if device is not None: + init_kwargs["device"] = device + self.embedding = init_method(Embedding, config, **init_kwargs) + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + + # Rotary positional embeddings + self.seq_length = config.seq_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, + dtype=config.torch_dtype) + self.encoder = init_method(GLMTransformer, config, **init_kwargs) + self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, + dtype=config.torch_dtype, **init_kwargs) + self.pre_seq_len = config.pre_seq_len + self.prefix_projection = config.prefix_projection + if self.pre_seq_len is not None: + # Prefix tuning (P-tuning v2): freeze every base-model weight; only the + # prefix encoder constructed below receives gradients. + for param in self.parameters(): + param.requires_grad = False + self.prefix_tokens = torch.arange(self.pre_seq_len).long() + self.prefix_encoder = PrefixEncoder(config) + self.dropout = torch.nn.Dropout(0.1) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def get_prompt(self, batch_size, device, dtype=torch.half): + prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) + past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) + past_key_values = past_key_values.view( + batch_size, + self.pre_seq_len, + self.num_layers * 2, + self.multi_query_group_num, + self.kv_channels + ) + # Permute to [num_layers * 2, pre_seq_len, batch, groups, kv_channels] and + # split into per-layer (key, value) pairs. + past_key_values = self.dropout(past_key_values) + past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) + return past_key_values + + def forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if self.pre_seq_len is not None: + if past_key_values is None: + past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, + dtype=inputs_embeds.dtype) + if attention_mask is not None: + attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), + attention_mask], dim=-1) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary
positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() + + # Run encoder. + hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def quantize(self, weight_bit_width: int): + from .quantization import quantize + quantize(self.encoder, weight_bit_width) + return self + + +class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.max_sequence_length = config.max_length + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + self.config = config + self.quantized = False + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat( + [position_ids, new_position_id], dim=-1 + ) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids, device=input_ids.device) + if not is_first_forward: + if past_key_values is not None: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": use_cache + } + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: 
Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + if return_last_logit: + hidden_states = hidden_states[-1:] + lm_logits = self.transformer.output_layer(hidden_states) + lm_logits = lm_logits.transpose(0, 1).contiguous() + + loss = None + if labels is not None: + lm_logits = lm_logits.to(torch.float32) + + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + lm_logits = lm_logits.to(hidden_states.dtype) + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache( + past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. 
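+ + Example (illustrative sketch, not part of the original docstring; assumes + the cache layout [seq_len, batch * num_beams, heads, head_dim] used here): + with num_beams=4 and beam_idx = torch.tensor([2, 2, 0, 1]), each layer's + key and value are re-gathered along dim 1, so the new beam 0 continues the + old beam 2, and so on.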
+ """ + return tuple( + ( + layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), + layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), + ) + for layer_past in past + ) + + def process_response(self, output, history): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content, history + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + inputs = tokenizer.build_chat_input(query, history=history, role=role) + inputs = inputs.to(self.device) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + history.append({"role": role, "content": query}) + response, history = self.process_response(response, history) + return response, history + + @torch.inference_mode() + def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, + logits_processor=None, return_past_key_values=False, **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + if past_key_values is None: + inputs = tokenizer.build_chat_input(query, history=history, role=role) + else: + inputs = tokenizer.build_chat_input(query, role=role) + inputs = inputs.to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[0] + if self.transformer.pre_seq_len is not None: + past_length -= self.transformer.pre_seq_len + inputs.position_ids += past_length + attention_mask = inputs.attention_mask + attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) + inputs['attention_mask'] = attention_mask + history.append({"role": role, "content": query}) + for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **gen_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + if response and response[-1] != "�": # skip yields ending in a partially decoded UTF-8 character + response, new_history = self.process_response(response, history) + if return_past_key_values: + yield response, new_history, past_key_values + else: + yield response, new_history + + @torch.inference_mode() + def stream_generate( + self, + input_ids, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values=False, + **kwargs, + ): + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + + if generation_config is None: + generation_config = self.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + model_kwargs["use_cache"] = generation_config.use_cache + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if has_default_max_length and generation_config.max_new_tokens is None: + warnings.warn( + f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " + "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + " recommend using `max_new_tokens` to control the maximum length of the generation.", + UserWarning, + ) + elif generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + if not has_default_max_length: + # logger.warning (not the deprecated logger.warn) takes no warning + # category, so the stray UserWarning argument was dropped here. + logger.warning( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" + ) + + if input_ids_seq_length >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + + # 2.
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + logits_warper = self._get_logits_warper(generation_config) + + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + while True: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break + + def quantize(self, bits: int, empty_init=False, device=None, **kwargs): + if bits == 0: + return + + from .quantization import quantize + + if self.quantized: + logger.info("Already quantized.") + return self + + self.quantized = True + + self.config.quantization_bit = bits + + self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, + **kwargs) + return self + + +class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.num_labels = config.num_labels + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + + self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) + if config.classifier_dropout is not None: + self.dropout = nn.Dropout(config.classifier_dropout) + else: + self.dropout = None + self.config = config + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: 
Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + pooled_hidden_states = hidden_states[-1] + if self.dropout is not None: + pooled_hidden_states = self.dropout(pooled_hidden_states) + logits = self.classifier_head(pooled_hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze().float(), labels.squeeze()) + else: + loss = loss_fct(logits.float(), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/optimizer.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad44085e9bc8966822377ccaefaf97559b0caa8e --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dd1da17d3b6b12fe5b24290a2d06569fb2d3550d90f5e90e2eb102ea5fe310b +size 14682210 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/pytorch_model.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..53f07f8d3e9b4b3da941a4648f77fe1b33e4afd3 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b13806436bde90f40badb60acfb6ae15d876df2742640232e36e8b5d07fab9b0 +size 7341306 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/quantization.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..cb95bfe82b203ff6a2aa962326d2c7a438d6a52f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/quantization.py @@ -0,0 
+1,188 @@ +from torch.nn import Linear +from torch.nn.parameter import Parameter + +import bz2 +import torch +import base64 +import ctypes +from transformers.utils import logging + +from typing import List +from functools import partial + +logger = logging.get_logger(__name__) + +try: + from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up + + class Kernel: + def __init__(self, code: bytes, function_names: List[str]): + self.code = code + self._function_names = function_names + self._cmodule = LazyKernelCModule(self.code) + + for name in self._function_names: + setattr(self, name, KernelFunction(self._cmodule, name)) + + quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl
4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzf
ZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ" + + kernels = Kernel( + bz2.decompress(base64.b64decode(quantization_code)), + [ + "int4WeightCompression", + "int4WeightExtractionFloat", + "int4WeightExtractionHalf", + "int8WeightExtractionFloat", + "int8WeightExtractionHalf", + ], + ) +except Exception as exception: + kernels = None + logger.warning("Failed to load cpm_kernels:" + str(exception)) + + +class W8A16Linear(torch.autograd.Function): + @staticmethod + def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): + ctx.inp_shape = inp.size() + ctx.weight_bit_width = weight_bit_width + out_features = quant_w.size(0) + inp = inp.contiguous().view(-1, inp.size(-1)) + weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) + ctx.weight_shape = weight.size() + output = inp.mm(weight.t()) + ctx.save_for_backward(inp, quant_w, scale_w) + return output.view(*(ctx.inp_shape[:-1] + (out_features,))) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + inp, quant_w, scale_w = ctx.saved_tensors + weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) + grad_output = grad_output.contiguous().view(-1, weight.size(0)) + grad_input = grad_output.mm(weight) + grad_weight = grad_output.t().mm(inp) + return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None + + +def compress_int4_weight(weight: torch.Tensor): # (n, m) + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + assert m % 2 == 0 + m = m // 2 + out = torch.empty(n, m, dtype=torch.int8, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + kernels.int4WeightCompression( + gridDim, + blockDim, + 0, + stream, + [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], + ) + return out + + +def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): + assert scale_list.dtype in [torch.half, torch.bfloat16] + assert weight.dtype in [torch.int8] + if source_bit_width == 8: + return weight.to(scale_list.dtype) * scale_list[:, None] + elif source_bit_width == 4: + func = ( + kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16 + ) + else: + assert False, "Unsupported bit-width" + + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = 
(min(round_up(m, 32), 1024), 1, 1) + + func( + gridDim, + blockDim, + 0, + stream, + [ + ctypes.c_void_p(weight.data_ptr()), + ctypes.c_void_p(scale_list.data_ptr()), + ctypes.c_void_p(out.data_ptr()), + ctypes.c_int32(n), + ctypes.c_int32(m), + ], + ) + return out + + +class QuantizedLinear(torch.nn.Module): + def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args, + **kwargs): + super().__init__() + self.weight_bit_width = weight_bit_width + + # NOTE: `weight` must be a real tensor even for empty_init, since its shape + # defines the int8 buffer layout; the former "weight is None" guard was + # unreachable because reading weight.shape here would already have raised. + shape = weight.shape + + if empty_init: + self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device) + self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device) + else: + # Symmetric per-output-row quantization: scale each row by its abs-max + # over the signed integer range, then round to int8. + self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1) + self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) + if weight_bit_width == 4: + self.weight = compress_int4_weight(self.weight) + + self.weight = Parameter(self.weight.to(device), requires_grad=False) + self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False) + self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None + + def forward(self, input): + output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width) + if self.bias is not None: + output = output + self.bias + return output + + +def quantize(model, weight_bit_width, empty_init=False, device=None): + """Replace fp16 linear with quantized linear""" + for layer in model.layers: + layer.self_attention.query_key_value = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()), + bias=layer.self_attention.query_key_value.bias, + dtype=layer.self_attention.query_key_value.weight.dtype, + device=layer.self_attention.query_key_value.weight.device if device is None else device, + empty_init=empty_init + ) + layer.self_attention.dense = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()), + bias=layer.self_attention.dense.bias, + dtype=layer.self_attention.dense.weight.dtype, + device=layer.self_attention.dense.weight.device if device is None else device, + empty_init=empty_init + ) + layer.mlp.dense_h_to_4h = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()), + bias=layer.mlp.dense_h_to_4h.bias, + dtype=layer.mlp.dense_h_to_4h.weight.dtype, + device=layer.mlp.dense_h_to_4h.weight.device if device is None else device, + empty_init=empty_init + ) + layer.mlp.dense_4h_to_h = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()), + bias=layer.mlp.dense_4h_to_h.bias, + dtype=layer.mlp.dense_4h_to_h.weight.dtype, + device=layer.mlp.dense_4h_to_h.weight.device if device is None else device, + empty_init=empty_init + ) + + return model diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/rng_state.pth b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8d79580df3289aeba2c49bc7ba0545698f615dc9 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9a63e97433d3fef7b3a60f533854fd6ddf541e22f57319249a84c8c03349901 +size 
14244 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/scheduler.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..84975cb4556df93d442cc53ad9a0422ccb68dfa3 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03807c669b0036b32d733f45cd6dd9532812cb5c07571d10756d30ca0c75581c +size 1064 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/special_tokens_map.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenization_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += 
self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) to an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) to a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "<bos>": self.tokenizer.bos_id, + "<eos>": self.tokenizer.eos_id, + "<pad>": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "<unk>" + + @property + def pad_token(self) -> str: + return "<unk>" + + @property + def pad_token_id(self): + return self.get_command("<pad>") + + @property + def eos_token(self) -> str: + return "</s>" + + @property + def eos_token_id(self): + return self.get_command("<eos>") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) to an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) to a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the names of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved.
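+ + Example (illustrative, not part of the original docstring; "./export" is + a hypothetical directory): + + tokenizer.save_vocabulary("./export") # copies tokenizer.model verbatim + # -> ("./export/tokenizer.model",)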
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer.model b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5faafd38f2e2934804feb0e7d71ebf08b0839bf5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer_config.json @@ -0,0 +1,18 @@ +{ + "added_tokens_decoder": {}, + "additional_special_tokens": [], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/trainer_state.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ce58b9e749f072c05c203cd33107a6fd6b1ef57c --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/trainer_state.json @@ -0,0 +1,1219 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 18.823529411764707, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "learning_rate": 0.009985714285714285, + "loss": 2.6971, + "step": 1 + }, + { + 
"epoch": 0.19, + "learning_rate": 0.009971428571428572, + "loss": 2.3927, + "step": 2 + }, + { + "epoch": 0.28, + "learning_rate": 0.009957142857142857, + "loss": 2.2539, + "step": 3 + }, + { + "epoch": 0.38, + "learning_rate": 0.009942857142857144, + "loss": 2.1408, + "step": 4 + }, + { + "epoch": 0.47, + "learning_rate": 0.009928571428571429, + "loss": 2.2672, + "step": 5 + }, + { + "epoch": 0.56, + "learning_rate": 0.009914285714285714, + "loss": 1.6433, + "step": 6 + }, + { + "epoch": 0.66, + "learning_rate": 0.0099, + "loss": 2.1405, + "step": 7 + }, + { + "epoch": 0.75, + "learning_rate": 0.009885714285714286, + "loss": 2.1464, + "step": 8 + }, + { + "epoch": 0.85, + "learning_rate": 0.009871428571428571, + "loss": 1.8498, + "step": 9 + }, + { + "epoch": 0.94, + "learning_rate": 0.009857142857142858, + "loss": 1.6896, + "step": 10 + }, + { + "epoch": 1.04, + "learning_rate": 0.009842857142857143, + "loss": 2.1932, + "step": 11 + }, + { + "epoch": 1.13, + "learning_rate": 0.00982857142857143, + "loss": 1.8236, + "step": 12 + }, + { + "epoch": 1.22, + "learning_rate": 0.009814285714285715, + "loss": 1.735, + "step": 13 + }, + { + "epoch": 1.32, + "learning_rate": 0.0098, + "loss": 1.7488, + "step": 14 + }, + { + "epoch": 1.41, + "learning_rate": 0.009785714285714285, + "loss": 1.8336, + "step": 15 + }, + { + "epoch": 1.51, + "learning_rate": 0.009771428571428572, + "loss": 1.9438, + "step": 16 + }, + { + "epoch": 1.6, + "learning_rate": 0.009757142857142858, + "loss": 1.7178, + "step": 17 + }, + { + "epoch": 1.69, + "learning_rate": 0.009742857142857143, + "loss": 1.5714, + "step": 18 + }, + { + "epoch": 1.79, + "learning_rate": 0.009728571428571428, + "loss": 1.537, + "step": 19 + }, + { + "epoch": 1.88, + "learning_rate": 0.009714285714285715, + "loss": 1.6764, + "step": 20 + }, + { + "epoch": 1.98, + "learning_rate": 0.0097, + "loss": 1.8919, + "step": 21 + }, + { + "epoch": 2.07, + "learning_rate": 0.009685714285714285, + "loss": 1.346, + "step": 22 + }, + { + "epoch": 2.16, + "learning_rate": 0.009671428571428572, + "loss": 1.5036, + "step": 23 + }, + { + "epoch": 2.26, + "learning_rate": 0.009657142857142857, + "loss": 1.6788, + "step": 24 + }, + { + "epoch": 2.35, + "learning_rate": 0.009642857142857144, + "loss": 1.6667, + "step": 25 + }, + { + "epoch": 2.45, + "learning_rate": 0.009628571428571429, + "loss": 1.7153, + "step": 26 + }, + { + "epoch": 2.54, + "learning_rate": 0.009614285714285714, + "loss": 1.601, + "step": 27 + }, + { + "epoch": 2.64, + "learning_rate": 0.0096, + "loss": 1.3002, + "step": 28 + }, + { + "epoch": 2.73, + "learning_rate": 0.009585714285714286, + "loss": 1.3294, + "step": 29 + }, + { + "epoch": 2.82, + "learning_rate": 0.009571428571428573, + "loss": 1.7477, + "step": 30 + }, + { + "epoch": 2.92, + "learning_rate": 0.009557142857142858, + "loss": 1.7961, + "step": 31 + }, + { + "epoch": 3.01, + "learning_rate": 0.009542857142857143, + "loss": 1.4954, + "step": 32 + }, + { + "epoch": 3.11, + "learning_rate": 0.009528571428571428, + "loss": 1.6452, + "step": 33 + }, + { + "epoch": 3.2, + "learning_rate": 0.009514285714285715, + "loss": 1.3528, + "step": 34 + }, + { + "epoch": 3.29, + "learning_rate": 0.0095, + "loss": 1.4811, + "step": 35 + }, + { + "epoch": 3.39, + "learning_rate": 0.009485714285714287, + "loss": 1.4738, + "step": 36 + }, + { + "epoch": 3.48, + "learning_rate": 0.009471428571428572, + "loss": 1.174, + "step": 37 + }, + { + "epoch": 3.58, + "learning_rate": 0.009457142857142857, + "loss": 1.2346, + "step": 38 + }, + { + "epoch": 3.67, 
+ "learning_rate": 0.009442857142857143, + "loss": 1.5327, + "step": 39 + }, + { + "epoch": 3.76, + "learning_rate": 0.009428571428571429, + "loss": 1.5249, + "step": 40 + }, + { + "epoch": 3.86, + "learning_rate": 0.009414285714285714, + "loss": 1.5086, + "step": 41 + }, + { + "epoch": 3.95, + "learning_rate": 0.0094, + "loss": 1.8425, + "step": 42 + }, + { + "epoch": 4.05, + "learning_rate": 0.009385714285714287, + "loss": 1.1943, + "step": 43 + }, + { + "epoch": 4.14, + "learning_rate": 0.009371428571428572, + "loss": 1.6835, + "step": 44 + }, + { + "epoch": 4.24, + "learning_rate": 0.009357142857142857, + "loss": 1.75, + "step": 45 + }, + { + "epoch": 4.33, + "learning_rate": 0.009342857142857142, + "loss": 1.2561, + "step": 46 + }, + { + "epoch": 4.42, + "learning_rate": 0.009328571428571429, + "loss": 1.3784, + "step": 47 + }, + { + "epoch": 4.52, + "learning_rate": 0.009314285714285714, + "loss": 1.2538, + "step": 48 + }, + { + "epoch": 4.61, + "learning_rate": 0.009300000000000001, + "loss": 1.4429, + "step": 49 + }, + { + "epoch": 4.71, + "learning_rate": 0.009285714285714286, + "loss": 1.3687, + "step": 50 + }, + { + "epoch": 4.8, + "learning_rate": 0.009271428571428571, + "loss": 1.1511, + "step": 51 + }, + { + "epoch": 4.89, + "learning_rate": 0.009257142857142858, + "loss": 1.181, + "step": 52 + }, + { + "epoch": 4.99, + "learning_rate": 0.009242857142857143, + "loss": 1.1753, + "step": 53 + }, + { + "epoch": 5.08, + "learning_rate": 0.009228571428571428, + "loss": 1.1562, + "step": 54 + }, + { + "epoch": 5.18, + "learning_rate": 0.009214285714285715, + "loss": 1.2936, + "step": 55 + }, + { + "epoch": 5.27, + "learning_rate": 0.0092, + "loss": 1.3591, + "step": 56 + }, + { + "epoch": 5.36, + "learning_rate": 0.009185714285714287, + "loss": 1.1376, + "step": 57 + }, + { + "epoch": 5.46, + "learning_rate": 0.009171428571428572, + "loss": 1.372, + "step": 58 + }, + { + "epoch": 5.55, + "learning_rate": 0.009157142857142857, + "loss": 1.5141, + "step": 59 + }, + { + "epoch": 5.65, + "learning_rate": 0.009142857142857144, + "loss": 1.2087, + "step": 60 + }, + { + "epoch": 5.74, + "learning_rate": 0.009128571428571429, + "loss": 1.136, + "step": 61 + }, + { + "epoch": 5.84, + "learning_rate": 0.009114285714285715, + "loss": 1.2948, + "step": 62 + }, + { + "epoch": 5.93, + "learning_rate": 0.0091, + "loss": 1.0592, + "step": 63 + }, + { + "epoch": 6.02, + "learning_rate": 0.009085714285714286, + "loss": 1.2321, + "step": 64 + }, + { + "epoch": 6.12, + "learning_rate": 0.009071428571428572, + "loss": 1.0827, + "step": 65 + }, + { + "epoch": 6.21, + "learning_rate": 0.009057142857142857, + "loss": 1.1136, + "step": 66 + }, + { + "epoch": 6.31, + "learning_rate": 0.009042857142857142, + "loss": 1.475, + "step": 67 + }, + { + "epoch": 6.4, + "learning_rate": 0.009028571428571427, + "loss": 1.1316, + "step": 68 + }, + { + "epoch": 6.49, + "learning_rate": 0.009014285714285714, + "loss": 1.1688, + "step": 69 + }, + { + "epoch": 6.59, + "learning_rate": 0.009000000000000001, + "loss": 1.0882, + "step": 70 + }, + { + "epoch": 6.68, + "learning_rate": 0.008985714285714286, + "loss": 1.1085, + "step": 71 + }, + { + "epoch": 6.78, + "learning_rate": 0.008971428571428571, + "loss": 1.2029, + "step": 72 + }, + { + "epoch": 6.87, + "learning_rate": 0.008957142857142856, + "loss": 1.098, + "step": 73 + }, + { + "epoch": 6.96, + "learning_rate": 0.008942857142857143, + "loss": 1.219, + "step": 74 + }, + { + "epoch": 7.06, + "learning_rate": 0.00892857142857143, + "loss": 1.0092, + "step": 75 + }, + 
{ + "epoch": 7.15, + "learning_rate": 0.008914285714285715, + "loss": 1.0112, + "step": 76 + }, + { + "epoch": 7.25, + "learning_rate": 0.0089, + "loss": 1.1481, + "step": 77 + }, + { + "epoch": 7.34, + "learning_rate": 0.008885714285714287, + "loss": 0.9873, + "step": 78 + }, + { + "epoch": 7.44, + "learning_rate": 0.008871428571428572, + "loss": 1.0586, + "step": 79 + }, + { + "epoch": 7.53, + "learning_rate": 0.008857142857142857, + "loss": 1.1177, + "step": 80 + }, + { + "epoch": 7.62, + "learning_rate": 0.008842857142857142, + "loss": 0.7814, + "step": 81 + }, + { + "epoch": 7.72, + "learning_rate": 0.008828571428571429, + "loss": 1.2043, + "step": 82 + }, + { + "epoch": 7.81, + "learning_rate": 0.008814285714285715, + "loss": 1.0062, + "step": 83 + }, + { + "epoch": 7.91, + "learning_rate": 0.0088, + "loss": 1.0831, + "step": 84 + }, + { + "epoch": 8.0, + "learning_rate": 0.008785714285714286, + "loss": 0.9554, + "step": 85 + }, + { + "epoch": 8.09, + "learning_rate": 0.00877142857142857, + "loss": 1.1674, + "step": 86 + }, + { + "epoch": 8.19, + "learning_rate": 0.008757142857142857, + "loss": 0.8226, + "step": 87 + }, + { + "epoch": 8.28, + "learning_rate": 0.008742857142857144, + "loss": 0.9166, + "step": 88 + }, + { + "epoch": 8.38, + "learning_rate": 0.00872857142857143, + "loss": 0.734, + "step": 89 + }, + { + "epoch": 8.47, + "learning_rate": 0.008714285714285714, + "loss": 0.8641, + "step": 90 + }, + { + "epoch": 8.56, + "learning_rate": 0.0087, + "loss": 0.9517, + "step": 91 + }, + { + "epoch": 8.66, + "learning_rate": 0.008685714285714286, + "loss": 0.9995, + "step": 92 + }, + { + "epoch": 8.75, + "learning_rate": 0.008671428571428571, + "loss": 0.763, + "step": 93 + }, + { + "epoch": 8.85, + "learning_rate": 0.008657142857142858, + "loss": 1.0712, + "step": 94 + }, + { + "epoch": 8.94, + "learning_rate": 0.008642857142857143, + "loss": 1.1111, + "step": 95 + }, + { + "epoch": 9.04, + "learning_rate": 0.008628571428571428, + "loss": 0.9626, + "step": 96 + }, + { + "epoch": 9.13, + "learning_rate": 0.008614285714285715, + "loss": 0.6385, + "step": 97 + }, + { + "epoch": 9.22, + "learning_rate": 0.0086, + "loss": 0.8147, + "step": 98 + }, + { + "epoch": 9.32, + "learning_rate": 0.008585714285714285, + "loss": 0.8109, + "step": 99 + }, + { + "epoch": 9.41, + "learning_rate": 0.008571428571428572, + "loss": 1.0953, + "step": 100 + }, + { + "epoch": 9.51, + "learning_rate": 0.008557142857142859, + "loss": 0.7104, + "step": 101 + }, + { + "epoch": 9.6, + "learning_rate": 0.008542857142857144, + "loss": 0.9672, + "step": 102 + }, + { + "epoch": 9.69, + "learning_rate": 0.008528571428571429, + "loss": 0.7593, + "step": 103 + }, + { + "epoch": 9.79, + "learning_rate": 0.008514285714285714, + "loss": 1.0186, + "step": 104 + }, + { + "epoch": 9.88, + "learning_rate": 0.0085, + "loss": 0.7898, + "step": 105 + }, + { + "epoch": 9.98, + "learning_rate": 0.008485714285714286, + "loss": 0.7392, + "step": 106 + }, + { + "epoch": 10.07, + "learning_rate": 0.008471428571428572, + "loss": 0.7295, + "step": 107 + }, + { + "epoch": 10.16, + "learning_rate": 0.008457142857142858, + "loss": 0.7211, + "step": 108 + }, + { + "epoch": 10.26, + "learning_rate": 0.008442857142857143, + "loss": 0.769, + "step": 109 + }, + { + "epoch": 10.35, + "learning_rate": 0.00842857142857143, + "loss": 0.718, + "step": 110 + }, + { + "epoch": 10.45, + "learning_rate": 0.008414285714285714, + "loss": 0.6411, + "step": 111 + }, + { + "epoch": 10.54, + "learning_rate": 0.0084, + "loss": 0.8016, + "step": 112 + }, + { 
+ "epoch": 10.64, + "learning_rate": 0.008385714285714286, + "loss": 0.6633, + "step": 113 + }, + { + "epoch": 10.73, + "learning_rate": 0.008371428571428571, + "loss": 0.7257, + "step": 114 + }, + { + "epoch": 10.82, + "learning_rate": 0.008357142857142858, + "loss": 0.7785, + "step": 115 + }, + { + "epoch": 10.92, + "learning_rate": 0.008342857142857143, + "loss": 0.8927, + "step": 116 + }, + { + "epoch": 11.01, + "learning_rate": 0.008328571428571428, + "loss": 0.7242, + "step": 117 + }, + { + "epoch": 11.11, + "learning_rate": 0.008314285714285715, + "loss": 0.8297, + "step": 118 + }, + { + "epoch": 11.2, + "learning_rate": 0.0083, + "loss": 0.6761, + "step": 119 + }, + { + "epoch": 11.29, + "learning_rate": 0.008285714285714287, + "loss": 0.6699, + "step": 120 + }, + { + "epoch": 11.39, + "learning_rate": 0.008271428571428572, + "loss": 0.5365, + "step": 121 + }, + { + "epoch": 11.48, + "learning_rate": 0.008257142857142857, + "loss": 0.9045, + "step": 122 + }, + { + "epoch": 11.58, + "learning_rate": 0.008242857142857144, + "loss": 0.5071, + "step": 123 + }, + { + "epoch": 11.67, + "learning_rate": 0.008228571428571429, + "loss": 0.6472, + "step": 124 + }, + { + "epoch": 11.76, + "learning_rate": 0.008214285714285714, + "loss": 0.6232, + "step": 125 + }, + { + "epoch": 11.86, + "learning_rate": 0.008199999999999999, + "loss": 0.4905, + "step": 126 + }, + { + "epoch": 11.95, + "learning_rate": 0.008185714285714286, + "loss": 0.557, + "step": 127 + }, + { + "epoch": 12.05, + "learning_rate": 0.008171428571428573, + "loss": 0.5517, + "step": 128 + }, + { + "epoch": 12.14, + "learning_rate": 0.008157142857142858, + "loss": 0.6321, + "step": 129 + }, + { + "epoch": 12.24, + "learning_rate": 0.008142857142857143, + "loss": 0.6619, + "step": 130 + }, + { + "epoch": 12.33, + "learning_rate": 0.008128571428571428, + "loss": 0.5524, + "step": 131 + }, + { + "epoch": 12.42, + "learning_rate": 0.008114285714285715, + "loss": 0.4688, + "step": 132 + }, + { + "epoch": 12.52, + "learning_rate": 0.008100000000000001, + "loss": 0.3717, + "step": 133 + }, + { + "epoch": 12.61, + "learning_rate": 0.008085714285714286, + "loss": 0.5118, + "step": 134 + }, + { + "epoch": 12.71, + "learning_rate": 0.008071428571428571, + "loss": 0.4521, + "step": 135 + }, + { + "epoch": 12.8, + "learning_rate": 0.008057142857142856, + "loss": 0.5865, + "step": 136 + }, + { + "epoch": 12.89, + "learning_rate": 0.008042857142857143, + "loss": 0.5977, + "step": 137 + }, + { + "epoch": 12.99, + "learning_rate": 0.008028571428571428, + "loss": 0.6977, + "step": 138 + }, + { + "epoch": 13.08, + "learning_rate": 0.008014285714285713, + "loss": 0.5625, + "step": 139 + }, + { + "epoch": 13.18, + "learning_rate": 0.008, + "loss": 0.3611, + "step": 140 + }, + { + "epoch": 13.27, + "learning_rate": 0.007985714285714287, + "loss": 0.5168, + "step": 141 + }, + { + "epoch": 13.36, + "learning_rate": 0.007971428571428572, + "loss": 0.4429, + "step": 142 + }, + { + "epoch": 13.46, + "learning_rate": 0.007957142857142857, + "loss": 0.4998, + "step": 143 + }, + { + "epoch": 13.55, + "learning_rate": 0.007942857142857142, + "loss": 0.4437, + "step": 144 + }, + { + "epoch": 13.65, + "learning_rate": 0.007928571428571429, + "loss": 0.4958, + "step": 145 + }, + { + "epoch": 13.74, + "learning_rate": 0.007914285714285716, + "loss": 0.4021, + "step": 146 + }, + { + "epoch": 13.84, + "learning_rate": 0.0079, + "loss": 0.6163, + "step": 147 + }, + { + "epoch": 13.93, + "learning_rate": 0.007885714285714286, + "loss": 0.406, + "step": 148 + }, + { + 
"epoch": 14.02, + "learning_rate": 0.007871428571428571, + "loss": 0.4905, + "step": 149 + }, + { + "epoch": 14.12, + "learning_rate": 0.007857142857142858, + "loss": 0.3824, + "step": 150 + }, + { + "epoch": 14.21, + "learning_rate": 0.007842857142857143, + "loss": 0.3591, + "step": 151 + }, + { + "epoch": 14.31, + "learning_rate": 0.007828571428571428, + "loss": 0.342, + "step": 152 + }, + { + "epoch": 14.4, + "learning_rate": 0.007814285714285715, + "loss": 0.4565, + "step": 153 + }, + { + "epoch": 14.49, + "learning_rate": 0.0078000000000000005, + "loss": 0.3287, + "step": 154 + }, + { + "epoch": 14.59, + "learning_rate": 0.007785714285714286, + "loss": 0.4179, + "step": 155 + }, + { + "epoch": 14.68, + "learning_rate": 0.0077714285714285715, + "loss": 0.3586, + "step": 156 + }, + { + "epoch": 14.78, + "learning_rate": 0.007757142857142857, + "loss": 0.4618, + "step": 157 + }, + { + "epoch": 14.87, + "learning_rate": 0.0077428571428571425, + "loss": 0.4133, + "step": 158 + }, + { + "epoch": 14.96, + "learning_rate": 0.007728571428571429, + "loss": 0.4326, + "step": 159 + }, + { + "epoch": 15.06, + "learning_rate": 0.007714285714285715, + "loss": 0.3838, + "step": 160 + }, + { + "epoch": 15.15, + "learning_rate": 0.0077, + "loss": 0.2978, + "step": 161 + }, + { + "epoch": 15.25, + "learning_rate": 0.007685714285714286, + "loss": 0.3993, + "step": 162 + }, + { + "epoch": 15.34, + "learning_rate": 0.007671428571428571, + "loss": 0.3249, + "step": 163 + }, + { + "epoch": 15.44, + "learning_rate": 0.007657142857142857, + "loss": 0.2796, + "step": 164 + }, + { + "epoch": 15.53, + "learning_rate": 0.007642857142857142, + "loss": 0.3918, + "step": 165 + }, + { + "epoch": 15.62, + "learning_rate": 0.007628571428571429, + "loss": 0.4122, + "step": 166 + }, + { + "epoch": 15.72, + "learning_rate": 0.007614285714285715, + "loss": 0.3403, + "step": 167 + }, + { + "epoch": 15.81, + "learning_rate": 0.0076, + "loss": 0.3759, + "step": 168 + }, + { + "epoch": 15.91, + "learning_rate": 0.007585714285714286, + "loss": 0.3621, + "step": 169 + }, + { + "epoch": 16.0, + "learning_rate": 0.007571428571428571, + "loss": 0.2991, + "step": 170 + }, + { + "epoch": 16.09, + "learning_rate": 0.007557142857142857, + "loss": 0.3039, + "step": 171 + }, + { + "epoch": 16.19, + "learning_rate": 0.007542857142857144, + "loss": 0.4571, + "step": 172 + }, + { + "epoch": 16.28, + "learning_rate": 0.007528571428571429, + "loss": 0.2759, + "step": 173 + }, + { + "epoch": 16.38, + "learning_rate": 0.007514285714285715, + "loss": 0.2835, + "step": 174 + }, + { + "epoch": 16.47, + "learning_rate": 0.0075, + "loss": 0.3221, + "step": 175 + }, + { + "epoch": 16.56, + "learning_rate": 0.007485714285714286, + "loss": 0.3072, + "step": 176 + }, + { + "epoch": 16.66, + "learning_rate": 0.007471428571428572, + "loss": 0.2852, + "step": 177 + }, + { + "epoch": 16.75, + "learning_rate": 0.007457142857142857, + "loss": 0.2559, + "step": 178 + }, + { + "epoch": 16.85, + "learning_rate": 0.007442857142857143, + "loss": 0.2787, + "step": 179 + }, + { + "epoch": 16.94, + "learning_rate": 0.007428571428571429, + "loss": 0.3331, + "step": 180 + }, + { + "epoch": 17.04, + "learning_rate": 0.007414285714285714, + "loss": 0.1929, + "step": 181 + }, + { + "epoch": 17.13, + "learning_rate": 0.0074, + "loss": 0.2065, + "step": 182 + }, + { + "epoch": 17.22, + "learning_rate": 0.007385714285714285, + "loss": 0.2868, + "step": 183 + }, + { + "epoch": 17.32, + "learning_rate": 0.007371428571428571, + "loss": 0.2206, + "step": 184 + }, + { + "epoch": 
17.41, + "learning_rate": 0.007357142857142858, + "loss": 0.2355, + "step": 185 + }, + { + "epoch": 17.51, + "learning_rate": 0.007342857142857143, + "loss": 0.3041, + "step": 186 + }, + { + "epoch": 17.6, + "learning_rate": 0.007328571428571429, + "loss": 0.3028, + "step": 187 + }, + { + "epoch": 17.69, + "learning_rate": 0.007314285714285714, + "loss": 0.2435, + "step": 188 + }, + { + "epoch": 17.79, + "learning_rate": 0.0073, + "loss": 0.1869, + "step": 189 + }, + { + "epoch": 17.88, + "learning_rate": 0.007285714285714285, + "loss": 0.3036, + "step": 190 + }, + { + "epoch": 17.98, + "learning_rate": 0.007271428571428571, + "loss": 0.246, + "step": 191 + }, + { + "epoch": 18.07, + "learning_rate": 0.007257142857142858, + "loss": 0.2316, + "step": 192 + }, + { + "epoch": 18.16, + "learning_rate": 0.007242857142857143, + "loss": 0.186, + "step": 193 + }, + { + "epoch": 18.26, + "learning_rate": 0.007228571428571429, + "loss": 0.2616, + "step": 194 + }, + { + "epoch": 18.35, + "learning_rate": 0.007214285714285715, + "loss": 0.2824, + "step": 195 + }, + { + "epoch": 18.45, + "learning_rate": 0.0072, + "loss": 0.2, + "step": 196 + }, + { + "epoch": 18.54, + "learning_rate": 0.007185714285714286, + "loss": 0.1978, + "step": 197 + }, + { + "epoch": 18.64, + "learning_rate": 0.007171428571428572, + "loss": 0.1897, + "step": 198 + }, + { + "epoch": 18.73, + "learning_rate": 0.007157142857142858, + "loss": 0.1958, + "step": 199 + }, + { + "epoch": 18.82, + "learning_rate": 0.0071428571428571435, + "loss": 0.203, + "step": 200 + } + ], + "logging_steps": 1.0, + "max_steps": 700, + "num_train_epochs": 70, + "save_steps": 100, + "total_flos": 2.350348643598336e+17, + "trial_name": null, + "trial_params": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/training_args.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17f9bfbf1a7cdd9e0e808e0672d55ad9ad4efb5f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00669a32a6ddac0a3243bbc04d3f1f70ffc8f89f2626c1fdafa93ce68c311aa0 +size 4664 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..50d927dc68b4eaa40bd4812b7417b3f2bd61f599 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm3-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": 
"chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 8192, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/configuration_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/configuration_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..35600185f5a26951081de0f3a41a913eaf06af99 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/configuration_chatglm.py @@ -0,0 +1,61 @@ +from transformers import PretrainedConfig + + +class ChatGLMConfig(PretrainedConfig): + model_type = "chatglm" + def __init__( + self, + num_layers=28, + padded_vocab_size=65024, + hidden_size=4096, + ffn_hidden_size=13696, + kv_channels=128, + num_attention_heads=32, + seq_length=2048, + hidden_dropout=0.0, + classifier_dropout=None, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + rmsnorm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, + add_bias_linear=False, + add_qkv_bias=False, + bias_dropout_fusion=True, + multi_query_attention=False, + multi_query_group_num=1, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=True, + fp32_residual_connection=False, + quantization_bit=0, + pre_seq_len=None, + prefix_projection=False, + **kwargs + ): + self.num_layers = num_layers + self.vocab_size = padded_vocab_size + self.padded_vocab_size = padded_vocab_size + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.kv_channels = kv_channels + self.num_attention_heads = num_attention_heads + self.seq_length = seq_length + self.hidden_dropout = hidden_dropout + self.classifier_dropout = classifier_dropout + self.attention_dropout = attention_dropout + self.layernorm_epsilon = layernorm_epsilon + self.rmsnorm = rmsnorm + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.post_layer_norm = post_layer_norm + self.add_bias_linear = add_bias_linear + self.add_qkv_bias = add_qkv_bias + self.bias_dropout_fusion = bias_dropout_fusion + self.multi_query_attention = multi_query_attention + self.multi_query_group_num = multi_query_group_num + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.fp32_residual_connection = fp32_residual_connection + self.quantization_bit = quantization_bit + self.pre_seq_len = pre_seq_len + self.prefix_projection = prefix_projection + super().__init__(**kwargs) \ No newline at end of file diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/generation_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31d22ad9418a1daba6b2bbf472ac3762cd5ce643 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.34.0" +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/modeling_chatglm.py 
b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/modeling_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b5027587016090a377f25289284b6e4f829cb4 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/modeling_chatglm.py @@ -0,0 +1,1293 @@ +""" PyTorch ChatGLM model. """ + +import math +import copy +import warnings +import re +import sys + +import torch +import torch.utils.checkpoint +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss +from torch.nn.utils import skip_init +from typing import Optional, Tuple, Union, List, Callable, Dict, Any +from copy import deepcopy + +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.generation.logits_process import LogitsProcessor +from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput + +from .configuration_chatglm import ChatGLMConfig + +# flags required to enable jit fusion kernels + +if sys.platform != 'darwin': + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" +_CONFIG_FOR_DOC = "ChatGLMConfig" + +CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "THUDM/chatglm3-6b", + # See all ChatGLM models at https://huggingface.co/models?filter=chatglm +] + + +def default_init(cls, *args, **kwargs): + return cls(*args, **kwargs) + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class PrefixEncoder(torch.nn.Module): + """ + The torch.nn model to encode the prefix + Input shape: (batch-size, prefix-length) + Output shape: (batch-size, prefix-length, 2*layers*hidden) + """ + + def __init__(self, config: ChatGLMConfig): + super().__init__() + self.prefix_projection = config.prefix_projection + if self.prefix_projection: + # Use a two-layer MLP to encode the prefix + kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 + self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) + self.trans = torch.nn.Sequential( + torch.nn.Linear(kv_size, config.hidden_size), + torch.nn.Tanh(), + torch.nn.Linear(config.hidden_size, kv_size) + ) + else: + self.embedding = torch.nn.Embedding(config.pre_seq_len, + config.num_layers * config.kv_channels * config.multi_query_group_num * 2) + + def forward(self, prefix: torch.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.trans(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. 
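+ + Example (mirroring the non-multi-query path in SelfAttention.forward below, where the + fused [sq, b, np, 3 * hn] QKV projection is split into three [sq, b, np, hn] views): + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)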
+ + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, original_impl=False, device=None, dtype=None): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + + def forward_impl( + self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 + ): + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. + """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).float() + + cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if dtype in (torch.float16, torch.bfloat16, torch.int8): + cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl( + max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [sq, b, np, hn] + sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:sq] + xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) + rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +class RMSNorm(torch.nn.Module): + def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + return (self.weight * hidden_states).to(input_dtype) + + +class CoreAttention(torch.nn.Module): + def __init__(self, config: ChatGLMConfig, layer_number): + super(CoreAttention, self).__init__() + + self.apply_query_key_layer_scaling = 
config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_partition = projection_size + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + self.coeff = coeff + + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + pytorch_major_version = int(torch.__version__.split('.')[0]) + if pytorch_major_version >= 2: + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + is_causal=True) + else: + if attention_mask is not None: + attention_mask = ~attention_mask + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + attention_mask) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + else: + # Raw attention scores + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = torch.empty( + output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, + device=query_layer.device + ) + + # Raw attention scores. 
[b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + if self.attention_softmax_in_fp32: + attention_scores = attention_scores.float() + if self.coeff is not None: + attention_scores = attention_scores * self.coeff + if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: + attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], + device=attention_scores.device, dtype=torch.bool) + attention_mask.tril_() + attention_mask = ~attention_mask + if attention_mask is not None: + attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = attention_probs.type_as(value_layer) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class SelfAttention(torch.nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + + self.projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. 
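+ # With this checkpoint's config.json (hidden_size=4096, num_attention_heads=32, + # kv_channels=128, multi_query_group_num=2) each head is 4096 // 32 = 128 dims wide, + # so the fused QKV projection below has 4096 + 2 * 128 * 2 = 4608 output columns: + # full-width queries plus two shared key/value groups for multi-query attention.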
+ self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + self.multi_query_attention = config.multi_query_attention + self.qkv_hidden_size = 3 * self.projection_size + if self.multi_query_attention: + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.qkv_hidden_size = ( + self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num + ) + self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, + bias=config.add_bias_linear or config.add_qkv_bias, + device=device, **_config_to_kwargs(config) + ) + + self.core_attention = CoreAttention(config, self.layer_number) + + # Output. + self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, + device=device, **_config_to_kwargs(config) + ) + + def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): + if self.multi_query_attention: + num_attention_heads = self.num_multi_query_groups_per_partition + else: + num_attention_heads = self.num_attention_heads_per_partition + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=dtype, + device=device, + ) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True + ): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = torch.cat((cache_k, key_layer), dim=0) + value_layer = torch.cat((cache_v, value_layer), dim=0) + if use_cache: + kv_cache = (key_layer, value_layer) + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = 
key_layer.unsqueeze(-2) + key_layer = key_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.unsqueeze(-2) + value_layer = value_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def _config_to_kwargs(args): + common_kwargs = { + "dtype": args.torch_dtype, + } + return common_kwargs + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMConfig, device=None): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear( + config.hidden_size, + config.ffn_hidden_size * 2, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.activation_func = swiglu + + # Project back to h. + self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(torch.nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # Self attention. 
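+ # Block layout (see forward below): RMSNorm -> self-attention -> dropout + residual add, + # then RMSNorm -> SwiGLU MLP -> dropout + residual add. With the checkpoint's + # apply_residual_connection_post_layernorm=False this is a standard pre-LN block.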
+ self.self_attention = SelfAttention(config, layer_number, device=device) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # MLP + self.mlp = MLP(config, device=device) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, + attention_mask, + rotary_pos_emb, + kv_cache=kv_cache, + use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + + return output, kv_cache + + +class GLMTransformer(torch.nn.Module): + """Transformer class.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(GLMTransformer, self).__init__() + + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. + def build_layer(layer_number): + return GLMBlock(config, layer_number, device=device) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + self.gradient_checkpointing = False + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_layers)] + presents = () if use_cache else None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + for index in range(self.num_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + if self.gradient_checkpointing and self.training: + layer_ret = torch.utils.checkpoint.checkpoint( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches[index], + use_cache + ) + else: + layer_ret = layer( + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache + ) + hidden_states, kv_cache = layer_ret + if use_cache: + presents = presents + (kv_cache,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + + is_parallelizable = False + supports_gradient_checkpointing = True + config_class = ChatGLMConfig + base_model_prefix = "transformer" + _no_split_modules = ["GLMBlock"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + return + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) + full_attention_mask.tril_() + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, + device=input_ids.device), full_attention_mask), dim=-1) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) + if not past_length and padding_mask is not None: + full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + def get_position_ids(self, input_ids, device): + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + return position_ids + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GLMTransformer): + module.gradient_checkpointing = value + + +class Embedding(torch.nn.Module): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + # Word embeddings (parallel). + self.word_embeddings = nn.Embedding( + config.padded_vocab_size, + self.hidden_size, + dtype=config.torch_dtype, + device=device + ) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + # If the input flag for fp32 residual connection is set, convert for float. 
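+ # (fp32_residual_connection is false in this checkpoint's config.json; when enabled, + # the residual stream is kept in float32, which should limit accumulation error + # when training or running inference in fp16.)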
+ if self.fp32_residual_connection: + embeddings = embeddings.float() + return embeddings + + +class ChatGLMModel(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): + super().__init__(config) + if empty_init: + init_method = skip_init + else: + init_method = default_init + init_kwargs = {} + if device is not None: + init_kwargs["device"] = device + self.embedding = init_method(Embedding, config, **init_kwargs) + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + + # Rotary positional embeddings + self.seq_length = config.seq_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, + dtype=config.torch_dtype) + self.encoder = init_method(GLMTransformer, config, **init_kwargs) + self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, + dtype=config.torch_dtype, **init_kwargs) + self.pre_seq_len = config.pre_seq_len + self.prefix_projection = config.prefix_projection + if self.pre_seq_len is not None: + for param in self.parameters(): + param.requires_grad = False + self.prefix_tokens = torch.arange(self.pre_seq_len).long() + self.prefix_encoder = PrefixEncoder(config) + self.dropout = torch.nn.Dropout(0.1) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def get_prompt(self, batch_size, device, dtype=torch.half): + prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) + past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) + past_key_values = past_key_values.view( + batch_size, + self.pre_seq_len, + self.num_layers * 2, + self.multi_query_group_num, + self.kv_channels + ) + # seq_len, b, nh, hidden_size + past_key_values = self.dropout(past_key_values) + past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) + return past_key_values + + def forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if self.pre_seq_len is not None: + if past_key_values is None: + past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, + dtype=inputs_embeds.dtype) + if attention_mask is not None: + attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), + attention_mask], dim=-1) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary 
positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() + + # Run encoder. + hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def quantize(self, weight_bit_width: int): + from .quantization import quantize + quantize(self.encoder, weight_bit_width) + return self + + +class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.max_sequence_length = config.max_length + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + self.config = config + self.quantized = False + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat( + [position_ids, new_position_id], dim=-1 + ) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids, device=input_ids.device) + if not is_first_forward: + if past_key_values is not None: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": use_cache + } + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: 
Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + if return_last_logit: + hidden_states = hidden_states[-1:] + lm_logits = self.transformer.output_layer(hidden_states) + lm_logits = lm_logits.transpose(0, 1).contiguous() + + loss = None + if labels is not None: + lm_logits = lm_logits.to(torch.float32) + + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + lm_logits = lm_logits.to(hidden_states.dtype) + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache( + past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. 
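+ + ChatGLM caches each layer's keys/values as [seq_len, batch, heads, head_dim] + (sequence-first), so the beam index is applied along dim 1, the batch dimension, + rather than dim 0 as in batch-first implementations.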
+ """ + return tuple( + ( + layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), + layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), + ) + for layer_past in past + ) + + def process_response(self, output, history): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content, history + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + inputs = tokenizer.build_chat_input(query, history=history, role=role) + inputs = inputs.to(self.device) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + history.append({"role": role, "content": query}) + response, history = self.process_response(response, history) + return response, history + + @torch.inference_mode() + def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, + logits_processor=None, return_past_key_values=False, **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + if past_key_values is None: + inputs = tokenizer.build_chat_input(query, history=history, role=role) + else: + inputs = tokenizer.build_chat_input(query, role=role) + inputs = inputs.to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[0] + if self.transformer.pre_seq_len is not None: + past_length -= self.transformer.pre_seq_len + inputs.position_ids += past_length + attention_mask = inputs.attention_mask + attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) + inputs['attention_mask'] = attention_mask + history.append({"role": role, "content": query}) + for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **gen_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + if response and response[-1] != "�": + response, new_history = self.process_response(response, history) + if return_past_key_values: + yield response, new_history, past_key_values + else: + yield response, new_history + + @torch.inference_mode() + def stream_generate( + self, + input_ids, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values=False, + **kwargs, + ): + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + + if generation_config is None: + generation_config = self.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + model_kwargs["use_cache"] = generation_config.use_cache + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if has_default_max_length and generation_config.max_new_tokens is None: + warnings.warn( + f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " + "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + " recommend using `max_new_tokens` to control the maximum length of the generation.", + UserWarning, + ) + elif generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + if not has_default_max_length: + logger.warn( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", + UserWarning, + ) + + if input_ids_seq_length >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + + # 2. 
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + logits_warper = self._get_logits_warper(generation_config) + + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + while True: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break + + def quantize(self, bits: int, empty_init=False, device=None, **kwargs): + if bits == 0: + return + + from .quantization import quantize + + if self.quantized: + logger.info("Already quantized.") + return self + + self.quantized = True + + self.config.quantization_bit = bits + + self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, + **kwargs) + return self + + +class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.num_labels = config.num_labels + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + + self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) + if config.classifier_dropout is not None: + self.dropout = nn.Dropout(config.classifier_dropout) + else: + self.dropout = None + self.config = config + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: 
Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + pooled_hidden_states = hidden_states[-1] + if self.dropout is not None: + pooled_hidden_states = self.dropout(pooled_hidden_states) + logits = self.classifier_head(pooled_hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze().float(), labels.squeeze()) + else: + loss = loss_fct(logits.float(), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/optimizer.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..23cca78a3dc5181762a4f56f3a4526b43c3cbaea --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24c11e7f9276f0075ac4e17159b9565cfe78bdc2e5a89771088b58390ea40548 +size 14682210 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/pytorch_model.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..11f3421760601f92c3784e4a9cc53a327e7a6227 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac5a740f837528ecd48badb09ffa12ad042c01bcce3e6473d2332e85e841ecd4 +size 7341306 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/quantization.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..cb95bfe82b203ff6a2aa962326d2c7a438d6a52f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/quantization.py @@ -0,0 
+1,188 @@ +from torch.nn import Linear +from torch.nn.parameter import Parameter + +import bz2 +import torch +import base64 +import ctypes +from transformers.utils import logging + +from typing import List +from functools import partial + +logger = logging.get_logger(__name__) + +try: + from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up + + class Kernel: + def __init__(self, code: bytes, function_names: List[str]): + self.code = code + self._function_names = function_names + self._cmodule = LazyKernelCModule(self.code) + + for name in self._function_names: + setattr(self, name, KernelFunction(self._cmodule, name)) + + quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl
4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzf
ZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ" + + kernels = Kernel( + bz2.decompress(base64.b64decode(quantization_code)), + [ + "int4WeightCompression", + "int4WeightExtractionFloat", + "int4WeightExtractionHalf", + "int8WeightExtractionFloat", + "int8WeightExtractionHalf", + ], + ) +except Exception as exception: + kernels = None + logger.warning("Failed to load cpm_kernels:" + str(exception)) + + +class W8A16Linear(torch.autograd.Function): + @staticmethod + def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): + ctx.inp_shape = inp.size() + ctx.weight_bit_width = weight_bit_width + out_features = quant_w.size(0) + inp = inp.contiguous().view(-1, inp.size(-1)) + weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) + ctx.weight_shape = weight.size() + output = inp.mm(weight.t()) + ctx.save_for_backward(inp, quant_w, scale_w) + return output.view(*(ctx.inp_shape[:-1] + (out_features,))) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + inp, quant_w, scale_w = ctx.saved_tensors + weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) + grad_output = grad_output.contiguous().view(-1, weight.size(0)) + grad_input = grad_output.mm(weight) + grad_weight = grad_output.t().mm(inp) + return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None + + +def compress_int4_weight(weight: torch.Tensor): # (n, m) + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + assert m % 2 == 0 + m = m // 2 + out = torch.empty(n, m, dtype=torch.int8, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + kernels.int4WeightCompression( + gridDim, + blockDim, + 0, + stream, + [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], + ) + return out + + +def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): + assert scale_list.dtype in [torch.half, torch.bfloat16] + assert weight.dtype in [torch.int8] + if source_bit_width == 8: + return weight.to(scale_list.dtype) * scale_list[:, None] + elif source_bit_width == 4: + func = ( + kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16 + ) + else: + assert False, "Unsupported bit-width" + + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = 
(min(round_up(m, 32), 1024), 1, 1)
+
+        func(
+            gridDim,
+            blockDim,
+            0,
+            stream,
+            [
+                ctypes.c_void_p(weight.data_ptr()),
+                ctypes.c_void_p(scale_list.data_ptr()),
+                ctypes.c_void_p(out.data_ptr()),
+                ctypes.c_int32(n),
+                ctypes.c_int32(m),
+            ],
+        )
+    return out
+
+
+class QuantizedLinear(torch.nn.Module):
+    def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
+                 **kwargs):
+        super().__init__()
+        self.weight_bit_width = weight_bit_width
+
+        # `weight` must be a real tensor even for empty_init: its shape sizes the
+        # int8 buffers below, so it is read unconditionally.
+        shape = weight.shape
+
+        if empty_init:
+            self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
+            self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
+        else:
+            # Symmetric absmax quantization per output channel: the largest |w| in
+            # each row maps to the top of the signed integer range.
+            self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
+            self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
+            if weight_bit_width == 4:
+                self.weight = compress_int4_weight(self.weight)
+
+        self.weight = Parameter(self.weight.to(device), requires_grad=False)
+        self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
+        self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None
+
+    def forward(self, input):
+        output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
+        if self.bias is not None:
+            output = output + self.bias
+        return output
+
+
+def quantize(model, weight_bit_width, empty_init=False, device=None):
+    """Replace fp16 linear with quantized linear (requires CUDA for the packing kernels)"""
+    for layer in model.layers:
+        layer.self_attention.query_key_value = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
+            bias=layer.self_attention.query_key_value.bias,
+            dtype=layer.self_attention.query_key_value.weight.dtype,
+            device=layer.self_attention.query_key_value.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.self_attention.dense = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
+            bias=layer.self_attention.dense.bias,
+            dtype=layer.self_attention.dense.weight.dtype,
+            device=layer.self_attention.dense.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.mlp.dense_h_to_4h = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
+            bias=layer.mlp.dense_h_to_4h.bias,
+            dtype=layer.mlp.dense_h_to_4h.weight.dtype,
+            device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.mlp.dense_4h_to_h = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
+            bias=layer.mlp.dense_4h_to_h.bias,
+            dtype=layer.mlp.dense_4h_to_h.weight.dtype,
+            device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+
+    return model
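The int8 path above is plain symmetric absmax quantization per output channel; only the int4 packing and extraction rely on the embedded CUDA kernels. A minimal CPU-only sketch of the same round trip (illustrative only, not part of the checkpoint files):

```python
import torch

# Per-output-channel absmax scale, as in QuantizedLinear.__init__: the largest
# |w| in each row maps to the top of the signed 8-bit range (127).
weight = torch.randn(4, 8)
scale = weight.abs().max(dim=-1).values / ((2 ** (8 - 1)) - 1)
q = torch.round(weight / scale[:, None]).to(torch.int8)

# Dequantize, as extract_weight_to_half does for source_bit_width == 8.
w_hat = q.to(weight.dtype) * scale[:, None]

# Round-off error is bounded by half a quantization step per channel.
print((weight - w_hat).abs().max() <= scale.max() / 2)  # True
```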
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/rng_state.pth b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a4a75bee3df49fe6a8702b423e777413cdbb34ab
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b5c072ed3027d8e49ce6492bc0cac52571156d2630e130d487672c01375ed18
+size 14244
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/scheduler.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..55013a617a7007e3aa12cef92fa465f45a0777d4
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c8d3eda235544386868d7d3c8b8ed7efcab36f20814c55292344c7ad9f51c2c
+size 1064
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/special_tokens_map.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/special_tokens_map.json
@@ -0,0 +1 @@
+{}
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenization_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenization_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenization_chatglm.py
@@ -0,0 +1,300 @@
+import json
+import os
+import re
+from typing import List, Optional, Union, Dict
+from sentencepiece import SentencePieceProcessor
+from transformers import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+
+
+class SPTokenizer:
+    def __init__(self, model_path: str):
+        # reload tokenizer
+        assert os.path.isfile(model_path), model_path
+        self.sp_model = SentencePieceProcessor(model_file=model_path)
+
+        # BOS / EOS token IDs
+        self.n_words: int = self.sp_model.vocab_size()
+        self.bos_id: int = self.sp_model.bos_id()
+        self.eos_id: int = self.sp_model.eos_id()
+        self.pad_id: int = self.sp_model.unk_id()
+        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
+
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        self.special_tokens = {}
+        self.index_special_tokens = {}
+        for token in special_tokens:
+            self.special_tokens[token] = self.n_words
+            self.index_special_tokens[self.n_words] = token
+            self.n_words += 1
+        self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens])
+
+    def tokenize(self, s: str, encode_special_tokens=False):
+        if encode_special_tokens:
+            last_index = 0
+            t = []
+            for match in re.finditer(self.role_special_token_expression, s):
+                if last_index < match.start():
+                    t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
+                t.append(s[match.start():match.end()])
+                last_index = match.end()
+            if last_index < len(s):
+                t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
+            return t
+        else:
+            return self.sp_model.EncodeAsPieces(s)
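Since the role markers must survive as single tokens, `tokenize` splits on them before handing the remaining text to sentencepiece. A self-contained illustration of that splitting strategy (the sample string is made up; no tokenizer.model required):

```python
import re

role_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
role_expr = "|".join(re.escape(t) for t in role_tokens)

text = "<|user|> hello<|assistant|> hi there"
pieces, last = [], 0
for m in re.finditer(role_expr, text):
    if last < m.start():
        pieces.append(("sp", text[last:m.start()]))  # would go to EncodeAsPieces
    pieces.append(("special", m.group()))            # role marker kept atomic
    last = m.end()
if last < len(text):
    pieces.append(("sp", text[last:]))

print(pieces)
# [('special', '<|user|>'), ('sp', ' hello'), ('special', '<|assistant|>'), ('sp', ' hi there')]
```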
+    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
+        assert type(s) is str
+        t = self.sp_model.encode(s)
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+        return t
+
+    def decode(self, t: List[int]) -> str:
+        text, buffer = "", []
+        for token in t:
+            if token in self.index_special_tokens:
+                if buffer:
+                    text += self.sp_model.decode(buffer)
+                    buffer = []
+                text += self.index_special_tokens[token]
+            else:
+                buffer.append(token)
+        if buffer:
+            text += self.sp_model.decode(buffer)
+        return text
+
+    def decode_tokens(self, tokens: List[str]) -> str:
+        text = self.sp_model.DecodePieces(tokens)
+        return text
+
+    def convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        return self.sp_model.PieceToId(token)
+
+    def convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if index in self.index_special_tokens:
+            return self.index_special_tokens[index]
+        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
+            return ""
+        return self.sp_model.IdToPiece(index)
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "tokenizer.model"}
+
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
+                 **kwargs):
+        self.name = "GLMTokenizer"
+
+        self.vocab_file = vocab_file
+        self.tokenizer = SPTokenizer(vocab_file)
+        self.special_tokens = {
+            "<bos>": self.tokenizer.bos_id,
+            "<eos>": self.tokenizer.eos_id,
+            "<pad>": self.tokenizer.pad_id
+        }
+        self.encode_special_tokens = encode_special_tokens
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                         encode_special_tokens=encode_special_tokens,
+                         **kwargs)
+
+    def get_command(self, token):
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
+        return self.tokenizer.special_tokens[token]
+
+    @property
+    def unk_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token_id(self):
+        return self.get_command("<pad>")
+
+    @property
+    def eos_token(self) -> str:
+        return "</s>"
+
+    @property
+    def eos_token_id(self):
+        return self.get_command("<eos>")
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_words
+
+    def get_vocab(self):
+        """ Returns vocab as a dict """
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text, **kwargs):
+        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        return self.tokenizer.convert_token_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return self.tokenizer.decode_tokens(tokens)
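A quick sanity check of the special-token plumbing above — a minimal sketch, assuming a local checkpoint directory that ships this tokenization code together with tokenizer.model (the path below is a placeholder):

```python
from transformers import AutoTokenizer

# Placeholder path: any directory holding this code plus tokenizer.model works.
tokenizer = AutoTokenizer.from_pretrained("./chatglm3-6b", trust_remote_code=True)

# "<pad>"/"<eos>" resolve through the special_tokens dict above; role markers
# such as "<|assistant|>" fall through get_command() to SPTokenizer's extra ids.
print(tokenizer.pad_token, tokenizer.pad_token_id)   # "<unk>", id of "<pad>"
print(tokenizer.eos_token, tokenizer.eos_token_id)   # "</s>", id of "<eos>"
print(tokenizer.get_command("<|assistant|>"))        # id past the sentencepiece vocab
```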
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+    def _pad(
+            self,
+            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+            max_length: Optional[int] = None,
+            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+            pad_to_multiple_of: Optional[int] = None,
+            return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                - 'left': pads on the left of the sequences
+                - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # This tokenizer only supports left padding.
+        assert self.padding_side == "left"
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+        seq_length = len(required_input)
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * seq_length
+
+        if "position_ids" not in encoded_inputs:
+            encoded_inputs["position_ids"] = list(range(seq_length))
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+            if "position_ids" in encoded_inputs:
+                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
+            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+
+        return encoded_inputs
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer.model b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
+size 1018370
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5faafd38f2e2934804feb0e7d71ebf08b0839bf5
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer_config.json
@@ -0,0 +1,18 @@
+{
+  "added_tokens_decoder": {},
+  "additional_special_tokens": [],
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_chatglm.ChatGLMTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": false,
+  "do_lower_case": false,
+  "encode_special_tokens": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "padding_side": "left",
+  "remove_space": false,
+  "tokenizer_class": "ChatGLMTokenizer",
+  "tokenizer_file": null
+}
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/trainer_state.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4ac36a582c86f3f51f90527761f8bdd771ed73ef
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/trainer_state.json
@@ -0,0 +1,1819 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 28.235294117647058,
+  "eval_steps": 500,
+  "global_step": 300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.009985714285714285,
+      "loss": 2.6971,
+      "step": 1
+    },
+    {
"epoch": 0.19, + "learning_rate": 0.009971428571428572, + "loss": 2.3927, + "step": 2 + }, + { + "epoch": 0.28, + "learning_rate": 0.009957142857142857, + "loss": 2.2539, + "step": 3 + }, + { + "epoch": 0.38, + "learning_rate": 0.009942857142857144, + "loss": 2.1408, + "step": 4 + }, + { + "epoch": 0.47, + "learning_rate": 0.009928571428571429, + "loss": 2.2672, + "step": 5 + }, + { + "epoch": 0.56, + "learning_rate": 0.009914285714285714, + "loss": 1.6433, + "step": 6 + }, + { + "epoch": 0.66, + "learning_rate": 0.0099, + "loss": 2.1405, + "step": 7 + }, + { + "epoch": 0.75, + "learning_rate": 0.009885714285714286, + "loss": 2.1464, + "step": 8 + }, + { + "epoch": 0.85, + "learning_rate": 0.009871428571428571, + "loss": 1.8498, + "step": 9 + }, + { + "epoch": 0.94, + "learning_rate": 0.009857142857142858, + "loss": 1.6896, + "step": 10 + }, + { + "epoch": 1.04, + "learning_rate": 0.009842857142857143, + "loss": 2.1932, + "step": 11 + }, + { + "epoch": 1.13, + "learning_rate": 0.00982857142857143, + "loss": 1.8236, + "step": 12 + }, + { + "epoch": 1.22, + "learning_rate": 0.009814285714285715, + "loss": 1.735, + "step": 13 + }, + { + "epoch": 1.32, + "learning_rate": 0.0098, + "loss": 1.7488, + "step": 14 + }, + { + "epoch": 1.41, + "learning_rate": 0.009785714285714285, + "loss": 1.8336, + "step": 15 + }, + { + "epoch": 1.51, + "learning_rate": 0.009771428571428572, + "loss": 1.9438, + "step": 16 + }, + { + "epoch": 1.6, + "learning_rate": 0.009757142857142858, + "loss": 1.7178, + "step": 17 + }, + { + "epoch": 1.69, + "learning_rate": 0.009742857142857143, + "loss": 1.5714, + "step": 18 + }, + { + "epoch": 1.79, + "learning_rate": 0.009728571428571428, + "loss": 1.537, + "step": 19 + }, + { + "epoch": 1.88, + "learning_rate": 0.009714285714285715, + "loss": 1.6764, + "step": 20 + }, + { + "epoch": 1.98, + "learning_rate": 0.0097, + "loss": 1.8919, + "step": 21 + }, + { + "epoch": 2.07, + "learning_rate": 0.009685714285714285, + "loss": 1.346, + "step": 22 + }, + { + "epoch": 2.16, + "learning_rate": 0.009671428571428572, + "loss": 1.5036, + "step": 23 + }, + { + "epoch": 2.26, + "learning_rate": 0.009657142857142857, + "loss": 1.6788, + "step": 24 + }, + { + "epoch": 2.35, + "learning_rate": 0.009642857142857144, + "loss": 1.6667, + "step": 25 + }, + { + "epoch": 2.45, + "learning_rate": 0.009628571428571429, + "loss": 1.7153, + "step": 26 + }, + { + "epoch": 2.54, + "learning_rate": 0.009614285714285714, + "loss": 1.601, + "step": 27 + }, + { + "epoch": 2.64, + "learning_rate": 0.0096, + "loss": 1.3002, + "step": 28 + }, + { + "epoch": 2.73, + "learning_rate": 0.009585714285714286, + "loss": 1.3294, + "step": 29 + }, + { + "epoch": 2.82, + "learning_rate": 0.009571428571428573, + "loss": 1.7477, + "step": 30 + }, + { + "epoch": 2.92, + "learning_rate": 0.009557142857142858, + "loss": 1.7961, + "step": 31 + }, + { + "epoch": 3.01, + "learning_rate": 0.009542857142857143, + "loss": 1.4954, + "step": 32 + }, + { + "epoch": 3.11, + "learning_rate": 0.009528571428571428, + "loss": 1.6452, + "step": 33 + }, + { + "epoch": 3.2, + "learning_rate": 0.009514285714285715, + "loss": 1.3528, + "step": 34 + }, + { + "epoch": 3.29, + "learning_rate": 0.0095, + "loss": 1.4811, + "step": 35 + }, + { + "epoch": 3.39, + "learning_rate": 0.009485714285714287, + "loss": 1.4738, + "step": 36 + }, + { + "epoch": 3.48, + "learning_rate": 0.009471428571428572, + "loss": 1.174, + "step": 37 + }, + { + "epoch": 3.58, + "learning_rate": 0.009457142857142857, + "loss": 1.2346, + "step": 38 + }, + { + "epoch": 3.67, 
+ "learning_rate": 0.009442857142857143, + "loss": 1.5327, + "step": 39 + }, + { + "epoch": 3.76, + "learning_rate": 0.009428571428571429, + "loss": 1.5249, + "step": 40 + }, + { + "epoch": 3.86, + "learning_rate": 0.009414285714285714, + "loss": 1.5086, + "step": 41 + }, + { + "epoch": 3.95, + "learning_rate": 0.0094, + "loss": 1.8425, + "step": 42 + }, + { + "epoch": 4.05, + "learning_rate": 0.009385714285714287, + "loss": 1.1943, + "step": 43 + }, + { + "epoch": 4.14, + "learning_rate": 0.009371428571428572, + "loss": 1.6835, + "step": 44 + }, + { + "epoch": 4.24, + "learning_rate": 0.009357142857142857, + "loss": 1.75, + "step": 45 + }, + { + "epoch": 4.33, + "learning_rate": 0.009342857142857142, + "loss": 1.2561, + "step": 46 + }, + { + "epoch": 4.42, + "learning_rate": 0.009328571428571429, + "loss": 1.3784, + "step": 47 + }, + { + "epoch": 4.52, + "learning_rate": 0.009314285714285714, + "loss": 1.2538, + "step": 48 + }, + { + "epoch": 4.61, + "learning_rate": 0.009300000000000001, + "loss": 1.4429, + "step": 49 + }, + { + "epoch": 4.71, + "learning_rate": 0.009285714285714286, + "loss": 1.3687, + "step": 50 + }, + { + "epoch": 4.8, + "learning_rate": 0.009271428571428571, + "loss": 1.1511, + "step": 51 + }, + { + "epoch": 4.89, + "learning_rate": 0.009257142857142858, + "loss": 1.181, + "step": 52 + }, + { + "epoch": 4.99, + "learning_rate": 0.009242857142857143, + "loss": 1.1753, + "step": 53 + }, + { + "epoch": 5.08, + "learning_rate": 0.009228571428571428, + "loss": 1.1562, + "step": 54 + }, + { + "epoch": 5.18, + "learning_rate": 0.009214285714285715, + "loss": 1.2936, + "step": 55 + }, + { + "epoch": 5.27, + "learning_rate": 0.0092, + "loss": 1.3591, + "step": 56 + }, + { + "epoch": 5.36, + "learning_rate": 0.009185714285714287, + "loss": 1.1376, + "step": 57 + }, + { + "epoch": 5.46, + "learning_rate": 0.009171428571428572, + "loss": 1.372, + "step": 58 + }, + { + "epoch": 5.55, + "learning_rate": 0.009157142857142857, + "loss": 1.5141, + "step": 59 + }, + { + "epoch": 5.65, + "learning_rate": 0.009142857142857144, + "loss": 1.2087, + "step": 60 + }, + { + "epoch": 5.74, + "learning_rate": 0.009128571428571429, + "loss": 1.136, + "step": 61 + }, + { + "epoch": 5.84, + "learning_rate": 0.009114285714285715, + "loss": 1.2948, + "step": 62 + }, + { + "epoch": 5.93, + "learning_rate": 0.0091, + "loss": 1.0592, + "step": 63 + }, + { + "epoch": 6.02, + "learning_rate": 0.009085714285714286, + "loss": 1.2321, + "step": 64 + }, + { + "epoch": 6.12, + "learning_rate": 0.009071428571428572, + "loss": 1.0827, + "step": 65 + }, + { + "epoch": 6.21, + "learning_rate": 0.009057142857142857, + "loss": 1.1136, + "step": 66 + }, + { + "epoch": 6.31, + "learning_rate": 0.009042857142857142, + "loss": 1.475, + "step": 67 + }, + { + "epoch": 6.4, + "learning_rate": 0.009028571428571427, + "loss": 1.1316, + "step": 68 + }, + { + "epoch": 6.49, + "learning_rate": 0.009014285714285714, + "loss": 1.1688, + "step": 69 + }, + { + "epoch": 6.59, + "learning_rate": 0.009000000000000001, + "loss": 1.0882, + "step": 70 + }, + { + "epoch": 6.68, + "learning_rate": 0.008985714285714286, + "loss": 1.1085, + "step": 71 + }, + { + "epoch": 6.78, + "learning_rate": 0.008971428571428571, + "loss": 1.2029, + "step": 72 + }, + { + "epoch": 6.87, + "learning_rate": 0.008957142857142856, + "loss": 1.098, + "step": 73 + }, + { + "epoch": 6.96, + "learning_rate": 0.008942857142857143, + "loss": 1.219, + "step": 74 + }, + { + "epoch": 7.06, + "learning_rate": 0.00892857142857143, + "loss": 1.0092, + "step": 75 + }, + 
{ + "epoch": 7.15, + "learning_rate": 0.008914285714285715, + "loss": 1.0112, + "step": 76 + }, + { + "epoch": 7.25, + "learning_rate": 0.0089, + "loss": 1.1481, + "step": 77 + }, + { + "epoch": 7.34, + "learning_rate": 0.008885714285714287, + "loss": 0.9873, + "step": 78 + }, + { + "epoch": 7.44, + "learning_rate": 0.008871428571428572, + "loss": 1.0586, + "step": 79 + }, + { + "epoch": 7.53, + "learning_rate": 0.008857142857142857, + "loss": 1.1177, + "step": 80 + }, + { + "epoch": 7.62, + "learning_rate": 0.008842857142857142, + "loss": 0.7814, + "step": 81 + }, + { + "epoch": 7.72, + "learning_rate": 0.008828571428571429, + "loss": 1.2043, + "step": 82 + }, + { + "epoch": 7.81, + "learning_rate": 0.008814285714285715, + "loss": 1.0062, + "step": 83 + }, + { + "epoch": 7.91, + "learning_rate": 0.0088, + "loss": 1.0831, + "step": 84 + }, + { + "epoch": 8.0, + "learning_rate": 0.008785714285714286, + "loss": 0.9554, + "step": 85 + }, + { + "epoch": 8.09, + "learning_rate": 0.00877142857142857, + "loss": 1.1674, + "step": 86 + }, + { + "epoch": 8.19, + "learning_rate": 0.008757142857142857, + "loss": 0.8226, + "step": 87 + }, + { + "epoch": 8.28, + "learning_rate": 0.008742857142857144, + "loss": 0.9166, + "step": 88 + }, + { + "epoch": 8.38, + "learning_rate": 0.00872857142857143, + "loss": 0.734, + "step": 89 + }, + { + "epoch": 8.47, + "learning_rate": 0.008714285714285714, + "loss": 0.8641, + "step": 90 + }, + { + "epoch": 8.56, + "learning_rate": 0.0087, + "loss": 0.9517, + "step": 91 + }, + { + "epoch": 8.66, + "learning_rate": 0.008685714285714286, + "loss": 0.9995, + "step": 92 + }, + { + "epoch": 8.75, + "learning_rate": 0.008671428571428571, + "loss": 0.763, + "step": 93 + }, + { + "epoch": 8.85, + "learning_rate": 0.008657142857142858, + "loss": 1.0712, + "step": 94 + }, + { + "epoch": 8.94, + "learning_rate": 0.008642857142857143, + "loss": 1.1111, + "step": 95 + }, + { + "epoch": 9.04, + "learning_rate": 0.008628571428571428, + "loss": 0.9626, + "step": 96 + }, + { + "epoch": 9.13, + "learning_rate": 0.008614285714285715, + "loss": 0.6385, + "step": 97 + }, + { + "epoch": 9.22, + "learning_rate": 0.0086, + "loss": 0.8147, + "step": 98 + }, + { + "epoch": 9.32, + "learning_rate": 0.008585714285714285, + "loss": 0.8109, + "step": 99 + }, + { + "epoch": 9.41, + "learning_rate": 0.008571428571428572, + "loss": 1.0953, + "step": 100 + }, + { + "epoch": 9.51, + "learning_rate": 0.008557142857142859, + "loss": 0.7104, + "step": 101 + }, + { + "epoch": 9.6, + "learning_rate": 0.008542857142857144, + "loss": 0.9672, + "step": 102 + }, + { + "epoch": 9.69, + "learning_rate": 0.008528571428571429, + "loss": 0.7593, + "step": 103 + }, + { + "epoch": 9.79, + "learning_rate": 0.008514285714285714, + "loss": 1.0186, + "step": 104 + }, + { + "epoch": 9.88, + "learning_rate": 0.0085, + "loss": 0.7898, + "step": 105 + }, + { + "epoch": 9.98, + "learning_rate": 0.008485714285714286, + "loss": 0.7392, + "step": 106 + }, + { + "epoch": 10.07, + "learning_rate": 0.008471428571428572, + "loss": 0.7295, + "step": 107 + }, + { + "epoch": 10.16, + "learning_rate": 0.008457142857142858, + "loss": 0.7211, + "step": 108 + }, + { + "epoch": 10.26, + "learning_rate": 0.008442857142857143, + "loss": 0.769, + "step": 109 + }, + { + "epoch": 10.35, + "learning_rate": 0.00842857142857143, + "loss": 0.718, + "step": 110 + }, + { + "epoch": 10.45, + "learning_rate": 0.008414285714285714, + "loss": 0.6411, + "step": 111 + }, + { + "epoch": 10.54, + "learning_rate": 0.0084, + "loss": 0.8016, + "step": 112 + }, + { 
+ "epoch": 10.64, + "learning_rate": 0.008385714285714286, + "loss": 0.6633, + "step": 113 + }, + { + "epoch": 10.73, + "learning_rate": 0.008371428571428571, + "loss": 0.7257, + "step": 114 + }, + { + "epoch": 10.82, + "learning_rate": 0.008357142857142858, + "loss": 0.7785, + "step": 115 + }, + { + "epoch": 10.92, + "learning_rate": 0.008342857142857143, + "loss": 0.8927, + "step": 116 + }, + { + "epoch": 11.01, + "learning_rate": 0.008328571428571428, + "loss": 0.7242, + "step": 117 + }, + { + "epoch": 11.11, + "learning_rate": 0.008314285714285715, + "loss": 0.8297, + "step": 118 + }, + { + "epoch": 11.2, + "learning_rate": 0.0083, + "loss": 0.6761, + "step": 119 + }, + { + "epoch": 11.29, + "learning_rate": 0.008285714285714287, + "loss": 0.6699, + "step": 120 + }, + { + "epoch": 11.39, + "learning_rate": 0.008271428571428572, + "loss": 0.5365, + "step": 121 + }, + { + "epoch": 11.48, + "learning_rate": 0.008257142857142857, + "loss": 0.9045, + "step": 122 + }, + { + "epoch": 11.58, + "learning_rate": 0.008242857142857144, + "loss": 0.5071, + "step": 123 + }, + { + "epoch": 11.67, + "learning_rate": 0.008228571428571429, + "loss": 0.6472, + "step": 124 + }, + { + "epoch": 11.76, + "learning_rate": 0.008214285714285714, + "loss": 0.6232, + "step": 125 + }, + { + "epoch": 11.86, + "learning_rate": 0.008199999999999999, + "loss": 0.4905, + "step": 126 + }, + { + "epoch": 11.95, + "learning_rate": 0.008185714285714286, + "loss": 0.557, + "step": 127 + }, + { + "epoch": 12.05, + "learning_rate": 0.008171428571428573, + "loss": 0.5517, + "step": 128 + }, + { + "epoch": 12.14, + "learning_rate": 0.008157142857142858, + "loss": 0.6321, + "step": 129 + }, + { + "epoch": 12.24, + "learning_rate": 0.008142857142857143, + "loss": 0.6619, + "step": 130 + }, + { + "epoch": 12.33, + "learning_rate": 0.008128571428571428, + "loss": 0.5524, + "step": 131 + }, + { + "epoch": 12.42, + "learning_rate": 0.008114285714285715, + "loss": 0.4688, + "step": 132 + }, + { + "epoch": 12.52, + "learning_rate": 0.008100000000000001, + "loss": 0.3717, + "step": 133 + }, + { + "epoch": 12.61, + "learning_rate": 0.008085714285714286, + "loss": 0.5118, + "step": 134 + }, + { + "epoch": 12.71, + "learning_rate": 0.008071428571428571, + "loss": 0.4521, + "step": 135 + }, + { + "epoch": 12.8, + "learning_rate": 0.008057142857142856, + "loss": 0.5865, + "step": 136 + }, + { + "epoch": 12.89, + "learning_rate": 0.008042857142857143, + "loss": 0.5977, + "step": 137 + }, + { + "epoch": 12.99, + "learning_rate": 0.008028571428571428, + "loss": 0.6977, + "step": 138 + }, + { + "epoch": 13.08, + "learning_rate": 0.008014285714285713, + "loss": 0.5625, + "step": 139 + }, + { + "epoch": 13.18, + "learning_rate": 0.008, + "loss": 0.3611, + "step": 140 + }, + { + "epoch": 13.27, + "learning_rate": 0.007985714285714287, + "loss": 0.5168, + "step": 141 + }, + { + "epoch": 13.36, + "learning_rate": 0.007971428571428572, + "loss": 0.4429, + "step": 142 + }, + { + "epoch": 13.46, + "learning_rate": 0.007957142857142857, + "loss": 0.4998, + "step": 143 + }, + { + "epoch": 13.55, + "learning_rate": 0.007942857142857142, + "loss": 0.4437, + "step": 144 + }, + { + "epoch": 13.65, + "learning_rate": 0.007928571428571429, + "loss": 0.4958, + "step": 145 + }, + { + "epoch": 13.74, + "learning_rate": 0.007914285714285716, + "loss": 0.4021, + "step": 146 + }, + { + "epoch": 13.84, + "learning_rate": 0.0079, + "loss": 0.6163, + "step": 147 + }, + { + "epoch": 13.93, + "learning_rate": 0.007885714285714286, + "loss": 0.406, + "step": 148 + }, + { + 
"epoch": 14.02, + "learning_rate": 0.007871428571428571, + "loss": 0.4905, + "step": 149 + }, + { + "epoch": 14.12, + "learning_rate": 0.007857142857142858, + "loss": 0.3824, + "step": 150 + }, + { + "epoch": 14.21, + "learning_rate": 0.007842857142857143, + "loss": 0.3591, + "step": 151 + }, + { + "epoch": 14.31, + "learning_rate": 0.007828571428571428, + "loss": 0.342, + "step": 152 + }, + { + "epoch": 14.4, + "learning_rate": 0.007814285714285715, + "loss": 0.4565, + "step": 153 + }, + { + "epoch": 14.49, + "learning_rate": 0.0078000000000000005, + "loss": 0.3287, + "step": 154 + }, + { + "epoch": 14.59, + "learning_rate": 0.007785714285714286, + "loss": 0.4179, + "step": 155 + }, + { + "epoch": 14.68, + "learning_rate": 0.0077714285714285715, + "loss": 0.3586, + "step": 156 + }, + { + "epoch": 14.78, + "learning_rate": 0.007757142857142857, + "loss": 0.4618, + "step": 157 + }, + { + "epoch": 14.87, + "learning_rate": 0.0077428571428571425, + "loss": 0.4133, + "step": 158 + }, + { + "epoch": 14.96, + "learning_rate": 0.007728571428571429, + "loss": 0.4326, + "step": 159 + }, + { + "epoch": 15.06, + "learning_rate": 0.007714285714285715, + "loss": 0.3838, + "step": 160 + }, + { + "epoch": 15.15, + "learning_rate": 0.0077, + "loss": 0.2978, + "step": 161 + }, + { + "epoch": 15.25, + "learning_rate": 0.007685714285714286, + "loss": 0.3993, + "step": 162 + }, + { + "epoch": 15.34, + "learning_rate": 0.007671428571428571, + "loss": 0.3249, + "step": 163 + }, + { + "epoch": 15.44, + "learning_rate": 0.007657142857142857, + "loss": 0.2796, + "step": 164 + }, + { + "epoch": 15.53, + "learning_rate": 0.007642857142857142, + "loss": 0.3918, + "step": 165 + }, + { + "epoch": 15.62, + "learning_rate": 0.007628571428571429, + "loss": 0.4122, + "step": 166 + }, + { + "epoch": 15.72, + "learning_rate": 0.007614285714285715, + "loss": 0.3403, + "step": 167 + }, + { + "epoch": 15.81, + "learning_rate": 0.0076, + "loss": 0.3759, + "step": 168 + }, + { + "epoch": 15.91, + "learning_rate": 0.007585714285714286, + "loss": 0.3621, + "step": 169 + }, + { + "epoch": 16.0, + "learning_rate": 0.007571428571428571, + "loss": 0.2991, + "step": 170 + }, + { + "epoch": 16.09, + "learning_rate": 0.007557142857142857, + "loss": 0.3039, + "step": 171 + }, + { + "epoch": 16.19, + "learning_rate": 0.007542857142857144, + "loss": 0.4571, + "step": 172 + }, + { + "epoch": 16.28, + "learning_rate": 0.007528571428571429, + "loss": 0.2759, + "step": 173 + }, + { + "epoch": 16.38, + "learning_rate": 0.007514285714285715, + "loss": 0.2835, + "step": 174 + }, + { + "epoch": 16.47, + "learning_rate": 0.0075, + "loss": 0.3221, + "step": 175 + }, + { + "epoch": 16.56, + "learning_rate": 0.007485714285714286, + "loss": 0.3072, + "step": 176 + }, + { + "epoch": 16.66, + "learning_rate": 0.007471428571428572, + "loss": 0.2852, + "step": 177 + }, + { + "epoch": 16.75, + "learning_rate": 0.007457142857142857, + "loss": 0.2559, + "step": 178 + }, + { + "epoch": 16.85, + "learning_rate": 0.007442857142857143, + "loss": 0.2787, + "step": 179 + }, + { + "epoch": 16.94, + "learning_rate": 0.007428571428571429, + "loss": 0.3331, + "step": 180 + }, + { + "epoch": 17.04, + "learning_rate": 0.007414285714285714, + "loss": 0.1929, + "step": 181 + }, + { + "epoch": 17.13, + "learning_rate": 0.0074, + "loss": 0.2065, + "step": 182 + }, + { + "epoch": 17.22, + "learning_rate": 0.007385714285714285, + "loss": 0.2868, + "step": 183 + }, + { + "epoch": 17.32, + "learning_rate": 0.007371428571428571, + "loss": 0.2206, + "step": 184 + }, + { + "epoch": 
17.41, + "learning_rate": 0.007357142857142858, + "loss": 0.2355, + "step": 185 + }, + { + "epoch": 17.51, + "learning_rate": 0.007342857142857143, + "loss": 0.3041, + "step": 186 + }, + { + "epoch": 17.6, + "learning_rate": 0.007328571428571429, + "loss": 0.3028, + "step": 187 + }, + { + "epoch": 17.69, + "learning_rate": 0.007314285714285714, + "loss": 0.2435, + "step": 188 + }, + { + "epoch": 17.79, + "learning_rate": 0.0073, + "loss": 0.1869, + "step": 189 + }, + { + "epoch": 17.88, + "learning_rate": 0.007285714285714285, + "loss": 0.3036, + "step": 190 + }, + { + "epoch": 17.98, + "learning_rate": 0.007271428571428571, + "loss": 0.246, + "step": 191 + }, + { + "epoch": 18.07, + "learning_rate": 0.007257142857142858, + "loss": 0.2316, + "step": 192 + }, + { + "epoch": 18.16, + "learning_rate": 0.007242857142857143, + "loss": 0.186, + "step": 193 + }, + { + "epoch": 18.26, + "learning_rate": 0.007228571428571429, + "loss": 0.2616, + "step": 194 + }, + { + "epoch": 18.35, + "learning_rate": 0.007214285714285715, + "loss": 0.2824, + "step": 195 + }, + { + "epoch": 18.45, + "learning_rate": 0.0072, + "loss": 0.2, + "step": 196 + }, + { + "epoch": 18.54, + "learning_rate": 0.007185714285714286, + "loss": 0.1978, + "step": 197 + }, + { + "epoch": 18.64, + "learning_rate": 0.007171428571428572, + "loss": 0.1897, + "step": 198 + }, + { + "epoch": 18.73, + "learning_rate": 0.007157142857142858, + "loss": 0.1958, + "step": 199 + }, + { + "epoch": 18.82, + "learning_rate": 0.0071428571428571435, + "loss": 0.203, + "step": 200 + }, + { + "epoch": 18.92, + "learning_rate": 0.0071285714285714286, + "loss": 0.2451, + "step": 201 + }, + { + "epoch": 19.01, + "learning_rate": 0.0071142857142857145, + "loss": 0.2045, + "step": 202 + }, + { + "epoch": 19.11, + "learning_rate": 0.0070999999999999995, + "loss": 0.1937, + "step": 203 + }, + { + "epoch": 19.2, + "learning_rate": 0.0070857142857142855, + "loss": 0.1814, + "step": 204 + }, + { + "epoch": 19.29, + "learning_rate": 0.007071428571428572, + "loss": 0.1869, + "step": 205 + }, + { + "epoch": 19.39, + "learning_rate": 0.007057142857142857, + "loss": 0.2089, + "step": 206 + }, + { + "epoch": 19.48, + "learning_rate": 0.007042857142857143, + "loss": 0.1924, + "step": 207 + }, + { + "epoch": 19.58, + "learning_rate": 0.007028571428571428, + "loss": 0.1512, + "step": 208 + }, + { + "epoch": 19.67, + "learning_rate": 0.007014285714285714, + "loss": 0.1375, + "step": 209 + }, + { + "epoch": 19.76, + "learning_rate": 0.006999999999999999, + "loss": 0.187, + "step": 210 + }, + { + "epoch": 19.86, + "learning_rate": 0.006985714285714286, + "loss": 0.2488, + "step": 211 + }, + { + "epoch": 19.95, + "learning_rate": 0.006971428571428572, + "loss": 0.1864, + "step": 212 + }, + { + "epoch": 20.05, + "learning_rate": 0.006957142857142857, + "loss": 0.1984, + "step": 213 + }, + { + "epoch": 20.14, + "learning_rate": 0.006942857142857143, + "loss": 0.156, + "step": 214 + }, + { + "epoch": 20.24, + "learning_rate": 0.006928571428571429, + "loss": 0.2082, + "step": 215 + }, + { + "epoch": 20.33, + "learning_rate": 0.006914285714285714, + "loss": 0.094, + "step": 216 + }, + { + "epoch": 20.42, + "learning_rate": 0.0069, + "loss": 0.1784, + "step": 217 + }, + { + "epoch": 20.52, + "learning_rate": 0.006885714285714287, + "loss": 0.1293, + "step": 218 + }, + { + "epoch": 20.61, + "learning_rate": 0.006871428571428572, + "loss": 0.1635, + "step": 219 + }, + { + "epoch": 20.71, + "learning_rate": 0.006857142857142858, + "loss": 0.1668, + "step": 220 + }, + { + "epoch": 
20.8, + "learning_rate": 0.006842857142857143, + "loss": 0.1946, + "step": 221 + }, + { + "epoch": 20.89, + "learning_rate": 0.006828571428571429, + "loss": 0.2347, + "step": 222 + }, + { + "epoch": 20.99, + "learning_rate": 0.006814285714285714, + "loss": 0.1523, + "step": 223 + }, + { + "epoch": 21.08, + "learning_rate": 0.0068000000000000005, + "loss": 0.1337, + "step": 224 + }, + { + "epoch": 21.18, + "learning_rate": 0.006785714285714286, + "loss": 0.1511, + "step": 225 + }, + { + "epoch": 21.27, + "learning_rate": 0.0067714285714285715, + "loss": 0.1058, + "step": 226 + }, + { + "epoch": 21.36, + "learning_rate": 0.006757142857142857, + "loss": 0.172, + "step": 227 + }, + { + "epoch": 21.46, + "learning_rate": 0.0067428571428571425, + "loss": 0.1077, + "step": 228 + }, + { + "epoch": 21.55, + "learning_rate": 0.006728571428571428, + "loss": 0.1993, + "step": 229 + }, + { + "epoch": 21.65, + "learning_rate": 0.006714285714285714, + "loss": 0.1414, + "step": 230 + }, + { + "epoch": 21.74, + "learning_rate": 0.0067, + "loss": 0.126, + "step": 231 + }, + { + "epoch": 21.84, + "learning_rate": 0.006685714285714286, + "loss": 0.1528, + "step": 232 + }, + { + "epoch": 21.93, + "learning_rate": 0.006671428571428571, + "loss": 0.1316, + "step": 233 + }, + { + "epoch": 22.02, + "learning_rate": 0.006657142857142857, + "loss": 0.1565, + "step": 234 + }, + { + "epoch": 22.12, + "learning_rate": 0.006642857142857143, + "loss": 0.1088, + "step": 235 + }, + { + "epoch": 22.21, + "learning_rate": 0.006628571428571428, + "loss": 0.088, + "step": 236 + }, + { + "epoch": 22.31, + "learning_rate": 0.006614285714285715, + "loss": 0.1348, + "step": 237 + }, + { + "epoch": 22.4, + "learning_rate": 0.006600000000000001, + "loss": 0.1702, + "step": 238 + }, + { + "epoch": 22.49, + "learning_rate": 0.006585714285714286, + "loss": 0.132, + "step": 239 + }, + { + "epoch": 22.59, + "learning_rate": 0.006571428571428572, + "loss": 0.1115, + "step": 240 + }, + { + "epoch": 22.68, + "learning_rate": 0.006557142857142857, + "loss": 0.1173, + "step": 241 + }, + { + "epoch": 22.78, + "learning_rate": 0.006542857142857143, + "loss": 0.0967, + "step": 242 + }, + { + "epoch": 22.87, + "learning_rate": 0.006528571428571428, + "loss": 0.1484, + "step": 243 + }, + { + "epoch": 22.96, + "learning_rate": 0.006514285714285715, + "loss": 0.1566, + "step": 244 + }, + { + "epoch": 23.06, + "learning_rate": 0.006500000000000001, + "loss": 0.162, + "step": 245 + }, + { + "epoch": 23.15, + "learning_rate": 0.006485714285714286, + "loss": 0.1099, + "step": 246 + }, + { + "epoch": 23.25, + "learning_rate": 0.0064714285714285716, + "loss": 0.1087, + "step": 247 + }, + { + "epoch": 23.34, + "learning_rate": 0.006457142857142857, + "loss": 0.116, + "step": 248 + }, + { + "epoch": 23.44, + "learning_rate": 0.0064428571428571425, + "loss": 0.1096, + "step": 249 + }, + { + "epoch": 23.53, + "learning_rate": 0.006428571428571429, + "loss": 0.0972, + "step": 250 + }, + { + "epoch": 23.62, + "learning_rate": 0.006414285714285714, + "loss": 0.0889, + "step": 251 + }, + { + "epoch": 23.72, + "learning_rate": 0.0064, + "loss": 0.1199, + "step": 252 + }, + { + "epoch": 23.81, + "learning_rate": 0.006385714285714286, + "loss": 0.1337, + "step": 253 + }, + { + "epoch": 23.91, + "learning_rate": 0.006371428571428571, + "loss": 0.0977, + "step": 254 + }, + { + "epoch": 24.0, + "learning_rate": 0.006357142857142857, + "loss": 0.146, + "step": 255 + }, + { + "epoch": 24.09, + "learning_rate": 0.006342857142857142, + "loss": 0.1102, + "step": 256 + }, + 
{ + "epoch": 24.19, + "learning_rate": 0.006328571428571429, + "loss": 0.1025, + "step": 257 + }, + { + "epoch": 24.28, + "learning_rate": 0.006314285714285715, + "loss": 0.09, + "step": 258 + }, + { + "epoch": 24.38, + "learning_rate": 0.0063, + "loss": 0.1302, + "step": 259 + }, + { + "epoch": 24.47, + "learning_rate": 0.006285714285714286, + "loss": 0.0739, + "step": 260 + }, + { + "epoch": 24.56, + "learning_rate": 0.006271428571428571, + "loss": 0.1172, + "step": 261 + }, + { + "epoch": 24.66, + "learning_rate": 0.006257142857142857, + "loss": 0.1048, + "step": 262 + }, + { + "epoch": 24.75, + "learning_rate": 0.006242857142857144, + "loss": 0.0977, + "step": 263 + }, + { + "epoch": 24.85, + "learning_rate": 0.006228571428571429, + "loss": 0.1056, + "step": 264 + }, + { + "epoch": 24.94, + "learning_rate": 0.006214285714285715, + "loss": 0.1252, + "step": 265 + }, + { + "epoch": 25.04, + "learning_rate": 0.0062, + "loss": 0.1107, + "step": 266 + }, + { + "epoch": 25.13, + "learning_rate": 0.006185714285714286, + "loss": 0.0887, + "step": 267 + }, + { + "epoch": 25.22, + "learning_rate": 0.006171428571428571, + "loss": 0.0836, + "step": 268 + }, + { + "epoch": 25.32, + "learning_rate": 0.0061571428571428576, + "loss": 0.0957, + "step": 269 + }, + { + "epoch": 25.41, + "learning_rate": 0.0061428571428571435, + "loss": 0.1165, + "step": 270 + }, + { + "epoch": 25.51, + "learning_rate": 0.0061285714285714285, + "loss": 0.1135, + "step": 271 + }, + { + "epoch": 25.6, + "learning_rate": 0.0061142857142857145, + "loss": 0.0901, + "step": 272 + }, + { + "epoch": 25.69, + "learning_rate": 0.0061, + "loss": 0.0751, + "step": 273 + }, + { + "epoch": 25.79, + "learning_rate": 0.0060857142857142854, + "loss": 0.109, + "step": 274 + }, + { + "epoch": 25.88, + "learning_rate": 0.006071428571428571, + "loss": 0.102, + "step": 275 + }, + { + "epoch": 25.98, + "learning_rate": 0.006057142857142858, + "loss": 0.0916, + "step": 276 + }, + { + "epoch": 26.07, + "learning_rate": 0.006042857142857143, + "loss": 0.0821, + "step": 277 + }, + { + "epoch": 26.16, + "learning_rate": 0.006028571428571429, + "loss": 0.0797, + "step": 278 + }, + { + "epoch": 26.26, + "learning_rate": 0.006014285714285714, + "loss": 0.0804, + "step": 279 + }, + { + "epoch": 26.35, + "learning_rate": 0.006, + "loss": 0.0987, + "step": 280 + }, + { + "epoch": 26.45, + "learning_rate": 0.005985714285714285, + "loss": 0.1192, + "step": 281 + }, + { + "epoch": 26.54, + "learning_rate": 0.005971428571428572, + "loss": 0.0699, + "step": 282 + }, + { + "epoch": 26.64, + "learning_rate": 0.005957142857142858, + "loss": 0.0902, + "step": 283 + }, + { + "epoch": 26.73, + "learning_rate": 0.005942857142857143, + "loss": 0.0916, + "step": 284 + }, + { + "epoch": 26.82, + "learning_rate": 0.005928571428571429, + "loss": 0.0753, + "step": 285 + }, + { + "epoch": 26.92, + "learning_rate": 0.005914285714285714, + "loss": 0.0964, + "step": 286 + }, + { + "epoch": 27.01, + "learning_rate": 0.0059, + "loss": 0.1108, + "step": 287 + }, + { + "epoch": 27.11, + "learning_rate": 0.005885714285714286, + "loss": 0.1062, + "step": 288 + }, + { + "epoch": 27.2, + "learning_rate": 0.005871428571428572, + "loss": 0.0846, + "step": 289 + }, + { + "epoch": 27.29, + "learning_rate": 0.005857142857142858, + "loss": 0.0986, + "step": 290 + }, + { + "epoch": 27.39, + "learning_rate": 0.005842857142857143, + "loss": 0.0713, + "step": 291 + }, + { + "epoch": 27.48, + "learning_rate": 0.005828571428571429, + "loss": 0.0829, + "step": 292 + }, + { + "epoch": 27.58, + 
"learning_rate": 0.0058142857142857145, + "loss": 0.1026, + "step": 293 + }, + { + "epoch": 27.67, + "learning_rate": 0.0058, + "loss": 0.0785, + "step": 294 + }, + { + "epoch": 27.76, + "learning_rate": 0.005785714285714286, + "loss": 0.0729, + "step": 295 + }, + { + "epoch": 27.86, + "learning_rate": 0.005771428571428572, + "loss": 0.0738, + "step": 296 + }, + { + "epoch": 27.95, + "learning_rate": 0.005757142857142857, + "loss": 0.079, + "step": 297 + }, + { + "epoch": 28.05, + "learning_rate": 0.005742857142857143, + "loss": 0.0761, + "step": 298 + }, + { + "epoch": 28.14, + "learning_rate": 0.005728571428571428, + "loss": 0.0792, + "step": 299 + }, + { + "epoch": 28.24, + "learning_rate": 0.005714285714285714, + "loss": 0.0881, + "step": 300 + } + ], + "logging_steps": 1.0, + "max_steps": 700, + "num_train_epochs": 70, + "save_steps": 100, + "total_flos": 3.525522965397504e+17, + "trial_name": null, + "trial_params": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/training_args.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17f9bfbf1a7cdd9e0e808e0672d55ad9ad4efb5f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00669a32a6ddac0a3243bbc04d3f1f70ffc8f89f2626c1fdafa93ce68c311aa0 +size 4664 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..363fcab7ed6e9634e198cf5555ceb88932c9a245 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/config-checkpoint.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/config-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..50d927dc68b4eaa40bd4812b7417b3f2bd61f599 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/config-checkpoint.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm3-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + 
"padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 8192, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/Untitled.ipynb b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/Untitled.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c639653712eb0c035e8a63da023f415678f56a25 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/Untitled.ipynb @@ -0,0 +1,99 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8be7ec39-c93d-4529-bef3-6b65b66a8bcd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.17.3)\n", + "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.9.0)\n", + "Requirement already satisfied: fsspec in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.10.0)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.31.0)\n", + "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.65.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.8.0)\n", + "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.7.22)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install huggingface_hub" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a34bc1d4-4e94-4fa8-9c6d-778ea504b70b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5e2dc9023df04cf390302198d09374c1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='
torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class PrefixEncoder(torch.nn.Module): + """ + The torch.nn model to encode the prefix + Input shape: (batch-size, prefix-length) + Output shape: (batch-size, prefix-length, 2*layers*hidden) + """ + + def __init__(self, config: ChatGLMConfig): + super().__init__() + self.prefix_projection = config.prefix_projection + if self.prefix_projection: + # Use a two-layer MLP to encode the prefix + kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 + self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) + self.trans = torch.nn.Sequential( + torch.nn.Linear(kv_size, config.hidden_size), + torch.nn.Tanh(), + torch.nn.Linear(config.hidden_size, kv_size) + ) + else: + self.embedding = torch.nn.Embedding(config.pre_seq_len, + config.num_layers * config.kv_channels * config.multi_query_group_num * 2) + + def forward(self, prefix: torch.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.trans(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, original_impl=False, device=None, dtype=None): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + + def forward_impl( + self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 + ): + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
+ """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).float() + + cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if dtype in (torch.float16, torch.bfloat16, torch.int8): + cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl( + max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [sq, b, np, hn] + sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:sq] + xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) + rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +class RMSNorm(torch.nn.Module): + def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + return (self.weight * hidden_states).to(input_dtype) + + +class CoreAttention(torch.nn.Module): + def __init__(self, config: ChatGLMConfig, layer_number): + super(CoreAttention, self).__init__() + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. 
+ self.hidden_size_per_partition = projection_size + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + self.coeff = coeff + + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + pytorch_major_version = int(torch.__version__.split('.')[0]) + if pytorch_major_version >= 2: + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + is_causal=True) + else: + if attention_mask is not None: + attention_mask = ~attention_mask + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + attention_mask) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + else: + # Raw attention scores + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocating input tensor: [b * np, sq, sk] + matmul_input_buffer = torch.empty( + output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, + device=query_layer.device + ) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + if self.attention_softmax_in_fp32: + attention_scores = attention_scores.float() + if self.coeff is not None: + attention_scores = attention_scores * self.coeff + if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: + attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], + device=attention_scores.device, dtype=torch.bool) + attention_mask.tril_() + attention_mask = ~attention_mask + if attention_mask is not None: + attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = attention_probs.type_as(value_layer) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer.
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class SelfAttention(torch.nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + + self.projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + self.multi_query_attention = config.multi_query_attention + self.qkv_hidden_size = 3 * self.projection_size + if self.multi_query_attention: + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.qkv_hidden_size = ( + self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num + ) + self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, + bias=config.add_bias_linear or config.add_qkv_bias, + device=device, **_config_to_kwargs(config) + ) + + self.core_attention = CoreAttention(config, self.layer_number) + + # Output. + self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, + device=device, **_config_to_kwargs(config) + ) + + def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): + if self.multi_query_attention: + num_attention_heads = self.num_multi_query_groups_per_partition + else: + num_attention_heads = self.num_attention_heads_per_partition + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=dtype, + device=device, + ) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True + ): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. 
+ # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = torch.cat((cache_k, key_layer), dim=0) + value_layer = torch.cat((cache_v, value_layer), dim=0) + if use_cache: + kv_cache = (key_layer, value_layer) + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = key_layer.unsqueeze(-2) + key_layer = key_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.unsqueeze(-2) + value_layer = value_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def _config_to_kwargs(args): + common_kwargs = { + "dtype": args.torch_dtype, + } + return common_kwargs + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMConfig, device=None): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear( + config.hidden_size, + config.ffn_hidden_size * 2, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.activation_func = swiglu + + # Project back to h. + self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(torch.nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # Self attention. + self.self_attention = SelfAttention(config, layer_number, device=device) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # MLP + self.mlp = MLP(config, device=device) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, + attention_mask, + rotary_pos_emb, + kv_cache=kv_cache, + use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + + return output, kv_cache + + +class GLMTransformer(torch.nn.Module): + """Transformer class.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(GLMTransformer, self).__init__() + + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. 
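+ # [Illustrative aside, not in the upstream file] Layers are numbered from 1 so that the + # query-key layer scaling coefficient in CoreAttention equals the layer index; the kv_caches + # argument of forward() is likewise expected to hold one (key, value) pair per layer, in the + # same order as self.layers.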
+ def build_layer(layer_number): + return GLMBlock(config, layer_number, device=device) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + self.gradient_checkpointing = False + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_layers)] + presents = () if use_cache else None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + for index in range(self.num_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + if self.gradient_checkpointing and self.training: + layer_ret = torch.utils.checkpoint.checkpoint( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches[index], + use_cache + ) + else: + layer_ret = layer( + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache + ) + hidden_states, kv_cache = layer_ret + if use_cache: + presents = presents + (kv_cache,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. 
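+ (Illustrative note, not in the upstream file: _init_weights below is intentionally a no-op, + so every parameter is expected to come from the checkpoint rather than fresh initialization.)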
+ """ + + is_parallelizable = False + supports_gradient_checkpointing = True + config_class = ChatGLMConfig + base_model_prefix = "transformer" + _no_split_modules = ["GLMBlock"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + return + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) + full_attention_mask.tril_() + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, + device=input_ids.device), full_attention_mask), dim=-1) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) + if not past_length and padding_mask is not None: + full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + def get_position_ids(self, input_ids, device): + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + return position_ids + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GLMTransformer): + module.gradient_checkpointing = value + + +class Embedding(torch.nn.Module): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + # Word embeddings (parallel). + self.word_embeddings = nn.Embedding( + config.padded_vocab_size, + self.hidden_size, + dtype=config.torch_dtype, + device=device + ) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + # If the input flag for fp32 residual connection is set, convert for float. 
+ if self.fp32_residual_connection: + embeddings = embeddings.float() + return embeddings + + +class ChatGLMModel(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): + super().__init__(config) + if empty_init: + init_method = skip_init + else: + init_method = default_init + init_kwargs = {} + if device is not None: + init_kwargs["device"] = device + self.embedding = init_method(Embedding, config, **init_kwargs) + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + + # Rotary positional embeddings + self.seq_length = config.seq_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, + dtype=config.torch_dtype) + self.encoder = init_method(GLMTransformer, config, **init_kwargs) + self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, + dtype=config.torch_dtype, **init_kwargs) + self.pre_seq_len = config.pre_seq_len + self.prefix_projection = config.prefix_projection + if self.pre_seq_len is not None: + for param in self.parameters(): + param.requires_grad = False + self.prefix_tokens = torch.arange(self.pre_seq_len).long() + self.prefix_encoder = PrefixEncoder(config) + self.dropout = torch.nn.Dropout(0.1) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def get_prompt(self, batch_size, device, dtype=torch.half): + prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) + past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) + past_key_values = past_key_values.view( + batch_size, + self.pre_seq_len, + self.num_layers * 2, + self.multi_query_group_num, + self.kv_channels + ) + # seq_len, b, nh, hidden_size + past_key_values = self.dropout(past_key_values) + past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) + return past_key_values + + def forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if self.pre_seq_len is not None: + if past_key_values is None: + past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, + dtype=inputs_embeds.dtype) + if attention_mask is not None: + attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), + attention_mask], dim=-1) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary 
positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() + + # Run encoder. + hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def quantize(self, weight_bit_width: int): + from .quantization import quantize + quantize(self.encoder, weight_bit_width) + return self + + +class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.max_sequence_length = config.max_length + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + self.config = config + self.quantized = False + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat( + [position_ids, new_position_id], dim=-1 + ) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids, device=input_ids.device) + if not is_first_forward: + if past_key_values is not None: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": use_cache + } + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: 
Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + if return_last_logit: + hidden_states = hidden_states[-1:] + lm_logits = self.transformer.output_layer(hidden_states) + lm_logits = lm_logits.transpose(0, 1).contiguous() + + loss = None + if labels is not None: + lm_logits = lm_logits.to(torch.float32) + + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + lm_logits = lm_logits.to(hidden_states.dtype) + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache( + past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. 
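+ (Illustrative note, not in the upstream file: the caches here are laid out sequence-first as + [sq, b, nh, hn], which is why index_select runs over dim 1, the batch/beam dimension, rather + than dim 0.)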
+ """ + return tuple( + ( + layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), + layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), + ) + for layer_past in past + ) + + def process_response(self, output, history): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content, history + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + inputs = tokenizer.build_chat_input(query, history=history, role=role) + inputs = inputs.to(self.device) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + history.append({"role": role, "content": query}) + response, history = self.process_response(response, history) + return response, history + + @torch.inference_mode() + def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, + logits_processor=None, return_past_key_values=False, **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + if past_key_values is None: + inputs = tokenizer.build_chat_input(query, history=history, role=role) + else: + inputs = tokenizer.build_chat_input(query, role=role) + inputs = inputs.to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[0] + if self.transformer.pre_seq_len is not None: + past_length -= self.transformer.pre_seq_len + inputs.position_ids += past_length + attention_mask = inputs.attention_mask + attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) + inputs['attention_mask'] = attention_mask + history.append({"role": role, "content": query}) + for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **gen_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + if response and response[-1] != "�": + response, new_history = self.process_response(response, history) + if return_past_key_values: + yield response, new_history, past_key_values + else: + yield response, new_history + + @torch.inference_mode() + def stream_generate( + self, + input_ids, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values=False, + **kwargs, + ): + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + + if generation_config is None: + generation_config = self.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + model_kwargs["use_cache"] = generation_config.use_cache + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if has_default_max_length and generation_config.max_new_tokens is None: + warnings.warn( + f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " + "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + " recommend using `max_new_tokens` to control the maximum length of the generation.", + UserWarning, + ) + elif generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + if not has_default_max_length: + logger.warn( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", + UserWarning, + ) + + if input_ids_seq_length >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + + # 2. 
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + logits_warper = self._get_logits_warper(generation_config) + + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + while True: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break + + def quantize(self, bits: int, empty_init=False, device=None, **kwargs): + if bits == 0: + return + + from .quantization import quantize + + if self.quantized: + logger.info("Already quantized.") + return self + + self.quantized = True + + self.config.quantization_bit = bits + + self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, + **kwargs) + return self + + +class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.num_labels = config.num_labels + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + + self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) + if config.classifier_dropout is not None: + self.dropout = nn.Dropout(config.classifier_dropout) + else: + self.dropout = None + self.config = config + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: 
Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + pooled_hidden_states = hidden_states[-1] + if self.dropout is not None: + pooled_hidden_states = self.dropout(pooled_hidden_states) + logits = self.classifier_head(pooled_hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze().float(), labels.squeeze()) + else: + loss = loss_fct(logits.float(), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/optimizer.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcf315d03d4eaeff2d678e8793973659a8ad1855 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:400e8a5e274a768dc0b9682a0501d298708c312857f56088f2a4a2def65fc62e +size 14682210 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/pytorch_model.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e3d93d3ca55b0927dee612ec601af9dbca54237b --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06951d06cd2537b8b6927e793abc5f91f918d00518df2a8282449b078a3a4a11 +size 7341306 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/quantization.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..cb95bfe82b203ff6a2aa962326d2c7a438d6a52f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/quantization.py @@ -0,0 
+1,188 @@ +from torch.nn import Linear +from torch.nn.parameter import Parameter + +import bz2 +import torch +import base64 +import ctypes +from transformers.utils import logging + +from typing import List +from functools import partial + +logger = logging.get_logger(__name__) + +try: + from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up + + class Kernel: + def __init__(self, code: bytes, function_names: List[str]): + self.code = code + self._function_names = function_names + self._cmodule = LazyKernelCModule(self.code) + + for name in self._function_names: + setattr(self, name, KernelFunction(self._cmodule, name)) + + quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl
4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzf
ZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ" + + kernels = Kernel( + bz2.decompress(base64.b64decode(quantization_code)), + [ + "int4WeightCompression", + "int4WeightExtractionFloat", + "int4WeightExtractionHalf", + "int8WeightExtractionFloat", + "int8WeightExtractionHalf", + ], + ) +except Exception as exception: + kernels = None + logger.warning("Failed to load cpm_kernels:" + str(exception)) + + +class W8A16Linear(torch.autograd.Function): + @staticmethod + def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): + ctx.inp_shape = inp.size() + ctx.weight_bit_width = weight_bit_width + out_features = quant_w.size(0) + inp = inp.contiguous().view(-1, inp.size(-1)) + weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) + ctx.weight_shape = weight.size() + output = inp.mm(weight.t()) + ctx.save_for_backward(inp, quant_w, scale_w) + return output.view(*(ctx.inp_shape[:-1] + (out_features,))) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + inp, quant_w, scale_w = ctx.saved_tensors + weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) + grad_output = grad_output.contiguous().view(-1, weight.size(0)) + grad_input = grad_output.mm(weight) + grad_weight = grad_output.t().mm(inp) + return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None + + +def compress_int4_weight(weight: torch.Tensor): # (n, m) + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + assert m % 2 == 0 + m = m // 2 + out = torch.empty(n, m, dtype=torch.int8, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + kernels.int4WeightCompression( + gridDim, + blockDim, + 0, + stream, + [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], + ) + return out + + +def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): + assert scale_list.dtype in [torch.half, torch.bfloat16] + assert weight.dtype in [torch.int8] + if source_bit_width == 8: + return weight.to(scale_list.dtype) * scale_list[:, None] + elif source_bit_width == 4: + func = ( + kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16 + ) + else: + assert False, "Unsupported bit-width" + + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = 
(min(round_up(m, 32), 1024), 1, 1)
+
+        # NOTE: the int4 bfloat16 branch above selects `int4WeightExtractionBFloat16`,
+        # which is not among the kernel names compiled into `kernels`; only the
+        # half/float extraction kernels are registered.
+        func(
+            gridDim,
+            blockDim,
+            0,
+            stream,
+            [
+                ctypes.c_void_p(weight.data_ptr()),
+                ctypes.c_void_p(scale_list.data_ptr()),
+                ctypes.c_void_p(out.data_ptr()),
+                ctypes.c_int32(n),
+                ctypes.c_int32(m),
+            ],
+        )
+    return out
+
+
+class QuantizedLinear(torch.nn.Module):
+    def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
+                 **kwargs):
+        super().__init__()
+        self.weight_bit_width = weight_bit_width
+
+        # Buffer shapes are derived from the reference weight, so a weight tensor
+        # must be supplied even when empty_init is requested (the former
+        # `weight is None` branch dereferenced `weight.shape` and could never run).
+        assert weight is not None, "QuantizedLinear requires a reference weight tensor"
+        shape = weight.shape
+
+        if empty_init:
+            self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
+            self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
+        else:
+            # Symmetric per-row quantization: scale each row by its absolute maximum.
+            self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
+            self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
+            if weight_bit_width == 4:
+                self.weight = compress_int4_weight(self.weight)
+
+        self.weight = Parameter(self.weight.to(device), requires_grad=False)
+        self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
+        self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None
+
+    def forward(self, input):
+        output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
+        if self.bias is not None:
+            output = output + self.bias
+        return output
+
+
+def quantize(model, weight_bit_width, empty_init=False, device=None):
+    """Replace fp16 linear layers with quantized linear layers."""
+    for layer in model.layers:
+        layer.self_attention.query_key_value = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
+            bias=layer.self_attention.query_key_value.bias,
+            dtype=layer.self_attention.query_key_value.weight.dtype,
+            device=layer.self_attention.query_key_value.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.self_attention.dense = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
+            bias=layer.self_attention.dense.bias,
+            dtype=layer.self_attention.dense.weight.dtype,
+            device=layer.self_attention.dense.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.mlp.dense_h_to_4h = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
+            bias=layer.mlp.dense_h_to_4h.bias,
+            dtype=layer.mlp.dense_h_to_4h.weight.dtype,
+            device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.mlp.dense_4h_to_h = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
+            bias=layer.mlp.dense_4h_to_h.bias,
+            dtype=layer.mlp.dense_4h_to_h.weight.dtype,
+            device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+
+    return model
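+
+# Usage sketch (illustrative; the paths and attribute names here are assumptions,
+# not part of this file): `quantize()` expects the module that owns the `.layers`
+# list, which for ChatGLM3 is the transformer encoder:
+#
+#     model = AutoModel.from_pretrained("chatglm3-6b", trust_remote_code=True).half().cuda()
+#     model.transformer.encoder = quantize(model.transformer.encoder, weight_bit_width=4)
+#
+# The modeling code normally wraps this as `model.quantize(4)`. With
+# weight_bit_width=4 the packed weight stores two nibbles per int8 byte, so it
+# ends up with shape (out_features, in_features // 2).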
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/rng_state.pth b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..5367c96ab2f1fef1e6f78de4c3cb38f6b50f37d5
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc0cb6255ebbd22879226b5a84d22302be0dc04a17df5c3e33192cc7f59bf84e
+size 14244
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/scheduler.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7d467cfe25a20f7e6079f110ede561db40957ece
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acca6ccdfe33e0ab0bbeb7e3423cfda45eccc967cbf8152c875aec962ac04588
+size 1064
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/special_tokens_map.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/special_tokens_map.json
@@ -0,0 +1 @@
+{}
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenization_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenization_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenization_chatglm.py
@@ -0,0 +1,300 @@
+import json
+import os
+import re
+from typing import List, Optional, Union, Dict
+from sentencepiece import SentencePieceProcessor
+from transformers import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+
+
+class SPTokenizer:
+    def __init__(self, model_path: str):
+        # reload tokenizer
+        assert os.path.isfile(model_path), model_path
+        self.sp_model = SentencePieceProcessor(model_file=model_path)
+
+        # BOS / EOS token IDs
+        self.n_words: int = self.sp_model.vocab_size()
+        self.bos_id: int = self.sp_model.bos_id()
+        self.eos_id: int = self.sp_model.eos_id()
+        self.pad_id: int = self.sp_model.unk_id()
+        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
+
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        self.special_tokens = {}
+        self.index_special_tokens = {}
+        for token in special_tokens:
+            self.special_tokens[token] = self.n_words
+            self.index_special_tokens[self.n_words] = token
+            self.n_words += 1
+        self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens])
+
+    def tokenize(self, s: str, encode_special_tokens=False):
+        if encode_special_tokens:
+            last_index = 0
+            t = []
+            for match in re.finditer(self.role_special_token_expression, s):
+                if last_index < match.start():
+                    t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
+                t.append(s[match.start():match.end()])
+                last_index = match.end()
+            if last_index < len(s):
+                t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
+            return t
+        else:
+            return self.sp_model.EncodeAsPieces(s)
+
+    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
+        assert type(s) is str
+        t = self.sp_model.encode(s)
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+        return t
+
+    def decode(self, t: List[int]) -> str:
+        text, buffer = "", []
+        for token in t:
+            if token in self.index_special_tokens:
+                if buffer:
+                    text += self.sp_model.decode(buffer)
+                    buffer = []
+                text += self.index_special_tokens[token]
+            else:
+                buffer.append(token)
+        if buffer:
+            text += self.sp_model.decode(buffer)
+        return text
+
+    def decode_tokens(self, tokens: List[str]) -> str:
+        text = self.sp_model.DecodePieces(tokens)
+        return text
+
+    def convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        return self.sp_model.PieceToId(token)
+
+    def convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        if index in self.index_special_tokens:
+            return self.index_special_tokens[index]
+        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
+            return ""
+        return self.sp_model.IdToPiece(index)
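+
+# Example (illustrative): with encode_special_tokens=True, role markers are kept
+# as single pieces instead of being split apart by SentencePiece:
+#
+#     sp = SPTokenizer("tokenizer.model")  # model path is an assumption
+#     sp.tokenize("<|user|>hi", encode_special_tokens=True)
+#     # -> ["<|user|>", "▁hi"]; the role marker later maps to one reserved id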
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "tokenizer.model"}
+
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
+                 **kwargs):
+        self.name = "GLMTokenizer"
+
+        self.vocab_file = vocab_file
+        self.tokenizer = SPTokenizer(vocab_file)
+        self.special_tokens = {
+            "<bos>": self.tokenizer.bos_id,
+            "<eos>": self.tokenizer.eos_id,
+            "<pad>": self.tokenizer.pad_id
+        }
+        self.encode_special_tokens = encode_special_tokens
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                         encode_special_tokens=encode_special_tokens,
+                         **kwargs)
+
+    def get_command(self, token):
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
+        return self.tokenizer.special_tokens[token]
+
+    @property
+    def unk_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token_id(self):
+        return self.get_command("<pad>")
+
+    @property
+    def eos_token(self) -> str:
+        return "</s>"
+
+    @property
+    def eos_token_id(self):
+        return self.get_command("<eos>")
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_words
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text, **kwargs):
+        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        return self.tokenizer.convert_token_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        return self.tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return self.tokenizer.decode_tokens(tokens)
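+
+    # Example (illustrative): round-tripping tokens through this tokenizer:
+    #
+    #     tok.get_command("[gMASK]")  # reserved id just above the SentencePiece vocab
+    #     tok.convert_tokens_to_string(["▁Hello", ",", "▁world"])  # -> "Hello, world"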
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer.model b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5faafd38f2e2934804feb0e7d71ebf08b0839bf5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer_config.json @@ -0,0 +1,18 @@ +{ + "added_tokens_decoder": {}, + "additional_special_tokens": [], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/trainer_state.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e119522202901d2c6817509eb52a0b5aec492e91 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/trainer_state.json @@ -0,0 +1,2419 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 37.64705882352941, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "learning_rate": 0.009985714285714285, + "loss": 2.6971, + "step": 1 + }, + { + "epoch": 
0.19, + "learning_rate": 0.009971428571428572, + "loss": 2.3927, + "step": 2 + }, + { + "epoch": 0.28, + "learning_rate": 0.009957142857142857, + "loss": 2.2539, + "step": 3 + }, + { + "epoch": 0.38, + "learning_rate": 0.009942857142857144, + "loss": 2.1408, + "step": 4 + }, + { + "epoch": 0.47, + "learning_rate": 0.009928571428571429, + "loss": 2.2672, + "step": 5 + }, + { + "epoch": 0.56, + "learning_rate": 0.009914285714285714, + "loss": 1.6433, + "step": 6 + }, + { + "epoch": 0.66, + "learning_rate": 0.0099, + "loss": 2.1405, + "step": 7 + }, + { + "epoch": 0.75, + "learning_rate": 0.009885714285714286, + "loss": 2.1464, + "step": 8 + }, + { + "epoch": 0.85, + "learning_rate": 0.009871428571428571, + "loss": 1.8498, + "step": 9 + }, + { + "epoch": 0.94, + "learning_rate": 0.009857142857142858, + "loss": 1.6896, + "step": 10 + }, + { + "epoch": 1.04, + "learning_rate": 0.009842857142857143, + "loss": 2.1932, + "step": 11 + }, + { + "epoch": 1.13, + "learning_rate": 0.00982857142857143, + "loss": 1.8236, + "step": 12 + }, + { + "epoch": 1.22, + "learning_rate": 0.009814285714285715, + "loss": 1.735, + "step": 13 + }, + { + "epoch": 1.32, + "learning_rate": 0.0098, + "loss": 1.7488, + "step": 14 + }, + { + "epoch": 1.41, + "learning_rate": 0.009785714285714285, + "loss": 1.8336, + "step": 15 + }, + { + "epoch": 1.51, + "learning_rate": 0.009771428571428572, + "loss": 1.9438, + "step": 16 + }, + { + "epoch": 1.6, + "learning_rate": 0.009757142857142858, + "loss": 1.7178, + "step": 17 + }, + { + "epoch": 1.69, + "learning_rate": 0.009742857142857143, + "loss": 1.5714, + "step": 18 + }, + { + "epoch": 1.79, + "learning_rate": 0.009728571428571428, + "loss": 1.537, + "step": 19 + }, + { + "epoch": 1.88, + "learning_rate": 0.009714285714285715, + "loss": 1.6764, + "step": 20 + }, + { + "epoch": 1.98, + "learning_rate": 0.0097, + "loss": 1.8919, + "step": 21 + }, + { + "epoch": 2.07, + "learning_rate": 0.009685714285714285, + "loss": 1.346, + "step": 22 + }, + { + "epoch": 2.16, + "learning_rate": 0.009671428571428572, + "loss": 1.5036, + "step": 23 + }, + { + "epoch": 2.26, + "learning_rate": 0.009657142857142857, + "loss": 1.6788, + "step": 24 + }, + { + "epoch": 2.35, + "learning_rate": 0.009642857142857144, + "loss": 1.6667, + "step": 25 + }, + { + "epoch": 2.45, + "learning_rate": 0.009628571428571429, + "loss": 1.7153, + "step": 26 + }, + { + "epoch": 2.54, + "learning_rate": 0.009614285714285714, + "loss": 1.601, + "step": 27 + }, + { + "epoch": 2.64, + "learning_rate": 0.0096, + "loss": 1.3002, + "step": 28 + }, + { + "epoch": 2.73, + "learning_rate": 0.009585714285714286, + "loss": 1.3294, + "step": 29 + }, + { + "epoch": 2.82, + "learning_rate": 0.009571428571428573, + "loss": 1.7477, + "step": 30 + }, + { + "epoch": 2.92, + "learning_rate": 0.009557142857142858, + "loss": 1.7961, + "step": 31 + }, + { + "epoch": 3.01, + "learning_rate": 0.009542857142857143, + "loss": 1.4954, + "step": 32 + }, + { + "epoch": 3.11, + "learning_rate": 0.009528571428571428, + "loss": 1.6452, + "step": 33 + }, + { + "epoch": 3.2, + "learning_rate": 0.009514285714285715, + "loss": 1.3528, + "step": 34 + }, + { + "epoch": 3.29, + "learning_rate": 0.0095, + "loss": 1.4811, + "step": 35 + }, + { + "epoch": 3.39, + "learning_rate": 0.009485714285714287, + "loss": 1.4738, + "step": 36 + }, + { + "epoch": 3.48, + "learning_rate": 0.009471428571428572, + "loss": 1.174, + "step": 37 + }, + { + "epoch": 3.58, + "learning_rate": 0.009457142857142857, + "loss": 1.2346, + "step": 38 + }, + { + "epoch": 3.67, + 
"learning_rate": 0.009442857142857143, + "loss": 1.5327, + "step": 39 + }, + { + "epoch": 3.76, + "learning_rate": 0.009428571428571429, + "loss": 1.5249, + "step": 40 + }, + { + "epoch": 3.86, + "learning_rate": 0.009414285714285714, + "loss": 1.5086, + "step": 41 + }, + { + "epoch": 3.95, + "learning_rate": 0.0094, + "loss": 1.8425, + "step": 42 + }, + { + "epoch": 4.05, + "learning_rate": 0.009385714285714287, + "loss": 1.1943, + "step": 43 + }, + { + "epoch": 4.14, + "learning_rate": 0.009371428571428572, + "loss": 1.6835, + "step": 44 + }, + { + "epoch": 4.24, + "learning_rate": 0.009357142857142857, + "loss": 1.75, + "step": 45 + }, + { + "epoch": 4.33, + "learning_rate": 0.009342857142857142, + "loss": 1.2561, + "step": 46 + }, + { + "epoch": 4.42, + "learning_rate": 0.009328571428571429, + "loss": 1.3784, + "step": 47 + }, + { + "epoch": 4.52, + "learning_rate": 0.009314285714285714, + "loss": 1.2538, + "step": 48 + }, + { + "epoch": 4.61, + "learning_rate": 0.009300000000000001, + "loss": 1.4429, + "step": 49 + }, + { + "epoch": 4.71, + "learning_rate": 0.009285714285714286, + "loss": 1.3687, + "step": 50 + }, + { + "epoch": 4.8, + "learning_rate": 0.009271428571428571, + "loss": 1.1511, + "step": 51 + }, + { + "epoch": 4.89, + "learning_rate": 0.009257142857142858, + "loss": 1.181, + "step": 52 + }, + { + "epoch": 4.99, + "learning_rate": 0.009242857142857143, + "loss": 1.1753, + "step": 53 + }, + { + "epoch": 5.08, + "learning_rate": 0.009228571428571428, + "loss": 1.1562, + "step": 54 + }, + { + "epoch": 5.18, + "learning_rate": 0.009214285714285715, + "loss": 1.2936, + "step": 55 + }, + { + "epoch": 5.27, + "learning_rate": 0.0092, + "loss": 1.3591, + "step": 56 + }, + { + "epoch": 5.36, + "learning_rate": 0.009185714285714287, + "loss": 1.1376, + "step": 57 + }, + { + "epoch": 5.46, + "learning_rate": 0.009171428571428572, + "loss": 1.372, + "step": 58 + }, + { + "epoch": 5.55, + "learning_rate": 0.009157142857142857, + "loss": 1.5141, + "step": 59 + }, + { + "epoch": 5.65, + "learning_rate": 0.009142857142857144, + "loss": 1.2087, + "step": 60 + }, + { + "epoch": 5.74, + "learning_rate": 0.009128571428571429, + "loss": 1.136, + "step": 61 + }, + { + "epoch": 5.84, + "learning_rate": 0.009114285714285715, + "loss": 1.2948, + "step": 62 + }, + { + "epoch": 5.93, + "learning_rate": 0.0091, + "loss": 1.0592, + "step": 63 + }, + { + "epoch": 6.02, + "learning_rate": 0.009085714285714286, + "loss": 1.2321, + "step": 64 + }, + { + "epoch": 6.12, + "learning_rate": 0.009071428571428572, + "loss": 1.0827, + "step": 65 + }, + { + "epoch": 6.21, + "learning_rate": 0.009057142857142857, + "loss": 1.1136, + "step": 66 + }, + { + "epoch": 6.31, + "learning_rate": 0.009042857142857142, + "loss": 1.475, + "step": 67 + }, + { + "epoch": 6.4, + "learning_rate": 0.009028571428571427, + "loss": 1.1316, + "step": 68 + }, + { + "epoch": 6.49, + "learning_rate": 0.009014285714285714, + "loss": 1.1688, + "step": 69 + }, + { + "epoch": 6.59, + "learning_rate": 0.009000000000000001, + "loss": 1.0882, + "step": 70 + }, + { + "epoch": 6.68, + "learning_rate": 0.008985714285714286, + "loss": 1.1085, + "step": 71 + }, + { + "epoch": 6.78, + "learning_rate": 0.008971428571428571, + "loss": 1.2029, + "step": 72 + }, + { + "epoch": 6.87, + "learning_rate": 0.008957142857142856, + "loss": 1.098, + "step": 73 + }, + { + "epoch": 6.96, + "learning_rate": 0.008942857142857143, + "loss": 1.219, + "step": 74 + }, + { + "epoch": 7.06, + "learning_rate": 0.00892857142857143, + "loss": 1.0092, + "step": 75 + }, + { 
+ "epoch": 7.15, + "learning_rate": 0.008914285714285715, + "loss": 1.0112, + "step": 76 + }, + { + "epoch": 7.25, + "learning_rate": 0.0089, + "loss": 1.1481, + "step": 77 + }, + { + "epoch": 7.34, + "learning_rate": 0.008885714285714287, + "loss": 0.9873, + "step": 78 + }, + { + "epoch": 7.44, + "learning_rate": 0.008871428571428572, + "loss": 1.0586, + "step": 79 + }, + { + "epoch": 7.53, + "learning_rate": 0.008857142857142857, + "loss": 1.1177, + "step": 80 + }, + { + "epoch": 7.62, + "learning_rate": 0.008842857142857142, + "loss": 0.7814, + "step": 81 + }, + { + "epoch": 7.72, + "learning_rate": 0.008828571428571429, + "loss": 1.2043, + "step": 82 + }, + { + "epoch": 7.81, + "learning_rate": 0.008814285714285715, + "loss": 1.0062, + "step": 83 + }, + { + "epoch": 7.91, + "learning_rate": 0.0088, + "loss": 1.0831, + "step": 84 + }, + { + "epoch": 8.0, + "learning_rate": 0.008785714285714286, + "loss": 0.9554, + "step": 85 + }, + { + "epoch": 8.09, + "learning_rate": 0.00877142857142857, + "loss": 1.1674, + "step": 86 + }, + { + "epoch": 8.19, + "learning_rate": 0.008757142857142857, + "loss": 0.8226, + "step": 87 + }, + { + "epoch": 8.28, + "learning_rate": 0.008742857142857144, + "loss": 0.9166, + "step": 88 + }, + { + "epoch": 8.38, + "learning_rate": 0.00872857142857143, + "loss": 0.734, + "step": 89 + }, + { + "epoch": 8.47, + "learning_rate": 0.008714285714285714, + "loss": 0.8641, + "step": 90 + }, + { + "epoch": 8.56, + "learning_rate": 0.0087, + "loss": 0.9517, + "step": 91 + }, + { + "epoch": 8.66, + "learning_rate": 0.008685714285714286, + "loss": 0.9995, + "step": 92 + }, + { + "epoch": 8.75, + "learning_rate": 0.008671428571428571, + "loss": 0.763, + "step": 93 + }, + { + "epoch": 8.85, + "learning_rate": 0.008657142857142858, + "loss": 1.0712, + "step": 94 + }, + { + "epoch": 8.94, + "learning_rate": 0.008642857142857143, + "loss": 1.1111, + "step": 95 + }, + { + "epoch": 9.04, + "learning_rate": 0.008628571428571428, + "loss": 0.9626, + "step": 96 + }, + { + "epoch": 9.13, + "learning_rate": 0.008614285714285715, + "loss": 0.6385, + "step": 97 + }, + { + "epoch": 9.22, + "learning_rate": 0.0086, + "loss": 0.8147, + "step": 98 + }, + { + "epoch": 9.32, + "learning_rate": 0.008585714285714285, + "loss": 0.8109, + "step": 99 + }, + { + "epoch": 9.41, + "learning_rate": 0.008571428571428572, + "loss": 1.0953, + "step": 100 + }, + { + "epoch": 9.51, + "learning_rate": 0.008557142857142859, + "loss": 0.7104, + "step": 101 + }, + { + "epoch": 9.6, + "learning_rate": 0.008542857142857144, + "loss": 0.9672, + "step": 102 + }, + { + "epoch": 9.69, + "learning_rate": 0.008528571428571429, + "loss": 0.7593, + "step": 103 + }, + { + "epoch": 9.79, + "learning_rate": 0.008514285714285714, + "loss": 1.0186, + "step": 104 + }, + { + "epoch": 9.88, + "learning_rate": 0.0085, + "loss": 0.7898, + "step": 105 + }, + { + "epoch": 9.98, + "learning_rate": 0.008485714285714286, + "loss": 0.7392, + "step": 106 + }, + { + "epoch": 10.07, + "learning_rate": 0.008471428571428572, + "loss": 0.7295, + "step": 107 + }, + { + "epoch": 10.16, + "learning_rate": 0.008457142857142858, + "loss": 0.7211, + "step": 108 + }, + { + "epoch": 10.26, + "learning_rate": 0.008442857142857143, + "loss": 0.769, + "step": 109 + }, + { + "epoch": 10.35, + "learning_rate": 0.00842857142857143, + "loss": 0.718, + "step": 110 + }, + { + "epoch": 10.45, + "learning_rate": 0.008414285714285714, + "loss": 0.6411, + "step": 111 + }, + { + "epoch": 10.54, + "learning_rate": 0.0084, + "loss": 0.8016, + "step": 112 + }, + { + 
"epoch": 10.64, + "learning_rate": 0.008385714285714286, + "loss": 0.6633, + "step": 113 + }, + { + "epoch": 10.73, + "learning_rate": 0.008371428571428571, + "loss": 0.7257, + "step": 114 + }, + { + "epoch": 10.82, + "learning_rate": 0.008357142857142858, + "loss": 0.7785, + "step": 115 + }, + { + "epoch": 10.92, + "learning_rate": 0.008342857142857143, + "loss": 0.8927, + "step": 116 + }, + { + "epoch": 11.01, + "learning_rate": 0.008328571428571428, + "loss": 0.7242, + "step": 117 + }, + { + "epoch": 11.11, + "learning_rate": 0.008314285714285715, + "loss": 0.8297, + "step": 118 + }, + { + "epoch": 11.2, + "learning_rate": 0.0083, + "loss": 0.6761, + "step": 119 + }, + { + "epoch": 11.29, + "learning_rate": 0.008285714285714287, + "loss": 0.6699, + "step": 120 + }, + { + "epoch": 11.39, + "learning_rate": 0.008271428571428572, + "loss": 0.5365, + "step": 121 + }, + { + "epoch": 11.48, + "learning_rate": 0.008257142857142857, + "loss": 0.9045, + "step": 122 + }, + { + "epoch": 11.58, + "learning_rate": 0.008242857142857144, + "loss": 0.5071, + "step": 123 + }, + { + "epoch": 11.67, + "learning_rate": 0.008228571428571429, + "loss": 0.6472, + "step": 124 + }, + { + "epoch": 11.76, + "learning_rate": 0.008214285714285714, + "loss": 0.6232, + "step": 125 + }, + { + "epoch": 11.86, + "learning_rate": 0.008199999999999999, + "loss": 0.4905, + "step": 126 + }, + { + "epoch": 11.95, + "learning_rate": 0.008185714285714286, + "loss": 0.557, + "step": 127 + }, + { + "epoch": 12.05, + "learning_rate": 0.008171428571428573, + "loss": 0.5517, + "step": 128 + }, + { + "epoch": 12.14, + "learning_rate": 0.008157142857142858, + "loss": 0.6321, + "step": 129 + }, + { + "epoch": 12.24, + "learning_rate": 0.008142857142857143, + "loss": 0.6619, + "step": 130 + }, + { + "epoch": 12.33, + "learning_rate": 0.008128571428571428, + "loss": 0.5524, + "step": 131 + }, + { + "epoch": 12.42, + "learning_rate": 0.008114285714285715, + "loss": 0.4688, + "step": 132 + }, + { + "epoch": 12.52, + "learning_rate": 0.008100000000000001, + "loss": 0.3717, + "step": 133 + }, + { + "epoch": 12.61, + "learning_rate": 0.008085714285714286, + "loss": 0.5118, + "step": 134 + }, + { + "epoch": 12.71, + "learning_rate": 0.008071428571428571, + "loss": 0.4521, + "step": 135 + }, + { + "epoch": 12.8, + "learning_rate": 0.008057142857142856, + "loss": 0.5865, + "step": 136 + }, + { + "epoch": 12.89, + "learning_rate": 0.008042857142857143, + "loss": 0.5977, + "step": 137 + }, + { + "epoch": 12.99, + "learning_rate": 0.008028571428571428, + "loss": 0.6977, + "step": 138 + }, + { + "epoch": 13.08, + "learning_rate": 0.008014285714285713, + "loss": 0.5625, + "step": 139 + }, + { + "epoch": 13.18, + "learning_rate": 0.008, + "loss": 0.3611, + "step": 140 + }, + { + "epoch": 13.27, + "learning_rate": 0.007985714285714287, + "loss": 0.5168, + "step": 141 + }, + { + "epoch": 13.36, + "learning_rate": 0.007971428571428572, + "loss": 0.4429, + "step": 142 + }, + { + "epoch": 13.46, + "learning_rate": 0.007957142857142857, + "loss": 0.4998, + "step": 143 + }, + { + "epoch": 13.55, + "learning_rate": 0.007942857142857142, + "loss": 0.4437, + "step": 144 + }, + { + "epoch": 13.65, + "learning_rate": 0.007928571428571429, + "loss": 0.4958, + "step": 145 + }, + { + "epoch": 13.74, + "learning_rate": 0.007914285714285716, + "loss": 0.4021, + "step": 146 + }, + { + "epoch": 13.84, + "learning_rate": 0.0079, + "loss": 0.6163, + "step": 147 + }, + { + "epoch": 13.93, + "learning_rate": 0.007885714285714286, + "loss": 0.406, + "step": 148 + }, + { + 
"epoch": 14.02, + "learning_rate": 0.007871428571428571, + "loss": 0.4905, + "step": 149 + }, + { + "epoch": 14.12, + "learning_rate": 0.007857142857142858, + "loss": 0.3824, + "step": 150 + }, + { + "epoch": 14.21, + "learning_rate": 0.007842857142857143, + "loss": 0.3591, + "step": 151 + }, + { + "epoch": 14.31, + "learning_rate": 0.007828571428571428, + "loss": 0.342, + "step": 152 + }, + { + "epoch": 14.4, + "learning_rate": 0.007814285714285715, + "loss": 0.4565, + "step": 153 + }, + { + "epoch": 14.49, + "learning_rate": 0.0078000000000000005, + "loss": 0.3287, + "step": 154 + }, + { + "epoch": 14.59, + "learning_rate": 0.007785714285714286, + "loss": 0.4179, + "step": 155 + }, + { + "epoch": 14.68, + "learning_rate": 0.0077714285714285715, + "loss": 0.3586, + "step": 156 + }, + { + "epoch": 14.78, + "learning_rate": 0.007757142857142857, + "loss": 0.4618, + "step": 157 + }, + { + "epoch": 14.87, + "learning_rate": 0.0077428571428571425, + "loss": 0.4133, + "step": 158 + }, + { + "epoch": 14.96, + "learning_rate": 0.007728571428571429, + "loss": 0.4326, + "step": 159 + }, + { + "epoch": 15.06, + "learning_rate": 0.007714285714285715, + "loss": 0.3838, + "step": 160 + }, + { + "epoch": 15.15, + "learning_rate": 0.0077, + "loss": 0.2978, + "step": 161 + }, + { + "epoch": 15.25, + "learning_rate": 0.007685714285714286, + "loss": 0.3993, + "step": 162 + }, + { + "epoch": 15.34, + "learning_rate": 0.007671428571428571, + "loss": 0.3249, + "step": 163 + }, + { + "epoch": 15.44, + "learning_rate": 0.007657142857142857, + "loss": 0.2796, + "step": 164 + }, + { + "epoch": 15.53, + "learning_rate": 0.007642857142857142, + "loss": 0.3918, + "step": 165 + }, + { + "epoch": 15.62, + "learning_rate": 0.007628571428571429, + "loss": 0.4122, + "step": 166 + }, + { + "epoch": 15.72, + "learning_rate": 0.007614285714285715, + "loss": 0.3403, + "step": 167 + }, + { + "epoch": 15.81, + "learning_rate": 0.0076, + "loss": 0.3759, + "step": 168 + }, + { + "epoch": 15.91, + "learning_rate": 0.007585714285714286, + "loss": 0.3621, + "step": 169 + }, + { + "epoch": 16.0, + "learning_rate": 0.007571428571428571, + "loss": 0.2991, + "step": 170 + }, + { + "epoch": 16.09, + "learning_rate": 0.007557142857142857, + "loss": 0.3039, + "step": 171 + }, + { + "epoch": 16.19, + "learning_rate": 0.007542857142857144, + "loss": 0.4571, + "step": 172 + }, + { + "epoch": 16.28, + "learning_rate": 0.007528571428571429, + "loss": 0.2759, + "step": 173 + }, + { + "epoch": 16.38, + "learning_rate": 0.007514285714285715, + "loss": 0.2835, + "step": 174 + }, + { + "epoch": 16.47, + "learning_rate": 0.0075, + "loss": 0.3221, + "step": 175 + }, + { + "epoch": 16.56, + "learning_rate": 0.007485714285714286, + "loss": 0.3072, + "step": 176 + }, + { + "epoch": 16.66, + "learning_rate": 0.007471428571428572, + "loss": 0.2852, + "step": 177 + }, + { + "epoch": 16.75, + "learning_rate": 0.007457142857142857, + "loss": 0.2559, + "step": 178 + }, + { + "epoch": 16.85, + "learning_rate": 0.007442857142857143, + "loss": 0.2787, + "step": 179 + }, + { + "epoch": 16.94, + "learning_rate": 0.007428571428571429, + "loss": 0.3331, + "step": 180 + }, + { + "epoch": 17.04, + "learning_rate": 0.007414285714285714, + "loss": 0.1929, + "step": 181 + }, + { + "epoch": 17.13, + "learning_rate": 0.0074, + "loss": 0.2065, + "step": 182 + }, + { + "epoch": 17.22, + "learning_rate": 0.007385714285714285, + "loss": 0.2868, + "step": 183 + }, + { + "epoch": 17.32, + "learning_rate": 0.007371428571428571, + "loss": 0.2206, + "step": 184 + }, + { + "epoch": 
17.41, + "learning_rate": 0.007357142857142858, + "loss": 0.2355, + "step": 185 + }, + { + "epoch": 17.51, + "learning_rate": 0.007342857142857143, + "loss": 0.3041, + "step": 186 + }, + { + "epoch": 17.6, + "learning_rate": 0.007328571428571429, + "loss": 0.3028, + "step": 187 + }, + { + "epoch": 17.69, + "learning_rate": 0.007314285714285714, + "loss": 0.2435, + "step": 188 + }, + { + "epoch": 17.79, + "learning_rate": 0.0073, + "loss": 0.1869, + "step": 189 + }, + { + "epoch": 17.88, + "learning_rate": 0.007285714285714285, + "loss": 0.3036, + "step": 190 + }, + { + "epoch": 17.98, + "learning_rate": 0.007271428571428571, + "loss": 0.246, + "step": 191 + }, + { + "epoch": 18.07, + "learning_rate": 0.007257142857142858, + "loss": 0.2316, + "step": 192 + }, + { + "epoch": 18.16, + "learning_rate": 0.007242857142857143, + "loss": 0.186, + "step": 193 + }, + { + "epoch": 18.26, + "learning_rate": 0.007228571428571429, + "loss": 0.2616, + "step": 194 + }, + { + "epoch": 18.35, + "learning_rate": 0.007214285714285715, + "loss": 0.2824, + "step": 195 + }, + { + "epoch": 18.45, + "learning_rate": 0.0072, + "loss": 0.2, + "step": 196 + }, + { + "epoch": 18.54, + "learning_rate": 0.007185714285714286, + "loss": 0.1978, + "step": 197 + }, + { + "epoch": 18.64, + "learning_rate": 0.007171428571428572, + "loss": 0.1897, + "step": 198 + }, + { + "epoch": 18.73, + "learning_rate": 0.007157142857142858, + "loss": 0.1958, + "step": 199 + }, + { + "epoch": 18.82, + "learning_rate": 0.0071428571428571435, + "loss": 0.203, + "step": 200 + }, + { + "epoch": 18.92, + "learning_rate": 0.0071285714285714286, + "loss": 0.2451, + "step": 201 + }, + { + "epoch": 19.01, + "learning_rate": 0.0071142857142857145, + "loss": 0.2045, + "step": 202 + }, + { + "epoch": 19.11, + "learning_rate": 0.0070999999999999995, + "loss": 0.1937, + "step": 203 + }, + { + "epoch": 19.2, + "learning_rate": 0.0070857142857142855, + "loss": 0.1814, + "step": 204 + }, + { + "epoch": 19.29, + "learning_rate": 0.007071428571428572, + "loss": 0.1869, + "step": 205 + }, + { + "epoch": 19.39, + "learning_rate": 0.007057142857142857, + "loss": 0.2089, + "step": 206 + }, + { + "epoch": 19.48, + "learning_rate": 0.007042857142857143, + "loss": 0.1924, + "step": 207 + }, + { + "epoch": 19.58, + "learning_rate": 0.007028571428571428, + "loss": 0.1512, + "step": 208 + }, + { + "epoch": 19.67, + "learning_rate": 0.007014285714285714, + "loss": 0.1375, + "step": 209 + }, + { + "epoch": 19.76, + "learning_rate": 0.006999999999999999, + "loss": 0.187, + "step": 210 + }, + { + "epoch": 19.86, + "learning_rate": 0.006985714285714286, + "loss": 0.2488, + "step": 211 + }, + { + "epoch": 19.95, + "learning_rate": 0.006971428571428572, + "loss": 0.1864, + "step": 212 + }, + { + "epoch": 20.05, + "learning_rate": 0.006957142857142857, + "loss": 0.1984, + "step": 213 + }, + { + "epoch": 20.14, + "learning_rate": 0.006942857142857143, + "loss": 0.156, + "step": 214 + }, + { + "epoch": 20.24, + "learning_rate": 0.006928571428571429, + "loss": 0.2082, + "step": 215 + }, + { + "epoch": 20.33, + "learning_rate": 0.006914285714285714, + "loss": 0.094, + "step": 216 + }, + { + "epoch": 20.42, + "learning_rate": 0.0069, + "loss": 0.1784, + "step": 217 + }, + { + "epoch": 20.52, + "learning_rate": 0.006885714285714287, + "loss": 0.1293, + "step": 218 + }, + { + "epoch": 20.61, + "learning_rate": 0.006871428571428572, + "loss": 0.1635, + "step": 219 + }, + { + "epoch": 20.71, + "learning_rate": 0.006857142857142858, + "loss": 0.1668, + "step": 220 + }, + { + "epoch": 
20.8, + "learning_rate": 0.006842857142857143, + "loss": 0.1946, + "step": 221 + }, + { + "epoch": 20.89, + "learning_rate": 0.006828571428571429, + "loss": 0.2347, + "step": 222 + }, + { + "epoch": 20.99, + "learning_rate": 0.006814285714285714, + "loss": 0.1523, + "step": 223 + }, + { + "epoch": 21.08, + "learning_rate": 0.0068000000000000005, + "loss": 0.1337, + "step": 224 + }, + { + "epoch": 21.18, + "learning_rate": 0.006785714285714286, + "loss": 0.1511, + "step": 225 + }, + { + "epoch": 21.27, + "learning_rate": 0.0067714285714285715, + "loss": 0.1058, + "step": 226 + }, + { + "epoch": 21.36, + "learning_rate": 0.006757142857142857, + "loss": 0.172, + "step": 227 + }, + { + "epoch": 21.46, + "learning_rate": 0.0067428571428571425, + "loss": 0.1077, + "step": 228 + }, + { + "epoch": 21.55, + "learning_rate": 0.006728571428571428, + "loss": 0.1993, + "step": 229 + }, + { + "epoch": 21.65, + "learning_rate": 0.006714285714285714, + "loss": 0.1414, + "step": 230 + }, + { + "epoch": 21.74, + "learning_rate": 0.0067, + "loss": 0.126, + "step": 231 + }, + { + "epoch": 21.84, + "learning_rate": 0.006685714285714286, + "loss": 0.1528, + "step": 232 + }, + { + "epoch": 21.93, + "learning_rate": 0.006671428571428571, + "loss": 0.1316, + "step": 233 + }, + { + "epoch": 22.02, + "learning_rate": 0.006657142857142857, + "loss": 0.1565, + "step": 234 + }, + { + "epoch": 22.12, + "learning_rate": 0.006642857142857143, + "loss": 0.1088, + "step": 235 + }, + { + "epoch": 22.21, + "learning_rate": 0.006628571428571428, + "loss": 0.088, + "step": 236 + }, + { + "epoch": 22.31, + "learning_rate": 0.006614285714285715, + "loss": 0.1348, + "step": 237 + }, + { + "epoch": 22.4, + "learning_rate": 0.006600000000000001, + "loss": 0.1702, + "step": 238 + }, + { + "epoch": 22.49, + "learning_rate": 0.006585714285714286, + "loss": 0.132, + "step": 239 + }, + { + "epoch": 22.59, + "learning_rate": 0.006571428571428572, + "loss": 0.1115, + "step": 240 + }, + { + "epoch": 22.68, + "learning_rate": 0.006557142857142857, + "loss": 0.1173, + "step": 241 + }, + { + "epoch": 22.78, + "learning_rate": 0.006542857142857143, + "loss": 0.0967, + "step": 242 + }, + { + "epoch": 22.87, + "learning_rate": 0.006528571428571428, + "loss": 0.1484, + "step": 243 + }, + { + "epoch": 22.96, + "learning_rate": 0.006514285714285715, + "loss": 0.1566, + "step": 244 + }, + { + "epoch": 23.06, + "learning_rate": 0.006500000000000001, + "loss": 0.162, + "step": 245 + }, + { + "epoch": 23.15, + "learning_rate": 0.006485714285714286, + "loss": 0.1099, + "step": 246 + }, + { + "epoch": 23.25, + "learning_rate": 0.0064714285714285716, + "loss": 0.1087, + "step": 247 + }, + { + "epoch": 23.34, + "learning_rate": 0.006457142857142857, + "loss": 0.116, + "step": 248 + }, + { + "epoch": 23.44, + "learning_rate": 0.0064428571428571425, + "loss": 0.1096, + "step": 249 + }, + { + "epoch": 23.53, + "learning_rate": 0.006428571428571429, + "loss": 0.0972, + "step": 250 + }, + { + "epoch": 23.62, + "learning_rate": 0.006414285714285714, + "loss": 0.0889, + "step": 251 + }, + { + "epoch": 23.72, + "learning_rate": 0.0064, + "loss": 0.1199, + "step": 252 + }, + { + "epoch": 23.81, + "learning_rate": 0.006385714285714286, + "loss": 0.1337, + "step": 253 + }, + { + "epoch": 23.91, + "learning_rate": 0.006371428571428571, + "loss": 0.0977, + "step": 254 + }, + { + "epoch": 24.0, + "learning_rate": 0.006357142857142857, + "loss": 0.146, + "step": 255 + }, + { + "epoch": 24.09, + "learning_rate": 0.006342857142857142, + "loss": 0.1102, + "step": 256 + }, + 
{ + "epoch": 24.19, + "learning_rate": 0.006328571428571429, + "loss": 0.1025, + "step": 257 + }, + { + "epoch": 24.28, + "learning_rate": 0.006314285714285715, + "loss": 0.09, + "step": 258 + }, + { + "epoch": 24.38, + "learning_rate": 0.0063, + "loss": 0.1302, + "step": 259 + }, + { + "epoch": 24.47, + "learning_rate": 0.006285714285714286, + "loss": 0.0739, + "step": 260 + }, + { + "epoch": 24.56, + "learning_rate": 0.006271428571428571, + "loss": 0.1172, + "step": 261 + }, + { + "epoch": 24.66, + "learning_rate": 0.006257142857142857, + "loss": 0.1048, + "step": 262 + }, + { + "epoch": 24.75, + "learning_rate": 0.006242857142857144, + "loss": 0.0977, + "step": 263 + }, + { + "epoch": 24.85, + "learning_rate": 0.006228571428571429, + "loss": 0.1056, + "step": 264 + }, + { + "epoch": 24.94, + "learning_rate": 0.006214285714285715, + "loss": 0.1252, + "step": 265 + }, + { + "epoch": 25.04, + "learning_rate": 0.0062, + "loss": 0.1107, + "step": 266 + }, + { + "epoch": 25.13, + "learning_rate": 0.006185714285714286, + "loss": 0.0887, + "step": 267 + }, + { + "epoch": 25.22, + "learning_rate": 0.006171428571428571, + "loss": 0.0836, + "step": 268 + }, + { + "epoch": 25.32, + "learning_rate": 0.0061571428571428576, + "loss": 0.0957, + "step": 269 + }, + { + "epoch": 25.41, + "learning_rate": 0.0061428571428571435, + "loss": 0.1165, + "step": 270 + }, + { + "epoch": 25.51, + "learning_rate": 0.0061285714285714285, + "loss": 0.1135, + "step": 271 + }, + { + "epoch": 25.6, + "learning_rate": 0.0061142857142857145, + "loss": 0.0901, + "step": 272 + }, + { + "epoch": 25.69, + "learning_rate": 0.0061, + "loss": 0.0751, + "step": 273 + }, + { + "epoch": 25.79, + "learning_rate": 0.0060857142857142854, + "loss": 0.109, + "step": 274 + }, + { + "epoch": 25.88, + "learning_rate": 0.006071428571428571, + "loss": 0.102, + "step": 275 + }, + { + "epoch": 25.98, + "learning_rate": 0.006057142857142858, + "loss": 0.0916, + "step": 276 + }, + { + "epoch": 26.07, + "learning_rate": 0.006042857142857143, + "loss": 0.0821, + "step": 277 + }, + { + "epoch": 26.16, + "learning_rate": 0.006028571428571429, + "loss": 0.0797, + "step": 278 + }, + { + "epoch": 26.26, + "learning_rate": 0.006014285714285714, + "loss": 0.0804, + "step": 279 + }, + { + "epoch": 26.35, + "learning_rate": 0.006, + "loss": 0.0987, + "step": 280 + }, + { + "epoch": 26.45, + "learning_rate": 0.005985714285714285, + "loss": 0.1192, + "step": 281 + }, + { + "epoch": 26.54, + "learning_rate": 0.005971428571428572, + "loss": 0.0699, + "step": 282 + }, + { + "epoch": 26.64, + "learning_rate": 0.005957142857142858, + "loss": 0.0902, + "step": 283 + }, + { + "epoch": 26.73, + "learning_rate": 0.005942857142857143, + "loss": 0.0916, + "step": 284 + }, + { + "epoch": 26.82, + "learning_rate": 0.005928571428571429, + "loss": 0.0753, + "step": 285 + }, + { + "epoch": 26.92, + "learning_rate": 0.005914285714285714, + "loss": 0.0964, + "step": 286 + }, + { + "epoch": 27.01, + "learning_rate": 0.0059, + "loss": 0.1108, + "step": 287 + }, + { + "epoch": 27.11, + "learning_rate": 0.005885714285714286, + "loss": 0.1062, + "step": 288 + }, + { + "epoch": 27.2, + "learning_rate": 0.005871428571428572, + "loss": 0.0846, + "step": 289 + }, + { + "epoch": 27.29, + "learning_rate": 0.005857142857142858, + "loss": 0.0986, + "step": 290 + }, + { + "epoch": 27.39, + "learning_rate": 0.005842857142857143, + "loss": 0.0713, + "step": 291 + }, + { + "epoch": 27.48, + "learning_rate": 0.005828571428571429, + "loss": 0.0829, + "step": 292 + }, + { + "epoch": 27.58, + 
"learning_rate": 0.0058142857142857145, + "loss": 0.1026, + "step": 293 + }, + { + "epoch": 27.67, + "learning_rate": 0.0058, + "loss": 0.0785, + "step": 294 + }, + { + "epoch": 27.76, + "learning_rate": 0.005785714285714286, + "loss": 0.0729, + "step": 295 + }, + { + "epoch": 27.86, + "learning_rate": 0.005771428571428572, + "loss": 0.0738, + "step": 296 + }, + { + "epoch": 27.95, + "learning_rate": 0.005757142857142857, + "loss": 0.079, + "step": 297 + }, + { + "epoch": 28.05, + "learning_rate": 0.005742857142857143, + "loss": 0.0761, + "step": 298 + }, + { + "epoch": 28.14, + "learning_rate": 0.005728571428571428, + "loss": 0.0792, + "step": 299 + }, + { + "epoch": 28.24, + "learning_rate": 0.005714285714285714, + "loss": 0.0881, + "step": 300 + }, + { + "epoch": 28.33, + "learning_rate": 0.005699999999999999, + "loss": 0.1073, + "step": 301 + }, + { + "epoch": 28.42, + "learning_rate": 0.005685714285714286, + "loss": 0.0686, + "step": 302 + }, + { + "epoch": 28.52, + "learning_rate": 0.005671428571428572, + "loss": 0.0701, + "step": 303 + }, + { + "epoch": 28.61, + "learning_rate": 0.005657142857142857, + "loss": 0.1114, + "step": 304 + }, + { + "epoch": 28.71, + "learning_rate": 0.005642857142857143, + "loss": 0.0595, + "step": 305 + }, + { + "epoch": 28.8, + "learning_rate": 0.005628571428571428, + "loss": 0.086, + "step": 306 + }, + { + "epoch": 28.89, + "learning_rate": 0.005614285714285714, + "loss": 0.0877, + "step": 307 + }, + { + "epoch": 28.99, + "learning_rate": 0.005600000000000001, + "loss": 0.0582, + "step": 308 + }, + { + "epoch": 29.08, + "learning_rate": 0.005585714285714286, + "loss": 0.0645, + "step": 309 + }, + { + "epoch": 29.18, + "learning_rate": 0.005571428571428572, + "loss": 0.1025, + "step": 310 + }, + { + "epoch": 29.27, + "learning_rate": 0.005557142857142857, + "loss": 0.0612, + "step": 311 + }, + { + "epoch": 29.36, + "learning_rate": 0.005542857142857143, + "loss": 0.0706, + "step": 312 + }, + { + "epoch": 29.46, + "learning_rate": 0.005528571428571429, + "loss": 0.0636, + "step": 313 + }, + { + "epoch": 29.55, + "learning_rate": 0.005514285714285714, + "loss": 0.0721, + "step": 314 + }, + { + "epoch": 29.65, + "learning_rate": 0.0055000000000000005, + "loss": 0.1062, + "step": 315 + }, + { + "epoch": 29.74, + "learning_rate": 0.0054857142857142865, + "loss": 0.0739, + "step": 316 + }, + { + "epoch": 29.84, + "learning_rate": 0.0054714285714285715, + "loss": 0.0688, + "step": 317 + }, + { + "epoch": 29.93, + "learning_rate": 0.0054571428571428575, + "loss": 0.0715, + "step": 318 + }, + { + "epoch": 30.02, + "learning_rate": 0.0054428571428571425, + "loss": 0.0628, + "step": 319 + }, + { + "epoch": 30.12, + "learning_rate": 0.0054285714285714284, + "loss": 0.0831, + "step": 320 + }, + { + "epoch": 30.21, + "learning_rate": 0.005414285714285715, + "loss": 0.0833, + "step": 321 + }, + { + "epoch": 30.31, + "learning_rate": 0.0054, + "loss": 0.09, + "step": 322 + }, + { + "epoch": 30.4, + "learning_rate": 0.005385714285714286, + "loss": 0.0469, + "step": 323 + }, + { + "epoch": 30.49, + "learning_rate": 0.005371428571428571, + "loss": 0.0631, + "step": 324 + }, + { + "epoch": 30.59, + "learning_rate": 0.005357142857142857, + "loss": 0.0685, + "step": 325 + }, + { + "epoch": 30.68, + "learning_rate": 0.005342857142857142, + "loss": 0.0798, + "step": 326 + }, + { + "epoch": 30.78, + "learning_rate": 0.005328571428571428, + "loss": 0.0653, + "step": 327 + }, + { + "epoch": 30.87, + "learning_rate": 0.005314285714285715, + "loss": 0.0615, + "step": 328 + }, + { + 
"epoch": 30.96, + "learning_rate": 0.0053, + "loss": 0.0548, + "step": 329 + }, + { + "epoch": 31.06, + "learning_rate": 0.005285714285714286, + "loss": 0.0592, + "step": 330 + }, + { + "epoch": 31.15, + "learning_rate": 0.005271428571428572, + "loss": 0.0628, + "step": 331 + }, + { + "epoch": 31.25, + "learning_rate": 0.005257142857142857, + "loss": 0.0604, + "step": 332 + }, + { + "epoch": 31.34, + "learning_rate": 0.005242857142857143, + "loss": 0.0833, + "step": 333 + }, + { + "epoch": 31.44, + "learning_rate": 0.005228571428571429, + "loss": 0.0748, + "step": 334 + }, + { + "epoch": 31.53, + "learning_rate": 0.005214285714285715, + "loss": 0.0495, + "step": 335 + }, + { + "epoch": 31.62, + "learning_rate": 0.005200000000000001, + "loss": 0.0589, + "step": 336 + }, + { + "epoch": 31.72, + "learning_rate": 0.005185714285714286, + "loss": 0.0655, + "step": 337 + }, + { + "epoch": 31.81, + "learning_rate": 0.005171428571428572, + "loss": 0.0695, + "step": 338 + }, + { + "epoch": 31.91, + "learning_rate": 0.005157142857142857, + "loss": 0.0609, + "step": 339 + }, + { + "epoch": 32.0, + "learning_rate": 0.005142857142857143, + "loss": 0.0636, + "step": 340 + }, + { + "epoch": 32.09, + "learning_rate": 0.005128571428571429, + "loss": 0.0606, + "step": 341 + }, + { + "epoch": 32.19, + "learning_rate": 0.0051142857142857144, + "loss": 0.0739, + "step": 342 + }, + { + "epoch": 32.28, + "learning_rate": 0.0051, + "loss": 0.0535, + "step": 343 + }, + { + "epoch": 32.38, + "learning_rate": 0.005085714285714285, + "loss": 0.0598, + "step": 344 + }, + { + "epoch": 32.47, + "learning_rate": 0.005071428571428571, + "loss": 0.06, + "step": 345 + }, + { + "epoch": 32.56, + "learning_rate": 0.005057142857142856, + "loss": 0.0734, + "step": 346 + }, + { + "epoch": 32.66, + "learning_rate": 0.005042857142857143, + "loss": 0.078, + "step": 347 + }, + { + "epoch": 32.75, + "learning_rate": 0.005028571428571429, + "loss": 0.0618, + "step": 348 + }, + { + "epoch": 32.85, + "learning_rate": 0.005014285714285714, + "loss": 0.0655, + "step": 349 + }, + { + "epoch": 32.94, + "learning_rate": 0.005, + "loss": 0.0615, + "step": 350 + }, + { + "epoch": 33.04, + "learning_rate": 0.004985714285714286, + "loss": 0.0556, + "step": 351 + }, + { + "epoch": 33.13, + "learning_rate": 0.004971428571428572, + "loss": 0.0637, + "step": 352 + }, + { + "epoch": 33.22, + "learning_rate": 0.004957142857142857, + "loss": 0.0518, + "step": 353 + }, + { + "epoch": 33.32, + "learning_rate": 0.004942857142857143, + "loss": 0.0466, + "step": 354 + }, + { + "epoch": 33.41, + "learning_rate": 0.004928571428571429, + "loss": 0.0732, + "step": 355 + }, + { + "epoch": 33.51, + "learning_rate": 0.004914285714285715, + "loss": 0.0584, + "step": 356 + }, + { + "epoch": 33.6, + "learning_rate": 0.0049, + "loss": 0.0586, + "step": 357 + }, + { + "epoch": 33.69, + "learning_rate": 0.004885714285714286, + "loss": 0.0481, + "step": 358 + }, + { + "epoch": 33.79, + "learning_rate": 0.004871428571428572, + "loss": 0.0552, + "step": 359 + }, + { + "epoch": 33.88, + "learning_rate": 0.004857142857142858, + "loss": 0.0567, + "step": 360 + }, + { + "epoch": 33.98, + "learning_rate": 0.004842857142857143, + "loss": 0.0664, + "step": 361 + }, + { + "epoch": 34.07, + "learning_rate": 0.004828571428571429, + "loss": 0.0701, + "step": 362 + }, + { + "epoch": 34.16, + "learning_rate": 0.0048142857142857145, + "loss": 0.069, + "step": 363 + }, + { + "epoch": 34.26, + "learning_rate": 0.0048, + "loss": 0.066, + "step": 364 + }, + { + "epoch": 34.35, + 
"learning_rate": 0.004785714285714286, + "loss": 0.0546, + "step": 365 + }, + { + "epoch": 34.45, + "learning_rate": 0.004771428571428571, + "loss": 0.0616, + "step": 366 + }, + { + "epoch": 34.54, + "learning_rate": 0.004757142857142857, + "loss": 0.0374, + "step": 367 + }, + { + "epoch": 34.64, + "learning_rate": 0.004742857142857143, + "loss": 0.046, + "step": 368 + }, + { + "epoch": 34.73, + "learning_rate": 0.004728571428571428, + "loss": 0.0459, + "step": 369 + }, + { + "epoch": 34.82, + "learning_rate": 0.004714285714285714, + "loss": 0.0648, + "step": 370 + }, + { + "epoch": 34.92, + "learning_rate": 0.0047, + "loss": 0.0699, + "step": 371 + }, + { + "epoch": 35.01, + "learning_rate": 0.004685714285714286, + "loss": 0.0605, + "step": 372 + }, + { + "epoch": 35.11, + "learning_rate": 0.004671428571428571, + "loss": 0.0704, + "step": 373 + }, + { + "epoch": 35.2, + "learning_rate": 0.004657142857142857, + "loss": 0.0444, + "step": 374 + }, + { + "epoch": 35.29, + "learning_rate": 0.004642857142857143, + "loss": 0.062, + "step": 375 + }, + { + "epoch": 35.39, + "learning_rate": 0.004628571428571429, + "loss": 0.0464, + "step": 376 + }, + { + "epoch": 35.48, + "learning_rate": 0.004614285714285714, + "loss": 0.0548, + "step": 377 + }, + { + "epoch": 35.58, + "learning_rate": 0.0046, + "loss": 0.0555, + "step": 378 + }, + { + "epoch": 35.67, + "learning_rate": 0.004585714285714286, + "loss": 0.0654, + "step": 379 + }, + { + "epoch": 35.76, + "learning_rate": 0.004571428571428572, + "loss": 0.0592, + "step": 380 + }, + { + "epoch": 35.86, + "learning_rate": 0.004557142857142858, + "loss": 0.0521, + "step": 381 + }, + { + "epoch": 35.95, + "learning_rate": 0.004542857142857143, + "loss": 0.0633, + "step": 382 + }, + { + "epoch": 36.05, + "learning_rate": 0.004528571428571429, + "loss": 0.047, + "step": 383 + }, + { + "epoch": 36.14, + "learning_rate": 0.004514285714285714, + "loss": 0.0476, + "step": 384 + }, + { + "epoch": 36.24, + "learning_rate": 0.0045000000000000005, + "loss": 0.051, + "step": 385 + }, + { + "epoch": 36.33, + "learning_rate": 0.004485714285714286, + "loss": 0.064, + "step": 386 + }, + { + "epoch": 36.42, + "learning_rate": 0.0044714285714285715, + "loss": 0.0309, + "step": 387 + }, + { + "epoch": 36.52, + "learning_rate": 0.0044571428571428574, + "loss": 0.0632, + "step": 388 + }, + { + "epoch": 36.61, + "learning_rate": 0.004442857142857143, + "loss": 0.0583, + "step": 389 + }, + { + "epoch": 36.71, + "learning_rate": 0.004428571428571428, + "loss": 0.0524, + "step": 390 + }, + { + "epoch": 36.8, + "learning_rate": 0.004414285714285714, + "loss": 0.0574, + "step": 391 + }, + { + "epoch": 36.89, + "learning_rate": 0.0044, + "loss": 0.043, + "step": 392 + }, + { + "epoch": 36.99, + "learning_rate": 0.004385714285714285, + "loss": 0.0482, + "step": 393 + }, + { + "epoch": 37.08, + "learning_rate": 0.004371428571428572, + "loss": 0.0585, + "step": 394 + }, + { + "epoch": 37.18, + "learning_rate": 0.004357142857142857, + "loss": 0.0467, + "step": 395 + }, + { + "epoch": 37.27, + "learning_rate": 0.004342857142857143, + "loss": 0.0498, + "step": 396 + }, + { + "epoch": 37.36, + "learning_rate": 0.004328571428571429, + "loss": 0.0578, + "step": 397 + }, + { + "epoch": 37.46, + "learning_rate": 0.004314285714285714, + "loss": 0.0469, + "step": 398 + }, + { + "epoch": 37.55, + "learning_rate": 0.0043, + "loss": 0.0447, + "step": 399 + }, + { + "epoch": 37.65, + "learning_rate": 0.004285714285714286, + "loss": 0.0669, + "step": 400 + } + ], + "logging_steps": 1.0, + 
"max_steps": 700, + "num_train_epochs": 70, + "save_steps": 100, + "total_flos": 4.700697287196672e+17, + "trial_name": null, + "trial_params": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/training_args.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17f9bfbf1a7cdd9e0e808e0672d55ad9ad4efb5f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00669a32a6ddac0a3243bbc04d3f1f70ffc8f89f2626c1fdafa93ce68c311aa0 +size 4664 diff --git a/linghua_pt-20231202-155337-128-1e-2/train.log b/linghua_pt-20231202-155337-128-1e-2/train.log new file mode 100644 index 0000000000000000000000000000000000000000..f0083873ad0e076c5283417ad46ac56382c053a5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/train.log @@ -0,0 +1,2712 @@ +[2023-12-02 15:53:38,497] torch.distributed.run: [WARNING] master_addr is only used for static rdzv_backend and when rdzv_endpoint is not specified. +12/02/2023 15:53:40 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +12/02/2023 15:53:40 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_tqdm=False, +dispatch_batches=None, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +generation_config=None, +generation_max_length=None, +generation_num_beams=None, +gradient_accumulation_steps=16, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=0.01, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=output/linghua_pt-20231202-155337-128-1e-2/runs/Dec02_15-53-40_2e1f45f46fdb, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_type=linear, +max_grad_norm=1.0, +max_steps=700, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=3.0, +optim=adamw_torch, +optim_args=None, +output_dir=output/linghua_pt-20231202-155337-128-1e-2, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +predict_with_generate=False, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, 
+ray_scope=last, +remove_unused_columns=True, +report_to=[], +resume_from_checkpoint=None, +run_name=output/linghua_pt-20231202-155337-128-1e-2, +save_on_each_node=False, +save_safetensors=False, +save_steps=100, +save_strategy=steps, +save_total_limit=None, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +sortish_sampler=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=0, +weight_decay=0.0, +) +[INFO|configuration_utils.py:713] 2023-12-02 15:53:40,603 >> loading configuration file chatglm3-6b/config.json +[INFO|configuration_utils.py:713] 2023-12-02 15:53:40,607 >> loading configuration file chatglm3-6b/config.json +[INFO|configuration_utils.py:775] 2023-12-02 15:53:40,608 >> Model config ChatGLMConfig { + "_name_or_path": "chatglm3-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMModel" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": null, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 8192, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} + +[INFO|tokenization_utils_base.py:2041] 2023-12-02 15:53:40,612 >> loading file tokenizer.model +[INFO|tokenization_utils_base.py:2041] 2023-12-02 15:53:40,612 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2041] 2023-12-02 15:53:40,612 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2041] 2023-12-02 15:53:40,612 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2041] 2023-12-02 15:53:40,612 >> loading file tokenizer.json +[INFO|modeling_utils.py:2990] 2023-12-02 15:53:40,832 >> loading weights file chatglm3-6b/pytorch_model.bin.index.json +[INFO|configuration_utils.py:770] 2023-12-02 15:53:40,833 >> Generate config GenerationConfig { + "eos_token_id": 2, + "pad_token_id": 0 +} + + Loading checkpoint shards: 0%| | 0/7 [00:00> All model checkpoint weights were used when initializing ChatGLMForConditionalGeneration. 
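Note on the run below: only the prefix encoder is trained here, so the "newly initialized" warning that follows is expected for P-tuning v2, and the backbone stays frozen. A minimal sketch (assuming `prefix_projection=False`, and `pre_seq_len=128` taken from the "128" in the run name) of where the trainer's reported `Number of trainable parameters = 1,835,008` comes from:

```python
# Sketch: reproduce the "Number of trainable parameters = 1,835,008" figure
# reported by the trainer below. Assumes P-tuning v2 with prefix_projection=False,
# where the PrefixEncoder is a single embedding of shape
# (pre_seq_len, num_layers * kv_channels * multi_query_group_num * 2):
# one key and one value vector per layer, per KV group, per prefix position.
num_layers = 28              # from config.json
kv_channels = 128            # per-head dimension, from config.json
multi_query_group_num = 2    # KV groups under multi-query attention, from config.json
pre_seq_len = 128            # prefix length for this run (assumed from the run name)

kv_dim = num_layers * kv_channels * multi_query_group_num * 2   # 14336
print(pre_seq_len * kv_dim)                                     # 1835008
```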
+ +[WARNING|modeling_utils.py:3777] 2023-12-02 15:53:50,295 >> Some weights of ChatGLMForConditionalGeneration were not initialized from the model checkpoint at chatglm3-6b and are newly initialized: ['transformer.prefix_encoder.embedding.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +[INFO|modeling_utils.py:3352] 2023-12-02 15:53:50,297 >> Generation config file not found, using a generation config created from the model config. +Sanity Check >>>>>>>>>>>>> + '[gMASK]': 64790 -> -100 + 'sop': 64792 -> -100 + '<|system|>': 64794 -> -100 + '': 30910 -> -100 + '\n': 13 -> -100 + '': 30910 -> -100 + '你的': 31822 -> -100 + '名字': 32873 -> -100 + '是': 54532 -> -100 + '神': 54826 -> -100 + '里': 54662 -> -100 + '绫': 60309 -> -100 + '华': 54855 -> -100 + ',': 31123 -> -100 + '你是': 34607 -> -100 + '稻': 56929 -> -100 + '妻': 55769 -> -100 + '「': 31519 -> -100 + '社': 54731 -> -100 + '奉': 56053 -> -100 + '行': 54560 -> -100 + '」': 31522 -> -100 + '神': 54826 -> -100 + '里': 54662 -> -100 + '家': 54561 -> -100 + '的大': 31922 -> -100 + '小姐': 36028 -> -100 + '。': 31155 -> -100 + '请': 55073 -> -100 + '详细的': 42196 -> -100 + '回答': 33287 -> -100 + '用户': 32053 -> -100 + '的一切': 34688 -> -100 + '问题': 31639 -> -100 + '。': 31155 -> -100 + '<|user|>': 64795 -> -100 + '': 30910 -> -100 + '\n': 13 -> -100 + '你': 36474 -> -100 + '好': 54591 -> -100 + '呀': 56657 -> -100 + '!': 31404 -> -100 + '<|assistant|>': 64796 -> -100 + '': 30910 -> 30910 + '\n': 13 -> 13 + '你': 36474 -> 36474 + '好': 54591 -> 54591 + '呀': 56657 -> 56657 + ',': 31123 -> 31123 + '旅行': 33450 -> 33450 + '者': 54631 -> 54631 + '!': 31404 -> 31404 + '我是': 33030 -> 33030 + '神': 54826 -> 54826 + '里': 54662 -> 54662 + '绫': 60309 -> 60309 + '华': 54855 -> 54855 + ',': 31123 -> 31123 + '很高兴': 48895 -> 48895 + '认识': 32254 -> 32254 + '你': 54622 -> 54622 + '!': 31404 -> 31404 + '': 2 -> 2 + '': 0 -> -100 + '': 0 -> -100 + '': 0 -> -100 + [the pad-token mapping '': 0 -> -100 repeats identically for every remaining padding position] +<<<<<<<<<<<<< Sanity Check
+[INFO|trainer.py:576] 2023-12-02 15:53:52,690 >> max_steps is given, it will override any value given in num_train_epochs +[INFO|trainer.py:1760] 2023-12-02 15:53:53,665 >> ***** Running training ***** +[INFO|trainer.py:1761] 2023-12-02 15:53:53,665 >> Num examples = 170 +[INFO|trainer.py:1762] 2023-12-02 15:53:53,665 >> Num Epochs = 70 +[INFO|trainer.py:1763] 2023-12-02 15:53:53,666 >> Instantaneous batch size per device = 1 +[INFO|trainer.py:1766] 2023-12-02 15:53:53,666 >> Total train batch size (w. parallel, distributed & accumulation) = 16 +[INFO|trainer.py:1767] 2023-12-02 15:53:53,666 >> Gradient Accumulation steps = 16 +[INFO|trainer.py:1768] 2023-12-02 15:53:53,666 >> Total optimization steps = 700 +[INFO|trainer.py:1769] 2023-12-02 15:53:53,666 >> Number of trainable parameters = 1,835,008 + 0%| | 0/700 [00:00> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/config.json +[INFO|configuration_utils.py:544] 2023-12-02 16:14:12,099 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/generation_config.json +[INFO|modeling_utils.py:2118] 2023-12-02 16:14:12,111 >> Model weights saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/pytorch_model.bin +[INFO|tokenization_utils_base.py:2437] 2023-12-02 16:14:12,112 >> tokenizer config file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-12-02 16:14:12,112 >> Special tokens file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/special_tokens_map.json +/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly.
The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. + warnings.warn( + 14%|█████████████▊ | 101/700 [20:30<2:01:40, 12.19s/it] {'loss': 0.7104, 'learning_rate': 0.008557142857142859, 'epoch': 9.51} + 14%|█████████████▊ | 101/700 [20:30<2:01:40, 12.19s/it] 15%|█████████████▉ | 102/700 [20:42<2:01:26, 12.18s/it] {'loss': 0.9672, 'learning_rate': 0.008542857142857144, 'epoch': 9.6} + 15%|█████████████▉ | 102/700 [20:42<2:01:26, 12.18s/it] 15%|██████████████▏ | 103/700 [20:54<2:01:10, 12.18s/it] {'loss': 0.7593, 'learning_rate': 0.008528571428571429, 'epoch': 9.69} + 15%|██████████████▏ | 103/700 [20:54<2:01:10, 12.18s/it] 15%|██████████████▎ | 104/700 [21:07<2:00:59, 12.18s/it] {'loss': 1.0186, 'learning_rate': 0.008514285714285714, 'epoch': 9.79} + 15%|██████████████▎ | 104/700 [21:07<2:00:59, 12.18s/it] 15%|██████████████▍ | 105/700 [21:19<2:00:45, 12.18s/it] {'loss': 0.7898, 'learning_rate': 0.0085, 'epoch': 9.88} + 15%|██████████████▍ | 105/700 [21:19<2:00:45, 12.18s/it] 15%|██████████████▌ | 106/700 [21:31<2:00:33, 12.18s/it] {'loss': 0.7392, 'learning_rate': 0.008485714285714286, 'epoch': 9.98} + 15%|██████████████▌ | 106/700 [21:31<2:00:33, 12.18s/it] 15%|██████████████▋ | 107/700 [21:43<2:00:20, 12.18s/it] {'loss': 0.7295, 'learning_rate': 0.008471428571428572, 'epoch': 10.07} + 15%|██████████████▋ | 107/700 [21:43<2:00:20, 12.18s/it] 15%|██████████████▊ | 108/700 [21:55<2:00:08, 12.18s/it] {'loss': 0.7211, 'learning_rate': 0.008457142857142858, 'epoch': 10.16} + 15%|██████████████▊ | 108/700 [21:55<2:00:08, 12.18s/it] 16%|██████████████▉ | 109/700 [22:07<1:59:54, 12.17s/it] {'loss': 0.769, 'learning_rate': 0.008442857142857143, 'epoch': 10.26} + 16%|██████████████▉ | 109/700 [22:07<1:59:54, 12.17s/it] 16%|███████████████ | 110/700 [22:20<1:59:43, 12.18s/it] {'loss': 0.718, 'learning_rate': 0.00842857142857143, 'epoch': 10.35} + 16%|███████████████ | 110/700 [22:20<1:59:43, 12.18s/it] 16%|███████████████▏ | 111/700 [22:32<1:59:29, 12.17s/it] {'loss': 0.6411, 'learning_rate': 0.008414285714285714, 'epoch': 10.45} + 16%|███████████████▏ | 111/700 [22:32<1:59:29, 12.17s/it] 16%|███████████████▎ | 112/700 [22:44<1:59:16, 12.17s/it] {'loss': 0.8016, 'learning_rate': 0.0084, 'epoch': 10.54} + 16%|███████████████▎ | 112/700 [22:44<1:59:16, 12.17s/it] 16%|███████████████▍ | 113/700 [22:56<1:59:03, 12.17s/it] {'loss': 0.6633, 'learning_rate': 0.008385714285714286, 'epoch': 10.64} + 16%|███████████████▍ | 113/700 [22:56<1:59:03, 12.17s/it] 16%|███████████████▋ | 114/700 [23:08<1:58:50, 12.17s/it] {'loss': 0.7257, 'learning_rate': 0.008371428571428571, 'epoch': 10.73} + 16%|███████████████▋ | 114/700 [23:08<1:58:50, 12.17s/it] 16%|███████████████▊ | 115/700 [23:21<1:58:38, 12.17s/it] {'loss': 0.7785, 'learning_rate': 0.008357142857142858, 'epoch': 10.82} + 16%|███████████████▊ | 115/700 [23:21<1:58:38, 12.17s/it] 17%|███████████████▉ | 116/700 [23:33<1:58:27, 12.17s/it] {'loss': 0.8927, 'learning_rate': 0.008342857142857143, 'epoch': 10.92} + 17%|███████████████▉ | 116/700 [23:33<1:58:27, 12.17s/it] 17%|████████████████ | 117/700 [23:45<1:58:19, 12.18s/it] {'loss': 0.7242, 'learning_rate': 0.008328571428571428, 'epoch': 11.01} + 17%|████████████████ | 117/700 [23:45<1:58:19, 12.18s/it] 17%|████████████████▏ | 118/700 [23:57<1:58:07, 12.18s/it] {'loss': 0.8297, 'learning_rate': 
0.008314285714285715, 'epoch': 11.11} + 17%|████████████████▏ | 118/700 [23:57<1:58:07, 12.18s/it] 17%|████████████████▎ | 119/700 [24:09<1:57:56, 12.18s/it] {'loss': 0.6761, 'learning_rate': 0.0083, 'epoch': 11.2} + 17%|████████████████▎ | 119/700 [24:09<1:57:56, 12.18s/it] 17%|████████████████▍ | 120/700 [24:21<1:57:41, 12.18s/it] {'loss': 0.6699, 'learning_rate': 0.008285714285714287, 'epoch': 11.29} + 17%|████████████████▍ | 120/700 [24:21<1:57:41, 12.18s/it] 17%|████████████████▌ | 121/700 [24:34<1:57:28, 12.17s/it] {'loss': 0.5365, 'learning_rate': 0.008271428571428572, 'epoch': 11.39} + 17%|████████████████▌ | 121/700 [24:34<1:57:28, 12.17s/it] 17%|████████████████▋ | 122/700 [24:46<1:57:15, 12.17s/it] {'loss': 0.9045, 'learning_rate': 0.008257142857142857, 'epoch': 11.48} + 17%|████████████████▋ | 122/700 [24:46<1:57:15, 12.17s/it] 18%|████████████████▊ | 123/700 [24:58<1:57:04, 12.17s/it] {'loss': 0.5071, 'learning_rate': 0.008242857142857144, 'epoch': 11.58} + 18%|████████████████▊ | 123/700 [24:58<1:57:04, 12.17s/it] 18%|█████████████████ | 124/700 [25:10<1:56:51, 12.17s/it] {'loss': 0.6472, 'learning_rate': 0.008228571428571429, 'epoch': 11.67} + 18%|█████████████████ | 124/700 [25:10<1:56:51, 12.17s/it] 18%|█████████████████▏ | 125/700 [25:22<1:56:38, 12.17s/it] {'loss': 0.6232, 'learning_rate': 0.008214285714285714, 'epoch': 11.76} + 18%|█████████████████▏ | 125/700 [25:22<1:56:38, 12.17s/it] 18%|█████████████████▎ | 126/700 [25:34<1:56:25, 12.17s/it] {'loss': 0.4905, 'learning_rate': 0.008199999999999999, 'epoch': 11.86} + 18%|█████████████████▎ | 126/700 [25:34<1:56:25, 12.17s/it] 18%|█████████████████▍ | 127/700 [25:47<1:56:12, 12.17s/it] {'loss': 0.557, 'learning_rate': 0.008185714285714286, 'epoch': 11.95} + 18%|█████████████████▍ | 127/700 [25:47<1:56:12, 12.17s/it] 18%|█████████████████▌ | 128/700 [25:59<1:56:00, 12.17s/it] {'loss': 0.5517, 'learning_rate': 0.008171428571428573, 'epoch': 12.05} + 18%|█████████████████▌ | 128/700 [25:59<1:56:00, 12.17s/it] 18%|█████████████████▋ | 129/700 [26:11<1:55:50, 12.17s/it] {'loss': 0.6321, 'learning_rate': 0.008157142857142858, 'epoch': 12.14} + 18%|█████████████████▋ | 129/700 [26:11<1:55:50, 12.17s/it] 19%|█████████████████▊ | 130/700 [26:23<1:55:37, 12.17s/it] {'loss': 0.6619, 'learning_rate': 0.008142857142857143, 'epoch': 12.24} + 19%|█████████████████▊ | 130/700 [26:23<1:55:37, 12.17s/it] 19%|█████████████████▉ | 131/700 [26:35<1:55:27, 12.17s/it] {'loss': 0.5524, 'learning_rate': 0.008128571428571428, 'epoch': 12.33} + 19%|█████████████████▉ | 131/700 [26:35<1:55:27, 12.17s/it] 19%|██████████████████ | 132/700 [26:47<1:55:14, 12.17s/it] {'loss': 0.4688, 'learning_rate': 0.008114285714285715, 'epoch': 12.42} + 19%|██████████████████ | 132/700 [26:47<1:55:14, 12.17s/it] 19%|██████████████████▏ | 133/700 [27:00<1:55:01, 12.17s/it] {'loss': 0.3717, 'learning_rate': 0.008100000000000001, 'epoch': 12.52} + 19%|██████████████████▏ | 133/700 [27:00<1:55:01, 12.17s/it] 19%|██████████████████▍ | 134/700 [27:12<1:54:48, 12.17s/it] {'loss': 0.5118, 'learning_rate': 0.008085714285714286, 'epoch': 12.61} + 19%|██████████████████▍ | 134/700 [27:12<1:54:48, 12.17s/it] 19%|██████████████████▌ | 135/700 [27:24<1:54:38, 12.17s/it] {'loss': 0.4521, 'learning_rate': 0.008071428571428571, 'epoch': 12.71} + 19%|██████████████████▌ | 135/700 [27:24<1:54:38, 12.17s/it] 19%|██████████████████▋ | 136/700 [27:36<1:54:25, 12.17s/it] {'loss': 0.5865, 'learning_rate': 0.008057142857142856, 'epoch': 12.8} + 19%|██████████████████▋ | 136/700 
+[tqdm progress-bar repaints condensed below to one metrics row per optimizer step; iteration time held steady at 12.16-12.19 s/it throughout, elapsed 27:48 at step 137 -> 1:26:24 at step 426]
+ step/700  loss    learning_rate          epoch
+ 137       0.5977  0.008042857142857143   12.89
+ 138       0.6977  0.008028571428571428   12.99
+ 139       0.5625  0.008014285714285713   13.08
+ 140       0.3611  0.008                  13.18
+ 141       0.5168  0.007985714285714287   13.27
+ 142       0.4429  0.007971428571428572   13.36
+ 143       0.4998  0.007957142857142857   13.46
+ 144       0.4437  0.007942857142857142   13.55
+ 145       0.4958  0.007928571428571429   13.65
+ 146       0.4021  0.007914285714285716   13.74
+ 147       0.6163  0.0079                 13.84
+ 148       0.406   0.007885714285714286   13.93
+ 149       0.4905  0.007871428571428571   14.02
+ 150       0.3824  0.007857142857142858   14.12
+ 151       0.3591  0.007842857142857143   14.21
+ 152       0.342   0.007828571428571428   14.31
+ 153       0.4565  0.007814285714285715   14.4
+ 154       0.3287  0.0078000000000000005  14.49
+ 155       0.4179  0.007785714285714286   14.59
+ 156       0.3586  0.0077714285714285715  14.68
+ 157       0.4618  0.007757142857142857   14.78
+ 158       0.4133  0.0077428571428571425  14.87
+ 159       0.4326  0.007728571428571429   14.96
+ 160       0.3838  0.007714285714285715   15.06
+ 161       0.2978  0.0077                 15.15
+ 162       0.3993  0.007685714285714286   15.25
+ 163       0.3249  0.007671428571428571   15.34
+ 164       0.2796  0.007657142857142857   15.44
+ 165       0.3918  0.007642857142857142   15.53
+ 166       0.4122  0.007628571428571429   15.62
+ 167       0.3403  0.007614285714285715   15.72
+ 168       0.3759  0.0076                 15.81
+ 169       0.3621  0.007585714285714286   15.91
+ 170       0.2991  0.007571428571428571   16.0
+ 171       0.3039  0.007557142857142857   16.09
+ 172       0.4571  0.007542857142857144   16.19
+ 173       0.2759  0.007528571428571429   16.28
+ 174       0.2835  0.007514285714285715   16.38
+ 175       0.3221  0.0075                 16.47
+ 176       0.3072  0.007485714285714286   16.56
+ 177       0.2852  0.007471428571428572   16.66
+ 178       0.2559  0.007457142857142857   16.75
+ 179       0.2787  0.007442857142857143   16.85
+ 180       0.3331  0.007428571428571429   16.94
+ 181       0.1929  0.007414285714285714   17.04
+ 182       0.2065  0.0074                 17.13
+ 183       0.2868  0.007385714285714285   17.22
+ 184       0.2206  0.007371428571428571   17.32
+ 185       0.2355  0.007357142857142858   17.41
+ 186       0.3041  0.007342857142857143   17.51
+ 187       0.3028  0.007328571428571429   17.6
+ 188       0.2435  0.007314285714285714   17.69
+ 189       0.1869  0.0073                 17.79
+ 190       0.3036  0.007285714285714285   17.88
+ 191       0.246   0.007271428571428571   17.98
+ 192       0.2316  0.007257142857142858   18.07
+ 193       0.186   0.007242857142857143   18.16
+ 194       0.2616  0.007228571428571429   18.26
+ 195       0.2824  0.007214285714285715   18.35
+ 196       0.2     0.0072                 18.45
+ 197       0.1978  0.007185714285714286   18.54
+ 198       0.1897  0.007171428571428572   18.64
+ 199       0.1958  0.007157142857142858   18.73
+ 200       0.203   0.0071428571428571435  18.82
+Saving PrefixEncoder
+[INFO|configuration_utils.py:460] 2023-12-02 16:34:29,439 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/config.json
+[INFO|configuration_utils.py:544] 2023-12-02 16:34:29,439 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/generation_config.json
+[INFO|modeling_utils.py:2118] 2023-12-02 16:34:29,451 >> Model weights saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/pytorch_model.bin
+[INFO|tokenization_utils_base.py:2437] 2023-12-02 16:34:29,452 >> tokenizer config file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-12-02 16:34:29,452 >> Special tokens file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/special_tokens_map.json
+/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
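The UserWarning above comes from `torch.utils.checkpoint` being invoked without an explicit `use_reentrant` argument while gradient checkpointing is active. A minimal sketch of the fix, assuming a training script that calls the checkpoint API directly (the module and tensors below are illustrative, not taken from this repo):

```python
import torch
import torch.utils.checkpoint

# Illustrative module and input; any nn.Module checkpointed this way behaves the same.
layer = torch.nn.Linear(8, 8)
x = torch.randn(2, 8, requires_grad=True)

# Passing use_reentrant explicitly is what silences the warning seen in the log;
# use_reentrant=False is the variant PyTorch recommends going forward.
y = torch.utils.checkpoint.checkpoint(layer, x, use_reentrant=False)
y.sum().backward()
```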
+ step/700  loss    learning_rate          epoch
+ 201       0.2451  0.0071285714285714286  18.92
+ 202       0.2045  0.0071142857142857145  19.01
+ 203       0.1937  0.0070999999999999995  19.11
+ 204       0.1814  0.0070857142857142855  19.2
+ 205       0.1869  0.007071428571428572   19.29
+ 206       0.2089  0.007057142857142857   19.39
+ 207       0.1924  0.007042857142857143   19.48
+ 208       0.1512  0.007028571428571428   19.58
+ 209       0.1375  0.007014285714285714   19.67
+ 210       0.187   0.006999999999999999   19.76
+ 211       0.2488  0.006985714285714286   19.86
+ 212       0.1864  0.006971428571428572   19.95
+ 213       0.1984  0.006957142857142857   20.05
+ 214       0.156   0.006942857142857143   20.14
+ 215       0.2082  0.006928571428571429   20.24
+ 216       0.094   0.006914285714285714   20.33
+ 217       0.1784  0.0069                 20.42
+ 218       0.1293  0.006885714285714287   20.52
+ 219       0.1635  0.006871428571428572   20.61
+ 220       0.1668  0.006857142857142858   20.71
+ 221       0.1946  0.006842857142857143   20.8
+ 222       0.2347  0.006828571428571429   20.89
+ 223       0.1523  0.006814285714285714   20.99
+ 224       0.1337  0.0068000000000000005  21.08
+ 225       0.1511  0.006785714285714286   21.18
+ 226       0.1058  0.0067714285714285715  21.27
+ 227       0.172   0.006757142857142857   21.36
+ 228       0.1077  0.0067428571428571425  21.46
+ 229       0.1993  0.006728571428571428   21.55
+ 230       0.1414  0.006714285714285714   21.65
+ 231       0.126   0.0067                 21.74
+ 232       0.1528  0.006685714285714286   21.84
+ 233       0.1316  0.006671428571428571   21.93
+ 234       0.1565  0.006657142857142857   22.02
+ 235       0.1088  0.006642857142857143   22.12
+ 236       0.088   0.006628571428571428   22.21
+ 237       0.1348  0.006614285714285715   22.31
+ 238       0.1702  0.006600000000000001   22.4
+ 239       0.132   0.006585714285714286   22.49
+ 240       0.1115  0.006571428571428572   22.59
+ 241       0.1173  0.006557142857142857   22.68
+ 242       0.0967  0.006542857142857143   22.78
+ 243       0.1484  0.006528571428571428   22.87
+ 244       0.1566  0.006514285714285715   22.96
+ 245       0.162   0.006500000000000001   23.06
+ 246       0.1099  0.006485714285714286   23.15
+ 247       0.1087  0.0064714285714285716  23.25
+ 248       0.116   0.006457142857142857   23.34
+ 249       0.1096  0.0064428571428571425  23.44
+ 250       0.0972  0.006428571428571429   23.53
+ 251       0.0889  0.006414285714285714   23.62
+ 252       0.1199  0.0064                 23.72
+ 253       0.1337  0.006385714285714286   23.81
+ 254       0.0977  0.006371428571428571   23.91
+ 255       0.146   0.006357142857142857   24.0
+ 256       0.1102  0.006342857142857142   24.09
+ 257       0.1025  0.006328571428571429   24.19
+ 258       0.09    0.006314285714285715   24.28
+ 259       0.1302  0.0063                 24.38
+ 260       0.0739  0.006285714285714286   24.47
+ 261       0.1172  0.006271428571428571   24.56
+ 262       0.1048  0.006257142857142857   24.66
+ 263       0.0977  0.006242857142857144   24.75
+ 264       0.1056  0.006228571428571429   24.85
+ 265       0.1252  0.006214285714285715   24.94
+ 266       0.1107  0.0062                 25.04
+ 267       0.0887  0.006185714285714286   25.13
+ 268       0.0836  0.006171428571428571   25.22
+ 269       0.0957  0.0061571428571428576  25.32
+ 270       0.1165  0.0061428571428571435  25.41
+ 271       0.1135  0.0061285714285714285  25.51
+ 272       0.0901  0.0061142857142857145  25.6
+ 273       0.0751  0.0061                 25.69
+ 274       0.109   0.0060857142857142854  25.79
+ 275       0.102   0.006071428571428571   25.88
+ 276       0.0916  0.006057142857142858   25.98
+ 277       0.0821  0.006042857142857143   26.07
+ 278       0.0797  0.006028571428571429   26.16
+ 279       0.0804  0.006014285714285714   26.26
+ 280       0.0987  0.006                  26.35
+ 281       0.1192  0.005985714285714285   26.45
+ 282       0.0699  0.005971428571428572   26.54
+ 283       0.0902  0.005957142857142858   26.64
+ 284       0.0916  0.005942857142857143   26.73
+ 285       0.0753  0.005928571428571429   26.82
+ 286       0.0964  0.005914285714285714   26.92
+ 287       0.1108  0.0059                 27.01
+ 288       0.1062  0.005885714285714286   27.11
+ 289       0.0846  0.005871428571428572   27.2
+ 290       0.0986  0.005857142857142858   27.29
+ 291       0.0713  0.005842857142857143   27.39
+ 292       0.0829  0.005828571428571429   27.48
+ 293       0.1026  0.0058142857142857145  27.58
+ 294       0.0785  0.0058                 27.67
+ 295       0.0729  0.005785714285714286   27.76
+ 296       0.0738  0.005771428571428572   27.86
+ 297       0.079   0.005757142857142857   27.95
+ 298       0.0761  0.005742857142857143   28.05
+ 299       0.0792  0.005728571428571428   28.14
+ 300       0.0881  0.005714285714285714   28.24
+Saving PrefixEncoder
+[INFO|configuration_utils.py:460] 2023-12-02 16:54:45,783 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/config.json
+[INFO|configuration_utils.py:544] 2023-12-02 16:54:45,783 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/generation_config.json
+[INFO|modeling_utils.py:2118] 2023-12-02 16:54:45,791 >> Model weights saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/pytorch_model.bin
+[INFO|tokenization_utils_base.py:2437] 2023-12-02 16:54:45,792 >> tokenizer config file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-12-02 16:54:45,792 >> Special tokens file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/special_tokens_map.json
+/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
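Each "Saving PrefixEncoder" block appears to write only the P-tuning prefix weights plus config/tokenizer files, which is why the checkpoints stay small even though the base model has 6B parameters. A sketch of how such a checkpoint is typically loaded back, following the loading pattern from the upstream ChatGLM P-tuning examples (pre_seq_len=128 matches this run's config; the base-model name and paths are taken from the log, the rest is an assumption):

```python
import os
import torch
from transformers import AutoConfig, AutoModel

CHECKPOINT_PATH = "output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300"

# Recreate the model with the same prefix length used during training.
config = AutoConfig.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, pre_seq_len=128)
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", config=config, trust_remote_code=True)

# pytorch_model.bin in the checkpoint holds the PrefixEncoder weights;
# extract them and load them into the matching submodule.
prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"), map_location="cpu")
prefix_only = {
    k[len("transformer.prefix_encoder."):]: v
    for k, v in prefix_state_dict.items()
    if k.startswith("transformer.prefix_encoder.")
}
model.transformer.prefix_encoder.load_state_dict(prefix_only)
```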
+ step/700  loss    learning_rate          epoch
+ 301       0.1073  0.005699999999999999   28.33
+ 302       0.0686  0.005685714285714286   28.42
+ 303       0.0701  0.005671428571428572   28.52
+ 304       0.1114  0.005657142857142857   28.61
+ 305       0.0595  0.005642857142857143   28.71
+ 306       0.086   0.005628571428571428   28.8
+ 307       0.0877  0.005614285714285714   28.89
+ 308       0.0582  0.005600000000000001   28.99
+ 309       0.0645  0.005585714285714286   29.08
+ 310       0.1025  0.005571428571428572   29.18
+ 311       0.0612  0.005557142857142857   29.27
+ 312       0.0706  0.005542857142857143   29.36
+ 313       0.0636  0.005528571428571429   29.46
+ 314       0.0721  0.005514285714285714   29.55
+ 315       0.1062  0.0055000000000000005  29.65
+ 316       0.0739  0.0054857142857142865  29.74
+ 317       0.0688  0.0054714285714285715  29.84
+ 318       0.0715  0.0054571428571428575  29.93
+ 319       0.0628  0.0054428571428571425  30.02
+ 320       0.0831  0.0054285714285714284  30.12
+ 321       0.0833  0.005414285714285715   30.21
+ 322       0.09    0.0054                 30.31
+ 323       0.0469  0.005385714285714286   30.4
+ 324       0.0631  0.005371428571428571   30.49
+ 325       0.0685  0.005357142857142857   30.59
+ 326       0.0798  0.005342857142857142   30.68
+ 327       0.0653  0.005328571428571428   30.78
+ 328       0.0615  0.005314285714285715   30.87
+ 329       0.0548  0.0053                 30.96
+ 330       0.0592  0.005285714285714286   31.06
+ 331       0.0628  0.005271428571428572   31.15
+ 332       0.0604  0.005257142857142857   31.25
+ 333       0.0833  0.005242857142857143   31.34
+ 334       0.0748  0.005228571428571429   31.44
+ 335       0.0495  0.005214285714285715   31.53
+ 336       0.0589  0.005200000000000001   31.62
+ 337       0.0655  0.005185714285714286   31.72
+ 338       0.0695  0.005171428571428572   31.81
+ 339       0.0609  0.005157142857142857   31.91
+ 340       0.0636  0.005142857142857143   32.0
+ 341       0.0606  0.005128571428571429   32.09
+ 342       0.0739  0.0051142857142857144  32.19
+ 343       0.0535  0.0051                 32.28
+ 344       0.0598  0.005085714285714285   32.38
+ 345       0.06    0.005071428571428571   32.47
+ 346       0.0734  0.005057142857142856   32.56
+ 347       0.078   0.005042857142857143   32.66
+ 348       0.0618  0.005028571428571429   32.75
+ 349       0.0655  0.005014285714285714   32.85
+ 350       0.0615  0.005                  32.94
+ 351       0.0556  0.004985714285714286   33.04
+ 352       0.0637  0.004971428571428572   33.13
+ 353       0.0518  0.004957142857142857   33.22
+ 354       0.0466  0.004942857142857143   33.32
+ 355       0.0732  0.004928571428571429   33.41
+ 356       0.0584  0.004914285714285715   33.51
+ 357       0.0586  0.0049                 33.6
+ 358       0.0481  0.004885714285714286   33.69
+ 359       0.0552  0.004871428571428572   33.79
+ 360       0.0567  0.004857142857142858   33.88
+ 361       0.0664  0.004842857142857143   33.98
+ 362       0.0701  0.004828571428571429   34.07
+ 363       0.069   0.0048142857142857145  34.16
+ 364       0.066   0.0048                 34.26
+ 365       0.0546  0.004785714285714286   34.35
+ 366       0.0616  0.004771428571428571   34.45
+ 367       0.0374  0.004757142857142857   34.54
+ 368       0.046   0.004742857142857143   34.64
+ 369       0.0459  0.004728571428571428   34.73
+ 370       0.0648  0.004714285714285714   34.82
+ 371       0.0699  0.0047                 34.92
+ 372       0.0605  0.004685714285714286   35.01
+ 373       0.0704  0.004671428571428571   35.11
+ 374       0.0444  0.004657142857142857   35.2
+ 375       0.062   0.004642857142857143   35.29
+ 376       0.0464  0.004628571428571429   35.39
+ 377       0.0548  0.004614285714285714   35.48
+ 378       0.0555  0.0046                 35.58
+ 379       0.0654  0.004585714285714286   35.67
+ 380       0.0592  0.004571428571428572   35.76
+ 381       0.0521  0.004557142857142858   35.86
+ 382       0.0633  0.004542857142857143   35.95
+ 383       0.047   0.004528571428571429   36.05
+ 384       0.0476  0.004514285714285714   36.14
+ 385       0.051   0.0045000000000000005  36.24
+ 386       0.064   0.004485714285714286   36.33
+ 387       0.0309  0.0044714285714285715  36.42
+ 388       0.0632  0.0044571428571428574  36.52
+ 389       0.0583  0.004442857142857143   36.61
+ 390       0.0524  0.004428571428571428   36.71
+ 391       0.0574  0.004414285714285714   36.8
+ 392       0.043   0.0044                 36.89
+ 393       0.0482  0.004385714285714285   36.99
+ 394       0.0585  0.004371428571428572   37.08
+ 395       0.0467  0.004357142857142857   37.18
+ 396       0.0498  0.004342857142857143   37.27
+ 397       0.0578  0.004328571428571429   37.36
+ 398       0.0469  0.004314285714285714   37.46
+ 399       0.0447  0.0043                 37.55
+ 400       0.0669  0.004285714285714286   37.65
+Saving PrefixEncoder
+[INFO|configuration_utils.py:460] 2023-12-02 17:15:02,214 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/config.json
+[INFO|configuration_utils.py:544] 2023-12-02 17:15:02,215 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/generation_config.json
+[INFO|modeling_utils.py:2118] 2023-12-02 17:15:02,222 >> Model weights saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/pytorch_model.bin
+[INFO|tokenization_utils_base.py:2437] 2023-12-02 17:15:02,223 >> tokenizer config file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-12-02 17:15:02,223 >> Special tokens file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/special_tokens_map.json
+/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
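The learning_rate column is exactly reproducible: it drops by 1/70000 per step, i.e. a linear decay from the run's base rate of 1e-2 (the "1e-2" suffix in the output directory name) to zero over the 700 planned steps, consistent with the Hugging Face Trainer's default linear schedule with no warmup. A quick check against values logged above:

```python
# Linear decay with no warmup: lr(step) = base_lr * (1 - step / max_steps).
base_lr, max_steps = 1e-2, 700

def lr(step: int) -> float:
    return base_lr * (1 - step / max_steps)

# Spot-check against rows in the log (float noise stays far below 1e-12).
assert abs(lr(200) - 0.0071428571428571435) < 1e-12
assert abs(lr(350) - 0.005) < 1e-12
assert abs(lr(400) - 0.004285714285714286) < 1e-12
```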
+ warnings.warn( + 57%|█████████████████████████████████████████████████████▊ | 401/700 [1:21:20<1:00:41, 12.18s/it] {'loss': 0.0556, 'learning_rate': 0.004271428571428572, 'epoch': 37.74} + 57%|█████████████████████████████████████████████████████▊ | 401/700 [1:21:20<1:00:41, 12.18s/it] 57%|█████████████████████████████████████████████████████▉ | 402/700 [1:21:32<1:00:27, 12.17s/it] {'loss': 0.0408, 'learning_rate': 0.004257142857142857, 'epoch': 37.84} + 57%|█████████████████████████████████████████████████████▉ | 402/700 [1:21:32<1:00:27, 12.17s/it] 58%|██████████████████████████████████████████████████████ | 403/700 [1:21:45<1:00:14, 12.17s/it] {'loss': 0.043, 'learning_rate': 0.004242857142857143, 'epoch': 37.93} + 58%|██████████████████████████████████████████████████████ | 403/700 [1:21:45<1:00:14, 12.17s/it] 58%|██████████████████████████████████████████████████████▎ | 404/700 [1:21:57<1:00:02, 12.17s/it] {'loss': 0.058, 'learning_rate': 0.004228571428571429, 'epoch': 38.02} + 58%|██████████████████████████████████████████████████████▎ | 404/700 [1:21:57<1:00:02, 12.17s/it] 58%|███████████████████████████████████████████████████████▌ | 405/700 [1:22:09<59:49, 12.17s/it] {'loss': 0.0605, 'learning_rate': 0.004214285714285715, 'epoch': 38.12} + 58%|███████████████████████████████████████████████████████▌ | 405/700 [1:22:09<59:49, 12.17s/it] 58%|███████████████████████████████████████████████████████▋ | 406/700 [1:22:21<59:37, 12.17s/it] {'loss': 0.0502, 'learning_rate': 0.0042, 'epoch': 38.21} + 58%|███████████████████████████████████████████████████████▋ | 406/700 [1:22:21<59:37, 12.17s/it] 58%|███████████████████████████████████████████████████████▊ | 407/700 [1:22:33<59:24, 12.17s/it] {'loss': 0.054, 'learning_rate': 0.004185714285714286, 'epoch': 38.31} + 58%|███████████████████████████████████████████████████████▊ | 407/700 [1:22:33<59:24, 12.17s/it] 58%|███████████████████████████████████████████████████████▉ | 408/700 [1:22:45<59:12, 12.17s/it] {'loss': 0.0421, 'learning_rate': 0.004171428571428572, 'epoch': 38.4} + 58%|███████████████████████████████████████████████████████▉ | 408/700 [1:22:45<59:12, 12.17s/it] 58%|████████████████████████████████████████████████████████ | 409/700 [1:22:58<59:00, 12.17s/it] {'loss': 0.0446, 'learning_rate': 0.0041571428571428575, 'epoch': 38.49} + 58%|████████████████████████████████████████████████████████ | 409/700 [1:22:58<59:00, 12.17s/it] 59%|████████████████████████████████████████████████████████▏ | 410/700 [1:23:10<58:47, 12.17s/it] {'loss': 0.0529, 'learning_rate': 0.0041428571428571434, 'epoch': 38.59} + 59%|████████████████████████████████████████████████████████▏ | 410/700 [1:23:10<58:47, 12.17s/it] 59%|████████████████████████████████████████████████████████▎ | 411/700 [1:23:22<58:36, 12.17s/it] {'loss': 0.0547, 'learning_rate': 0.0041285714285714285, 'epoch': 38.68} + 59%|████████████████████████████████████████████████████████▎ | 411/700 [1:23:22<58:36, 12.17s/it] 59%|████████████████████████████████████████████████████████▌ | 412/700 [1:23:34<58:23, 12.17s/it] {'loss': 0.0537, 'learning_rate': 0.004114285714285714, 'epoch': 38.78} + 59%|████████████████████████████████████████████████████████▌ | 412/700 [1:23:34<58:23, 12.17s/it] 59%|████████████████████████████████████████████████████████▋ | 413/700 [1:23:46<58:11, 12.17s/it] {'loss': 0.0609, 'learning_rate': 0.0040999999999999995, 'epoch': 38.87} + 59%|████████████████████████████████████████████████████████▋ | 413/700 [1:23:46<58:11, 12.17s/it] 
59%|████████████████████████████████████████████████████████▊ | 414/700 [1:23:58<57:58, 12.16s/it] {'loss': 0.0508, 'learning_rate': 0.004085714285714286, 'epoch': 38.96} + 59%|████████████████████████████████████████████████████████▊ | 414/700 [1:23:58<57:58, 12.16s/it] 59%|████████████████████████████████████████████████████████▉ | 415/700 [1:24:11<57:46, 12.16s/it] {'loss': 0.0446, 'learning_rate': 0.004071428571428571, 'epoch': 39.06} + 59%|████████████████████████████████████████████████████████▉ | 415/700 [1:24:11<57:46, 12.16s/it] 59%|█████████████████████████████████████████████████████████ | 416/700 [1:24:23<57:34, 12.16s/it] {'loss': 0.0503, 'learning_rate': 0.004057142857142857, 'epoch': 39.15} + 59%|█████████████████████████████████████████████████████████ | 416/700 [1:24:23<57:34, 12.16s/it] 60%|█████████████████████████████████████████████████████████▏ | 417/700 [1:24:35<57:22, 12.16s/it] {'loss': 0.0418, 'learning_rate': 0.004042857142857143, 'epoch': 39.25} + 60%|█████████████████████████████████████████████████████████▏ | 417/700 [1:24:35<57:22, 12.16s/it] 60%|█████████████████████████████████████████████████████████▎ | 418/700 [1:24:47<57:10, 12.16s/it] {'loss': 0.0503, 'learning_rate': 0.004028571428571428, 'epoch': 39.34} + 60%|█████████████████████████████████████████████████████████▎ | 418/700 [1:24:47<57:10, 12.16s/it] 60%|█████████████████████████████████████████████████████████▍ | 419/700 [1:24:59<56:58, 12.16s/it] {'loss': 0.0597, 'learning_rate': 0.004014285714285714, 'epoch': 39.44} + 60%|█████████████████████████████████████████████████████████▍ | 419/700 [1:24:59<56:58, 12.16s/it] 60%|█████████████████████████████████████████████████████████▌ | 420/700 [1:25:11<56:45, 12.16s/it] {'loss': 0.0443, 'learning_rate': 0.004, 'epoch': 39.53} + 60%|█████████████████████████████████████████████████████████▌ | 420/700 [1:25:11<56:45, 12.16s/it] 60%|█████████████████████████████████████████████████████████▋ | 421/700 [1:25:23<56:33, 12.16s/it] {'loss': 0.0495, 'learning_rate': 0.003985714285714286, 'epoch': 39.62} + 60%|█████████████████████████████████████████████████████████▋ | 421/700 [1:25:23<56:33, 12.16s/it] 60%|█████████████████████████████████████████████████████████▊ | 422/700 [1:25:36<56:21, 12.16s/it] {'loss': 0.0435, 'learning_rate': 0.003971428571428571, 'epoch': 39.72} + 60%|█████████████████████████████████████████████████████████▊ | 422/700 [1:25:36<56:21, 12.16s/it] 60%|██████████████████████████████████████████████████████████ | 423/700 [1:25:48<56:09, 12.16s/it] {'loss': 0.0528, 'learning_rate': 0.003957142857142858, 'epoch': 39.81} + 60%|██████████████████████████████████████████████████████████ | 423/700 [1:25:48<56:09, 12.16s/it] 61%|██████████████████████████████████████████████████████████▏ | 424/700 [1:26:00<55:57, 12.16s/it] {'loss': 0.0457, 'learning_rate': 0.003942857142857143, 'epoch': 39.91} + 61%|██████████████████████████████████████████████████████████▏ | 424/700 [1:26:00<55:57, 12.16s/it] 61%|██████████████████████████████████████████████████████████▎ | 425/700 [1:26:12<55:45, 12.16s/it] {'loss': 0.0491, 'learning_rate': 0.003928571428571429, 'epoch': 40.0} + 61%|██████████████████████████████████████████████████████████▎ | 425/700 [1:26:12<55:45, 12.16s/it] 61%|██████████████████████████████████████████████████████████▍ | 426/700 [1:26:24<55:32, 12.16s/it] {'loss': 0.0514, 'learning_rate': 0.003914285714285714, 'epoch': 40.09} + 61%|██████████████████████████████████████████████████████████▍ | 426/700 [1:26:24<55:32, 12.16s/it] 
61%|██████████████████████████████████████████████████████████▌ | 427/700 [1:26:36<55:20, 12.16s/it] {'loss': 0.0389, 'learning_rate': 0.0039000000000000003, 'epoch': 40.19} + 61%|██████████████████████████████████████████████████████████▌ | 427/700 [1:26:36<55:20, 12.16s/it] 61%|██████████████████████████████████████████████████████████▋ | 428/700 [1:26:49<55:08, 12.16s/it] {'loss': 0.0415, 'learning_rate': 0.0038857142857142857, 'epoch': 40.28} + 61%|██████████████████████████████████████████████████████████▋ | 428/700 [1:26:49<55:08, 12.16s/it] 61%|██████████████████████████████████████████████████████████▊ | 429/700 [1:27:01<54:56, 12.16s/it] {'loss': 0.0508, 'learning_rate': 0.0038714285714285712, 'epoch': 40.38} + 61%|██████████████████████████████████████████████████████████▊ | 429/700 [1:27:01<54:56, 12.16s/it] 61%|██████████████████████████████████████████████████████████▉ | 430/700 [1:27:13<54:44, 12.16s/it] {'loss': 0.0467, 'learning_rate': 0.0038571428571428576, 'epoch': 40.47} + 61%|██████████████████████████████████████████████████████████▉ | 430/700 [1:27:13<54:44, 12.16s/it] \ No newline at end of file