diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..50d927dc68b4eaa40bd4812b7417b3f2bd61f599 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm3-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 8192, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/configuration_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/configuration_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..35600185f5a26951081de0f3a41a913eaf06af99 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/configuration_chatglm.py @@ -0,0 +1,61 @@ +from transformers import PretrainedConfig + + +class ChatGLMConfig(PretrainedConfig): + model_type = "chatglm" + def __init__( + self, + num_layers=28, + padded_vocab_size=65024, + hidden_size=4096, + ffn_hidden_size=13696, + kv_channels=128, + num_attention_heads=32, + seq_length=2048, + hidden_dropout=0.0, + classifier_dropout=None, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + rmsnorm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, + add_bias_linear=False, + add_qkv_bias=False, + bias_dropout_fusion=True, + multi_query_attention=False, + multi_query_group_num=1, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=True, + fp32_residual_connection=False, + quantization_bit=0, + pre_seq_len=None, + prefix_projection=False, + **kwargs + ): + self.num_layers = num_layers + self.vocab_size = padded_vocab_size + self.padded_vocab_size = padded_vocab_size + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.kv_channels = kv_channels + self.num_attention_heads = num_attention_heads + self.seq_length = seq_length + self.hidden_dropout = hidden_dropout + self.classifier_dropout = classifier_dropout + self.attention_dropout = attention_dropout + self.layernorm_epsilon = layernorm_epsilon + self.rmsnorm = rmsnorm + self.apply_residual_connection_post_layernorm = 
apply_residual_connection_post_layernorm + self.post_layer_norm = post_layer_norm + self.add_bias_linear = add_bias_linear + self.add_qkv_bias = add_qkv_bias + self.bias_dropout_fusion = bias_dropout_fusion + self.multi_query_attention = multi_query_attention + self.multi_query_group_num = multi_query_group_num + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.fp32_residual_connection = fp32_residual_connection + self.quantization_bit = quantization_bit + self.pre_seq_len = pre_seq_len + self.prefix_projection = prefix_projection + super().__init__(**kwargs) \ No newline at end of file diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/generation_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31d22ad9418a1daba6b2bbf472ac3762cd5ce643 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.34.0" +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/modeling_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/modeling_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b5027587016090a377f25289284b6e4f829cb4 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/modeling_chatglm.py @@ -0,0 +1,1293 @@ +""" PyTorch ChatGLM model. """ + +import math +import copy +import warnings +import re +import sys + +import torch +import torch.utils.checkpoint +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss +from torch.nn.utils import skip_init +from typing import Optional, Tuple, Union, List, Callable, Dict, Any +from copy import deepcopy + +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.generation.logits_process import LogitsProcessor +from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput + +from .configuration_chatglm import ChatGLMConfig + +# flags required to enable jit fusion kernels + +if sys.platform != 'darwin': + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" +_CONFIG_FOR_DOC = "ChatGLMConfig" + +CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "THUDM/chatglm3-6b", + # See all ChatGLM models at https://huggingface.co/models?filter=chatglm +] + + +def default_init(cls, *args, **kwargs): + return cls(*args, **kwargs) + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class PrefixEncoder(torch.nn.Module): + """ + The torch.nn model to encode the prefix + Input shape: (batch-size, prefix-length) + Output shape: (batch-size, prefix-length, 2*layers*hidden) + """ + + def __init__(self, config: 
ChatGLMConfig): + super().__init__() + self.prefix_projection = config.prefix_projection + if self.prefix_projection: + # Use a two-layer MLP to encode the prefix + kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 + self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) + self.trans = torch.nn.Sequential( + torch.nn.Linear(kv_size, config.hidden_size), + torch.nn.Tanh(), + torch.nn.Linear(config.hidden_size, kv_size) + ) + else: + self.embedding = torch.nn.Embedding(config.pre_seq_len, + config.num_layers * config.kv_channels * config.multi_query_group_num * 2) + + def forward(self, prefix: torch.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.trans(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, original_impl=False, device=None, dtype=None): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + + def forward_impl( + self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 + ): + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
+ """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).float() + + cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if dtype in (torch.float16, torch.bfloat16, torch.int8): + cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl( + max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [sq, b, np, hn] + sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:sq] + xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) + rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +class RMSNorm(torch.nn.Module): + def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + return (self.weight * hidden_states).to(input_dtype) + + +class CoreAttention(torch.nn.Module): + def __init__(self, config: ChatGLMConfig, layer_number): + super(CoreAttention, self).__init__() + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. 
+ self.hidden_size_per_partition = projection_size + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + self.coeff = coeff + + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + pytorch_major_version = int(torch.__version__.split('.')[0]) + if pytorch_major_version >= 2: + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + is_causal=True) + else: + if attention_mask is not None: + attention_mask = ~attention_mask + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + attention_mask) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + else: + # Raw attention scores + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = torch.empty( + output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, + device=query_layer.device + ) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + if self.attention_softmax_in_fp32: + attention_scores = attention_scores.float() + if self.coeff is not None: + attention_scores = attention_scores * self.coeff + if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: + attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], + device=attention_scores.device, dtype=torch.bool) + attention_mask.tril_() + attention_mask = ~attention_mask + if attention_mask is not None: + attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = attention_probs.type_as(value_layer) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. 
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class SelfAttention(torch.nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + + self.projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + self.multi_query_attention = config.multi_query_attention + self.qkv_hidden_size = 3 * self.projection_size + if self.multi_query_attention: + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.qkv_hidden_size = ( + self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num + ) + self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, + bias=config.add_bias_linear or config.add_qkv_bias, + device=device, **_config_to_kwargs(config) + ) + + self.core_attention = CoreAttention(config, self.layer_number) + + # Output. + self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, + device=device, **_config_to_kwargs(config) + ) + + def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): + if self.multi_query_attention: + num_attention_heads = self.num_multi_query_groups_per_partition + else: + num_attention_heads = self.num_attention_heads_per_partition + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=dtype, + device=device, + ) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True + ): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. 
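+ # (No buffer is actually pre-allocated in this forward pass; the cache
+ #  grows by concatenating new key/value layers onto kv_cache further below.)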
+ # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = torch.cat((cache_k, key_layer), dim=0) + value_layer = torch.cat((cache_v, value_layer), dim=0) + if use_cache: + kv_cache = (key_layer, value_layer) + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = key_layer.unsqueeze(-2) + key_layer = key_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.unsqueeze(-2) + value_layer = value_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def _config_to_kwargs(args): + common_kwargs = { + "dtype": args.torch_dtype, + } + return common_kwargs + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMConfig, device=None): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear( + config.hidden_size, + config.ffn_hidden_size * 2, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.activation_func = swiglu + + # Project back to h. + self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(torch.nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # Self attention. + self.self_attention = SelfAttention(config, layer_number, device=device) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # MLP + self.mlp = MLP(config, device=device) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, + attention_mask, + rotary_pos_emb, + kv_cache=kv_cache, + use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + + return output, kv_cache + + +class GLMTransformer(torch.nn.Module): + """Transformer class.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(GLMTransformer, self).__init__() + + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. 
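+ # Layer numbers are 1-based (build_layer(i + 1) below), so CoreAttention's
+ # query-key layer scaling never scales scores by a zero layer_number.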
+ def build_layer(layer_number): + return GLMBlock(config, layer_number, device=device) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + self.gradient_checkpointing = False + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_layers)] + presents = () if use_cache else None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + for index in range(self.num_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + if self.gradient_checkpointing and self.training: + layer_ret = torch.utils.checkpoint.checkpoint( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches[index], + use_cache + ) + else: + layer_ret = layer( + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache + ) + hidden_states, kv_cache = layer_ret + if use_cache: + presents = presents + (kv_cache,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. 
+ """ + + is_parallelizable = False + supports_gradient_checkpointing = True + config_class = ChatGLMConfig + base_model_prefix = "transformer" + _no_split_modules = ["GLMBlock"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + return + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) + full_attention_mask.tril_() + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, + device=input_ids.device), full_attention_mask), dim=-1) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) + if not past_length and padding_mask is not None: + full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + def get_position_ids(self, input_ids, device): + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + return position_ids + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GLMTransformer): + module.gradient_checkpointing = value + + +class Embedding(torch.nn.Module): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + # Word embeddings (parallel). + self.word_embeddings = nn.Embedding( + config.padded_vocab_size, + self.hidden_size, + dtype=config.torch_dtype, + device=device + ) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + # If the input flag for fp32 residual connection is set, convert for float. 
+ if self.fp32_residual_connection: + embeddings = embeddings.float() + return embeddings + + +class ChatGLMModel(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): + super().__init__(config) + if empty_init: + init_method = skip_init + else: + init_method = default_init + init_kwargs = {} + if device is not None: + init_kwargs["device"] = device + self.embedding = init_method(Embedding, config, **init_kwargs) + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + + # Rotary positional embeddings + self.seq_length = config.seq_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, + dtype=config.torch_dtype) + self.encoder = init_method(GLMTransformer, config, **init_kwargs) + self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, + dtype=config.torch_dtype, **init_kwargs) + self.pre_seq_len = config.pre_seq_len + self.prefix_projection = config.prefix_projection + if self.pre_seq_len is not None: + for param in self.parameters(): + param.requires_grad = False + self.prefix_tokens = torch.arange(self.pre_seq_len).long() + self.prefix_encoder = PrefixEncoder(config) + self.dropout = torch.nn.Dropout(0.1) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def get_prompt(self, batch_size, device, dtype=torch.half): + prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) + past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) + past_key_values = past_key_values.view( + batch_size, + self.pre_seq_len, + self.num_layers * 2, + self.multi_query_group_num, + self.kv_channels + ) + # seq_len, b, nh, hidden_size + past_key_values = self.dropout(past_key_values) + past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) + return past_key_values + + def forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if self.pre_seq_len is not None: + if past_key_values is None: + past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, + dtype=inputs_embeds.dtype) + if attention_mask is not None: + attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), + attention_mask], dim=-1) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary 
positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() + + # Run encoder. + hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def quantize(self, weight_bit_width: int): + from .quantization import quantize + quantize(self.encoder, weight_bit_width) + return self + + +class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.max_sequence_length = config.max_length + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + self.config = config + self.quantized = False + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat( + [position_ids, new_position_id], dim=-1 + ) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids, device=input_ids.device) + if not is_first_forward: + if past_key_values is not None: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": use_cache + } + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: 
Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + if return_last_logit: + hidden_states = hidden_states[-1:] + lm_logits = self.transformer.output_layer(hidden_states) + lm_logits = lm_logits.transpose(0, 1).contiguous() + + loss = None + if labels is not None: + lm_logits = lm_logits.to(torch.float32) + + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + lm_logits = lm_logits.to(hidden_states.dtype) + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache( + past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. 
+ """ + return tuple( + ( + layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), + layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), + ) + for layer_past in past + ) + + def process_response(self, output, history): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content, history + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + inputs = tokenizer.build_chat_input(query, history=history, role=role) + inputs = inputs.to(self.device) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + history.append({"role": role, "content": query}) + response, history = self.process_response(response, history) + return response, history + + @torch.inference_mode() + def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, + logits_processor=None, return_past_key_values=False, **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + if past_key_values is None: + inputs = tokenizer.build_chat_input(query, history=history, role=role) + else: + inputs = tokenizer.build_chat_input(query, role=role) + inputs = inputs.to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[0] + if self.transformer.pre_seq_len is not None: + past_length -= self.transformer.pre_seq_len + inputs.position_ids += past_length + attention_mask = inputs.attention_mask + attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) + inputs['attention_mask'] = attention_mask + history.append({"role": role, "content": query}) + for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **gen_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + if response and response[-1] != "�": + response, new_history = self.process_response(response, history) + if return_past_key_values: + yield response, new_history, past_key_values + else: + yield response, new_history + + @torch.inference_mode() + def stream_generate( + self, + input_ids, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values=False, + **kwargs, + ): + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + + if generation_config is None: + generation_config = self.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + model_kwargs["use_cache"] = generation_config.use_cache + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if has_default_max_length and generation_config.max_new_tokens is None: + warnings.warn( + f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " + "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + " recommend using `max_new_tokens` to control the maximum length of the generation.", + UserWarning, + ) + elif generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + if not has_default_max_length: + logger.warn( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", + UserWarning, + ) + + if input_ids_seq_length >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + + # 2. 
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + logits_warper = self._get_logits_warper(generation_config) + + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + while True: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break + + def quantize(self, bits: int, empty_init=False, device=None, **kwargs): + if bits == 0: + return + + from .quantization import quantize + + if self.quantized: + logger.info("Already quantized.") + return self + + self.quantized = True + + self.config.quantization_bit = bits + + self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, + **kwargs) + return self + + +class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.num_labels = config.num_labels + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + + self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) + if config.classifier_dropout is not None: + self.dropout = nn.Dropout(config.classifier_dropout) + else: + self.dropout = None + self.config = config + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: 
Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + pooled_hidden_states = hidden_states[-1] + if self.dropout is not None: + pooled_hidden_states = self.dropout(pooled_hidden_states) + logits = self.classifier_head(pooled_hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze().float(), labels.squeeze()) + else: + loss = loss_fct(logits.float(), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/optimizer.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..537a0c51543a9a8284ca138a0b0bac68293ea7d5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1536e16847acaf3fcab9fe3cbd51c33a222333a9b1fa9bc163ccff4761e8e877 +size 14682210 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/pytorch_model.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7bc1511008eca1551f96ae55b62e99413c42561a --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc22e1b75d012bb3ba2c92368d8b6364584c2af351af3dba685befac8e853db +size 7341306 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/quantization.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..cb95bfe82b203ff6a2aa962326d2c7a438d6a52f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/quantization.py @@ -0,0 
+1,188 @@ +from torch.nn import Linear +from torch.nn.parameter import Parameter + +import bz2 +import torch +import base64 +import ctypes +from transformers.utils import logging + +from typing import List +from functools import partial + +logger = logging.get_logger(__name__) + +try: + from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up + + class Kernel: + def __init__(self, code: bytes, function_names: List[str]): + self.code = code + self._function_names = function_names + self._cmodule = LazyKernelCModule(self.code) + + for name in self._function_names: + setattr(self, name, KernelFunction(self._cmodule, name)) + + quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl
4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzf
ZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ" + + kernels = Kernel( + bz2.decompress(base64.b64decode(quantization_code)), + [ + "int4WeightCompression", + "int4WeightExtractionFloat", + "int4WeightExtractionHalf", + "int8WeightExtractionFloat", + "int8WeightExtractionHalf", + ], + ) +except Exception as exception: + kernels = None + logger.warning("Failed to load cpm_kernels:" + str(exception)) + + +class W8A16Linear(torch.autograd.Function): + @staticmethod + def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): + ctx.inp_shape = inp.size() + ctx.weight_bit_width = weight_bit_width + out_features = quant_w.size(0) + inp = inp.contiguous().view(-1, inp.size(-1)) + weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) + ctx.weight_shape = weight.size() + output = inp.mm(weight.t()) + ctx.save_for_backward(inp, quant_w, scale_w) + return output.view(*(ctx.inp_shape[:-1] + (out_features,))) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + inp, quant_w, scale_w = ctx.saved_tensors + weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) + grad_output = grad_output.contiguous().view(-1, weight.size(0)) + grad_input = grad_output.mm(weight) + grad_weight = grad_output.t().mm(inp) + return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None + + +def compress_int4_weight(weight: torch.Tensor): # (n, m) + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + assert m % 2 == 0 + m = m // 2 + out = torch.empty(n, m, dtype=torch.int8, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + kernels.int4WeightCompression( + gridDim, + blockDim, + 0, + stream, + [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], + ) + return out + + +def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): + assert scale_list.dtype in [torch.half, torch.bfloat16] + assert weight.dtype in [torch.int8] + if source_bit_width == 8: + return weight.to(scale_list.dtype) * scale_list[:, None] + elif source_bit_width == 4: + func = ( + kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16 + ) + else: + assert False, "Unsupported bit-width" + + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = 
(min(round_up(m, 32), 1024), 1, 1) + + func( + gridDim, + blockDim, + 0, + stream, + [ + ctypes.c_void_p(weight.data_ptr()), + ctypes.c_void_p(scale_list.data_ptr()), + ctypes.c_void_p(out.data_ptr()), + ctypes.c_int32(n), + ctypes.c_int32(m), + ], + ) + return out + + +class QuantizedLinear(torch.nn.Module): + def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args, + **kwargs): + super().__init__() + self.weight_bit_width = weight_bit_width + + shape = weight.shape + + if weight is None or empty_init: + self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device) + self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device) + else: + self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1) + self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) + if weight_bit_width == 4: + self.weight = compress_int4_weight(self.weight) + + self.weight = Parameter(self.weight.to(device), requires_grad=False) + self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False) + self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None + + def forward(self, input): + output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width) + if self.bias is not None: + output = output + self.bias + return output + + +def quantize(model, weight_bit_width, empty_init=False, device=None): + """Replace fp16 linear with quantized linear""" + for layer in model.layers: + layer.self_attention.query_key_value = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()), + bias=layer.self_attention.query_key_value.bias, + dtype=layer.self_attention.query_key_value.weight.dtype, + device=layer.self_attention.query_key_value.weight.device if device is None else device, + empty_init=empty_init + ) + layer.self_attention.dense = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()), + bias=layer.self_attention.dense.bias, + dtype=layer.self_attention.dense.weight.dtype, + device=layer.self_attention.dense.weight.device if device is None else device, + empty_init=empty_init + ) + layer.mlp.dense_h_to_4h = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()), + bias=layer.mlp.dense_h_to_4h.bias, + dtype=layer.mlp.dense_h_to_4h.weight.dtype, + device=layer.mlp.dense_h_to_4h.weight.device if device is None else device, + empty_init=empty_init + ) + layer.mlp.dense_4h_to_h = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()), + bias=layer.mlp.dense_4h_to_h.bias, + dtype=layer.mlp.dense_4h_to_h.weight.dtype, + device=layer.mlp.dense_4h_to_h.weight.device if device is None else device, + empty_init=empty_init + ) + + return model diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/rng_state.pth b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..dea498e970c6dc2b029df3494a4952092a985a0f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:770fc5b3f081eaa40d1dc87b17544797be34efa6309211006ba5f9a46a02dfc0 +size 
14244 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/scheduler.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..68047aa06cd43cf8389655f1a7aebf9a795ee470 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfeb060c25a48c544e101802c9544e2b821808664074249db3b359d15595a31d +size 1064 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/special_tokens_map.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenization_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += 
self.sp_model.decode(buffer)
+        return text
+
+    def decode_tokens(self, tokens: List[str]) -> str:
+        text = self.sp_model.DecodePieces(tokens)
+        return text
+
+    def convert_token_to_id(self, token):
+        """ Converts a token (str) to an id using the vocab. """
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        return self.sp_model.PieceToId(token)
+
+    def convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        if index in self.index_special_tokens:
+            return self.index_special_tokens[index]
+        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
+            return ""
+        return self.sp_model.IdToPiece(index)
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "tokenizer.model"}
+
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
+                 **kwargs):
+        self.name = "GLMTokenizer"
+
+        self.vocab_file = vocab_file
+        self.tokenizer = SPTokenizer(vocab_file)
+        self.special_tokens = {
+            "<bos>": self.tokenizer.bos_id,
+            "<eos>": self.tokenizer.eos_id,
+            "<pad>": self.tokenizer.pad_id
+        }
+        self.encode_special_tokens = encode_special_tokens
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                         encode_special_tokens=encode_special_tokens,
+                         **kwargs)
+
+    def get_command(self, token):
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
+        return self.tokenizer.special_tokens[token]
+
+    @property
+    def unk_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token_id(self):
+        return self.get_command("<pad>")
+
+    @property
+    def eos_token(self) -> str:
+        return "</s>"
+
+    @property
+    def eos_token_id(self):
+        return self.get_command("<eos>")
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_words
+
+    def get_vocab(self):
+        """ Returns vocab as a dict """
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text, **kwargs):
+        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) to an id using the vocab. """
+        return self.tokenizer.convert_token_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        return self.tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return self.tokenizer.decode_tokens(tokens)
+
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*):
+                An optional prefix to add to the names of the saved files.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
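+
+        Example (editor's illustration, assuming a local tokenizer.model file
+        and an existing directory ./export):
+
+            >>> tok = ChatGLMTokenizer("tokenizer.model")
+            >>> tok.save_vocabulary("./export")
+            ('./export/tokenizer.model',)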
+        """
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, self.vocab_files_names["vocab_file"]
+            )
+        else:
+            vocab_file = save_directory
+
+        with open(self.vocab_file, 'rb') as fin:
+            proto_str = fin.read()
+
+        with open(vocab_file, "wb") as writer:
+            writer.write(proto_str)
+
+        return (vocab_file,)
+
+    def get_prefix_tokens(self):
+        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
+        return prefix_tokens
+
+    def build_single_message(self, role, metadata, message):
+        assert role in ["system", "user", "assistant", "observation"], role
+        role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
+        message_tokens = self.tokenizer.encode(message)
+        tokens = role_tokens + message_tokens
+        return tokens
+
+    def build_chat_input(self, query, history=None, role="user"):
+        if history is None:
+            history = []
+        input_ids = []
+        for item in history:
+            content = item["content"]
+            if item["role"] == "system" and "tools" in item:
+                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
+            input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
+        input_ids.extend(self.build_single_message(role, "", query))
+        input_ids.extend([self.get_command("<|assistant|>")])
+        return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)
+
+    def build_inputs_with_special_tokens(
+            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A ChatGLM sequence has the following format:
+
+        - single sequence: `[gMASK] sop X`
+        - pair of sequences: `[gMASK] sop A B <eos>`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        prefix_tokens = self.get_prefix_tokens()
+        token_ids_0 = prefix_tokens + token_ids_0
+        if token_ids_1 is not None:
+            token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
+        return token_ids_0
+
+    def _pad(
+            self,
+            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+            max_length: Optional[int] = None,
+            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+            pad_to_multiple_of: Optional[int] = None,
+            return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer.model b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5faafd38f2e2934804feb0e7d71ebf08b0839bf5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer_config.json @@ -0,0 +1,18 @@ +{ + "added_tokens_decoder": {}, + "additional_special_tokens": [], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/trainer_state.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fa720c7b50086a8a29af2806f21a94be85cc04a0 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/trainer_state.json @@ -0,0 +1,619 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.411764705882353, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "learning_rate": 0.009985714285714285, + "loss": 2.6971, + "step": 1 + }, + { + "epoch": 
0.19, + "learning_rate": 0.009971428571428572, + "loss": 2.3927, + "step": 2 + }, + { + "epoch": 0.28, + "learning_rate": 0.009957142857142857, + "loss": 2.2539, + "step": 3 + }, + { + "epoch": 0.38, + "learning_rate": 0.009942857142857144, + "loss": 2.1408, + "step": 4 + }, + { + "epoch": 0.47, + "learning_rate": 0.009928571428571429, + "loss": 2.2672, + "step": 5 + }, + { + "epoch": 0.56, + "learning_rate": 0.009914285714285714, + "loss": 1.6433, + "step": 6 + }, + { + "epoch": 0.66, + "learning_rate": 0.0099, + "loss": 2.1405, + "step": 7 + }, + { + "epoch": 0.75, + "learning_rate": 0.009885714285714286, + "loss": 2.1464, + "step": 8 + }, + { + "epoch": 0.85, + "learning_rate": 0.009871428571428571, + "loss": 1.8498, + "step": 9 + }, + { + "epoch": 0.94, + "learning_rate": 0.009857142857142858, + "loss": 1.6896, + "step": 10 + }, + { + "epoch": 1.04, + "learning_rate": 0.009842857142857143, + "loss": 2.1932, + "step": 11 + }, + { + "epoch": 1.13, + "learning_rate": 0.00982857142857143, + "loss": 1.8236, + "step": 12 + }, + { + "epoch": 1.22, + "learning_rate": 0.009814285714285715, + "loss": 1.735, + "step": 13 + }, + { + "epoch": 1.32, + "learning_rate": 0.0098, + "loss": 1.7488, + "step": 14 + }, + { + "epoch": 1.41, + "learning_rate": 0.009785714285714285, + "loss": 1.8336, + "step": 15 + }, + { + "epoch": 1.51, + "learning_rate": 0.009771428571428572, + "loss": 1.9438, + "step": 16 + }, + { + "epoch": 1.6, + "learning_rate": 0.009757142857142858, + "loss": 1.7178, + "step": 17 + }, + { + "epoch": 1.69, + "learning_rate": 0.009742857142857143, + "loss": 1.5714, + "step": 18 + }, + { + "epoch": 1.79, + "learning_rate": 0.009728571428571428, + "loss": 1.537, + "step": 19 + }, + { + "epoch": 1.88, + "learning_rate": 0.009714285714285715, + "loss": 1.6764, + "step": 20 + }, + { + "epoch": 1.98, + "learning_rate": 0.0097, + "loss": 1.8919, + "step": 21 + }, + { + "epoch": 2.07, + "learning_rate": 0.009685714285714285, + "loss": 1.346, + "step": 22 + }, + { + "epoch": 2.16, + "learning_rate": 0.009671428571428572, + "loss": 1.5036, + "step": 23 + }, + { + "epoch": 2.26, + "learning_rate": 0.009657142857142857, + "loss": 1.6788, + "step": 24 + }, + { + "epoch": 2.35, + "learning_rate": 0.009642857142857144, + "loss": 1.6667, + "step": 25 + }, + { + "epoch": 2.45, + "learning_rate": 0.009628571428571429, + "loss": 1.7153, + "step": 26 + }, + { + "epoch": 2.54, + "learning_rate": 0.009614285714285714, + "loss": 1.601, + "step": 27 + }, + { + "epoch": 2.64, + "learning_rate": 0.0096, + "loss": 1.3002, + "step": 28 + }, + { + "epoch": 2.73, + "learning_rate": 0.009585714285714286, + "loss": 1.3294, + "step": 29 + }, + { + "epoch": 2.82, + "learning_rate": 0.009571428571428573, + "loss": 1.7477, + "step": 30 + }, + { + "epoch": 2.92, + "learning_rate": 0.009557142857142858, + "loss": 1.7961, + "step": 31 + }, + { + "epoch": 3.01, + "learning_rate": 0.009542857142857143, + "loss": 1.4954, + "step": 32 + }, + { + "epoch": 3.11, + "learning_rate": 0.009528571428571428, + "loss": 1.6452, + "step": 33 + }, + { + "epoch": 3.2, + "learning_rate": 0.009514285714285715, + "loss": 1.3528, + "step": 34 + }, + { + "epoch": 3.29, + "learning_rate": 0.0095, + "loss": 1.4811, + "step": 35 + }, + { + "epoch": 3.39, + "learning_rate": 0.009485714285714287, + "loss": 1.4738, + "step": 36 + }, + { + "epoch": 3.48, + "learning_rate": 0.009471428571428572, + "loss": 1.174, + "step": 37 + }, + { + "epoch": 3.58, + "learning_rate": 0.009457142857142857, + "loss": 1.2346, + "step": 38 + }, + { + "epoch": 3.67, + 
"learning_rate": 0.009442857142857143, + "loss": 1.5327, + "step": 39 + }, + { + "epoch": 3.76, + "learning_rate": 0.009428571428571429, + "loss": 1.5249, + "step": 40 + }, + { + "epoch": 3.86, + "learning_rate": 0.009414285714285714, + "loss": 1.5086, + "step": 41 + }, + { + "epoch": 3.95, + "learning_rate": 0.0094, + "loss": 1.8425, + "step": 42 + }, + { + "epoch": 4.05, + "learning_rate": 0.009385714285714287, + "loss": 1.1943, + "step": 43 + }, + { + "epoch": 4.14, + "learning_rate": 0.009371428571428572, + "loss": 1.6835, + "step": 44 + }, + { + "epoch": 4.24, + "learning_rate": 0.009357142857142857, + "loss": 1.75, + "step": 45 + }, + { + "epoch": 4.33, + "learning_rate": 0.009342857142857142, + "loss": 1.2561, + "step": 46 + }, + { + "epoch": 4.42, + "learning_rate": 0.009328571428571429, + "loss": 1.3784, + "step": 47 + }, + { + "epoch": 4.52, + "learning_rate": 0.009314285714285714, + "loss": 1.2538, + "step": 48 + }, + { + "epoch": 4.61, + "learning_rate": 0.009300000000000001, + "loss": 1.4429, + "step": 49 + }, + { + "epoch": 4.71, + "learning_rate": 0.009285714285714286, + "loss": 1.3687, + "step": 50 + }, + { + "epoch": 4.8, + "learning_rate": 0.009271428571428571, + "loss": 1.1511, + "step": 51 + }, + { + "epoch": 4.89, + "learning_rate": 0.009257142857142858, + "loss": 1.181, + "step": 52 + }, + { + "epoch": 4.99, + "learning_rate": 0.009242857142857143, + "loss": 1.1753, + "step": 53 + }, + { + "epoch": 5.08, + "learning_rate": 0.009228571428571428, + "loss": 1.1562, + "step": 54 + }, + { + "epoch": 5.18, + "learning_rate": 0.009214285714285715, + "loss": 1.2936, + "step": 55 + }, + { + "epoch": 5.27, + "learning_rate": 0.0092, + "loss": 1.3591, + "step": 56 + }, + { + "epoch": 5.36, + "learning_rate": 0.009185714285714287, + "loss": 1.1376, + "step": 57 + }, + { + "epoch": 5.46, + "learning_rate": 0.009171428571428572, + "loss": 1.372, + "step": 58 + }, + { + "epoch": 5.55, + "learning_rate": 0.009157142857142857, + "loss": 1.5141, + "step": 59 + }, + { + "epoch": 5.65, + "learning_rate": 0.009142857142857144, + "loss": 1.2087, + "step": 60 + }, + { + "epoch": 5.74, + "learning_rate": 0.009128571428571429, + "loss": 1.136, + "step": 61 + }, + { + "epoch": 5.84, + "learning_rate": 0.009114285714285715, + "loss": 1.2948, + "step": 62 + }, + { + "epoch": 5.93, + "learning_rate": 0.0091, + "loss": 1.0592, + "step": 63 + }, + { + "epoch": 6.02, + "learning_rate": 0.009085714285714286, + "loss": 1.2321, + "step": 64 + }, + { + "epoch": 6.12, + "learning_rate": 0.009071428571428572, + "loss": 1.0827, + "step": 65 + }, + { + "epoch": 6.21, + "learning_rate": 0.009057142857142857, + "loss": 1.1136, + "step": 66 + }, + { + "epoch": 6.31, + "learning_rate": 0.009042857142857142, + "loss": 1.475, + "step": 67 + }, + { + "epoch": 6.4, + "learning_rate": 0.009028571428571427, + "loss": 1.1316, + "step": 68 + }, + { + "epoch": 6.49, + "learning_rate": 0.009014285714285714, + "loss": 1.1688, + "step": 69 + }, + { + "epoch": 6.59, + "learning_rate": 0.009000000000000001, + "loss": 1.0882, + "step": 70 + }, + { + "epoch": 6.68, + "learning_rate": 0.008985714285714286, + "loss": 1.1085, + "step": 71 + }, + { + "epoch": 6.78, + "learning_rate": 0.008971428571428571, + "loss": 1.2029, + "step": 72 + }, + { + "epoch": 6.87, + "learning_rate": 0.008957142857142856, + "loss": 1.098, + "step": 73 + }, + { + "epoch": 6.96, + "learning_rate": 0.008942857142857143, + "loss": 1.219, + "step": 74 + }, + { + "epoch": 7.06, + "learning_rate": 0.00892857142857143, + "loss": 1.0092, + "step": 75 + }, + { 
+ "epoch": 7.15, + "learning_rate": 0.008914285714285715, + "loss": 1.0112, + "step": 76 + }, + { + "epoch": 7.25, + "learning_rate": 0.0089, + "loss": 1.1481, + "step": 77 + }, + { + "epoch": 7.34, + "learning_rate": 0.008885714285714287, + "loss": 0.9873, + "step": 78 + }, + { + "epoch": 7.44, + "learning_rate": 0.008871428571428572, + "loss": 1.0586, + "step": 79 + }, + { + "epoch": 7.53, + "learning_rate": 0.008857142857142857, + "loss": 1.1177, + "step": 80 + }, + { + "epoch": 7.62, + "learning_rate": 0.008842857142857142, + "loss": 0.7814, + "step": 81 + }, + { + "epoch": 7.72, + "learning_rate": 0.008828571428571429, + "loss": 1.2043, + "step": 82 + }, + { + "epoch": 7.81, + "learning_rate": 0.008814285714285715, + "loss": 1.0062, + "step": 83 + }, + { + "epoch": 7.91, + "learning_rate": 0.0088, + "loss": 1.0831, + "step": 84 + }, + { + "epoch": 8.0, + "learning_rate": 0.008785714285714286, + "loss": 0.9554, + "step": 85 + }, + { + "epoch": 8.09, + "learning_rate": 0.00877142857142857, + "loss": 1.1674, + "step": 86 + }, + { + "epoch": 8.19, + "learning_rate": 0.008757142857142857, + "loss": 0.8226, + "step": 87 + }, + { + "epoch": 8.28, + "learning_rate": 0.008742857142857144, + "loss": 0.9166, + "step": 88 + }, + { + "epoch": 8.38, + "learning_rate": 0.00872857142857143, + "loss": 0.734, + "step": 89 + }, + { + "epoch": 8.47, + "learning_rate": 0.008714285714285714, + "loss": 0.8641, + "step": 90 + }, + { + "epoch": 8.56, + "learning_rate": 0.0087, + "loss": 0.9517, + "step": 91 + }, + { + "epoch": 8.66, + "learning_rate": 0.008685714285714286, + "loss": 0.9995, + "step": 92 + }, + { + "epoch": 8.75, + "learning_rate": 0.008671428571428571, + "loss": 0.763, + "step": 93 + }, + { + "epoch": 8.85, + "learning_rate": 0.008657142857142858, + "loss": 1.0712, + "step": 94 + }, + { + "epoch": 8.94, + "learning_rate": 0.008642857142857143, + "loss": 1.1111, + "step": 95 + }, + { + "epoch": 9.04, + "learning_rate": 0.008628571428571428, + "loss": 0.9626, + "step": 96 + }, + { + "epoch": 9.13, + "learning_rate": 0.008614285714285715, + "loss": 0.6385, + "step": 97 + }, + { + "epoch": 9.22, + "learning_rate": 0.0086, + "loss": 0.8147, + "step": 98 + }, + { + "epoch": 9.32, + "learning_rate": 0.008585714285714285, + "loss": 0.8109, + "step": 99 + }, + { + "epoch": 9.41, + "learning_rate": 0.008571428571428572, + "loss": 1.0953, + "step": 100 + } + ], + "logging_steps": 1.0, + "max_steps": 700, + "num_train_epochs": 70, + "save_steps": 100, + "total_flos": 1.175174321799168e+17, + "trial_name": null, + "trial_params": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/training_args.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17f9bfbf1a7cdd9e0e808e0672d55ad9ad4efb5f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00669a32a6ddac0a3243bbc04d3f1f70ffc8f89f2626c1fdafa93ce68c311aa0 +size 4664 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..50d927dc68b4eaa40bd4812b7417b3f2bd61f599 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm3-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + 
"apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 8192, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/configuration_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/configuration_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..35600185f5a26951081de0f3a41a913eaf06af99 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/configuration_chatglm.py @@ -0,0 +1,61 @@ +from transformers import PretrainedConfig + + +class ChatGLMConfig(PretrainedConfig): + model_type = "chatglm" + def __init__( + self, + num_layers=28, + padded_vocab_size=65024, + hidden_size=4096, + ffn_hidden_size=13696, + kv_channels=128, + num_attention_heads=32, + seq_length=2048, + hidden_dropout=0.0, + classifier_dropout=None, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + rmsnorm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, + add_bias_linear=False, + add_qkv_bias=False, + bias_dropout_fusion=True, + multi_query_attention=False, + multi_query_group_num=1, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=True, + fp32_residual_connection=False, + quantization_bit=0, + pre_seq_len=None, + prefix_projection=False, + **kwargs + ): + self.num_layers = num_layers + self.vocab_size = padded_vocab_size + self.padded_vocab_size = padded_vocab_size + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.kv_channels = kv_channels + self.num_attention_heads = num_attention_heads + self.seq_length = seq_length + self.hidden_dropout = hidden_dropout + self.classifier_dropout = classifier_dropout + self.attention_dropout = attention_dropout + self.layernorm_epsilon = layernorm_epsilon + self.rmsnorm = rmsnorm + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.post_layer_norm = post_layer_norm + self.add_bias_linear = add_bias_linear + self.add_qkv_bias = add_qkv_bias + self.bias_dropout_fusion = bias_dropout_fusion + self.multi_query_attention = multi_query_attention + self.multi_query_group_num = multi_query_group_num + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.fp32_residual_connection = 
fp32_residual_connection + self.quantization_bit = quantization_bit + self.pre_seq_len = pre_seq_len + self.prefix_projection = prefix_projection + super().__init__(**kwargs) \ No newline at end of file diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/generation_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31d22ad9418a1daba6b2bbf472ac3762cd5ce643 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.34.0" +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/modeling_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/modeling_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b5027587016090a377f25289284b6e4f829cb4 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/modeling_chatglm.py @@ -0,0 +1,1293 @@ +""" PyTorch ChatGLM model. """ + +import math +import copy +import warnings +import re +import sys + +import torch +import torch.utils.checkpoint +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss +from torch.nn.utils import skip_init +from typing import Optional, Tuple, Union, List, Callable, Dict, Any +from copy import deepcopy + +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.generation.logits_process import LogitsProcessor +from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput + +from .configuration_chatglm import ChatGLMConfig + +# flags required to enable jit fusion kernels + +if sys.platform != 'darwin': + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" +_CONFIG_FOR_DOC = "ChatGLMConfig" + +CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "THUDM/chatglm3-6b", + # See all ChatGLM models at https://huggingface.co/models?filter=chatglm +] + + +def default_init(cls, *args, **kwargs): + return cls(*args, **kwargs) + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class PrefixEncoder(torch.nn.Module): + """ + The torch.nn model to encode the prefix + Input shape: (batch-size, prefix-length) + Output shape: (batch-size, prefix-length, 2*layers*hidden) + """ + + def __init__(self, config: ChatGLMConfig): + super().__init__() + self.prefix_projection = config.prefix_projection + if self.prefix_projection: + # Use a two-layer MLP to encode the prefix + kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 + self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) + self.trans = torch.nn.Sequential( + torch.nn.Linear(kv_size, config.hidden_size), + torch.nn.Tanh(), + torch.nn.Linear(config.hidden_size, kv_size) + ) + else: + 
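+            # Editor's note: without the projection, P-Tuning v2 learns the
+            # stacked past key/values directly as one embedding table of shape
+            # [pre_seq_len, num_layers * kv_channels * multi_query_group_num * 2],
+            # i.e. [128, 28 * 128 * 2 * 2] = [128, 14336] for this checkpoint.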
self.embedding = torch.nn.Embedding(config.pre_seq_len, + config.num_layers * config.kv_channels * config.multi_query_group_num * 2) + + def forward(self, prefix: torch.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.trans(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, original_impl=False, device=None, dtype=None): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + + def forward_impl( + self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 + ): + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
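+
+        Editor's summary of the code below: the returned cache has shape
+        [seq_len, n_elem // 2, 2] and stores a (cos, sin) pair for every
+        position/frequency combination.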
+ """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).float() + + cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if dtype in (torch.float16, torch.bfloat16, torch.int8): + cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl( + max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [sq, b, np, hn] + sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:sq] + xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) + rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +class RMSNorm(torch.nn.Module): + def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + return (self.weight * hidden_states).to(input_dtype) + + +class CoreAttention(torch.nn.Module): + def __init__(self, config: ChatGLMConfig, layer_number): + super(CoreAttention, self).__init__() + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. 
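+        # Editor's check against this checkpoint's config.json: with
+        # kv_channels=128 and num_attention_heads=32, projection_size is
+        # 128 * 32 = 4096, and each head covers 4096 // 32 = 128 channels.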
+        self.hidden_size_per_partition = projection_size
+        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
+        self.num_attention_heads_per_partition = config.num_attention_heads
+
+        coeff = None
+        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+        if self.apply_query_key_layer_scaling:
+            coeff = self.layer_number
+            self.norm_factor *= coeff
+        self.coeff = coeff
+
+        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
+
+    def forward(self, query_layer, key_layer, value_layer, attention_mask):
+        pytorch_major_version = int(torch.__version__.split('.')[0])
+        if pytorch_major_version >= 2:
+            query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
+            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
+                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                                 is_causal=True)
+            else:
+                if attention_mask is not None:
+                    attention_mask = ~attention_mask
+                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                                 attention_mask)
+            context_layer = context_layer.permute(2, 0, 1, 3)
+            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+            context_layer = context_layer.reshape(*new_context_layer_shape)
+        else:
+            # Editor's note: fallback path for torch < 2, which lacks
+            # scaled_dot_product_attention.
+
+            # Raw attention scores
+
+            # [b, np, sq, sk]
+            output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
+
+            # [sq, b, np, hn] -> [sq, b * np, hn]
+            query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
+            # [sk, b, np, hn] -> [sk, b * np, hn]
+            key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+
+            # preallocating input tensor: [b * np, sq, sk]
+            matmul_input_buffer = torch.empty(
+                output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
+                device=query_layer.device
+            )
+
+            # Raw attention scores. [b * np, sq, sk]
+            matmul_result = torch.baddbmm(
+                matmul_input_buffer,
+                query_layer.transpose(0, 1),  # [b * np, sq, hn]
+                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+                beta=0.0,
+                alpha=(1.0 / self.norm_factor),
+            )
+
+            # change view to [b, np, sq, sk]
+            attention_scores = matmul_result.view(*output_size)
+
+            # ===========================
+            # Attention probs and dropout
+            # ===========================
+
+            # attention scores and attention mask [b, np, sq, sk]
+            if self.attention_softmax_in_fp32:
+                attention_scores = attention_scores.float()
+            if self.coeff is not None:
+                attention_scores = attention_scores * self.coeff
+            if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
+                attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
+                                            device=attention_scores.device, dtype=torch.bool)
+                attention_mask.tril_()
+                attention_mask = ~attention_mask
+            if attention_mask is not None:
+                attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
+            attention_probs = F.softmax(attention_scores, dim=-1)
+            attention_probs = attention_probs.type_as(value_layer)
+
+            # This is actually dropping out entire tokens to attend to, which might
+            # seem a bit unusual, but is taken from the original Transformer paper.
+            attention_probs = self.attention_dropout(attention_probs)
+            # =========================
+            # Context layer. [sq, b, hp]
+            # =========================
+
+            # value_layer -> context layer.
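+            # Editor's note: the bmm below contracts the source length sk,
+            # [b * np, sq, sk] @ [b * np, sk, hn] -> [b * np, sq, hn].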
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class SelfAttention(torch.nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + + self.projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + self.multi_query_attention = config.multi_query_attention + self.qkv_hidden_size = 3 * self.projection_size + if self.multi_query_attention: + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.qkv_hidden_size = ( + self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num + ) + self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, + bias=config.add_bias_linear or config.add_qkv_bias, + device=device, **_config_to_kwargs(config) + ) + + self.core_attention = CoreAttention(config, self.layer_number) + + # Output. + self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, + device=device, **_config_to_kwargs(config) + ) + + def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): + if self.multi_query_attention: + num_attention_heads = self.num_multi_query_groups_per_partition + else: + num_attention_heads = self.num_attention_heads_per_partition + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=dtype, + device=device, + ) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True + ): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. 
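+        # (Editor's note: despite this header, forward() below never calls
+        #  _allocate_memory; the kv cache is instead grown step by step with
+        #  torch.cat when kv_cache is passed in.)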
+ # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = torch.cat((cache_k, key_layer), dim=0) + value_layer = torch.cat((cache_v, value_layer), dim=0) + if use_cache: + kv_cache = (key_layer, value_layer) + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = key_layer.unsqueeze(-2) + key_layer = key_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.unsqueeze(-2) + value_layer = value_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def _config_to_kwargs(args): + common_kwargs = { + "dtype": args.torch_dtype, + } + return common_kwargs + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMConfig, device=None): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear( + config.hidden_size, + config.ffn_hidden_size * 2, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.activation_func = swiglu + + # Project back to h. + self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(torch.nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # Self attention. + self.self_attention = SelfAttention(config, layer_number, device=device) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # MLP + self.mlp = MLP(config, device=device) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, + attention_mask, + rotary_pos_emb, + kv_cache=kv_cache, + use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + + return output, kv_cache + + +class GLMTransformer(torch.nn.Module): + """Transformer class.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(GLMTransformer, self).__init__() + + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. 
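+        # Editor's note: layers are numbered from 1 (build_layer(i + 1) below),
+        # so CoreAttention can use the layer index as its query-key scaling
+        # coefficient when apply_query_key_layer_scaling is enabled.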
+ def build_layer(layer_number): + return GLMBlock(config, layer_number, device=device) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + self.gradient_checkpointing = False + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_layers)] + presents = () if use_cache else None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + for index in range(self.num_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + if self.gradient_checkpointing and self.training: + layer_ret = torch.utils.checkpoint.checkpoint( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches[index], + use_cache + ) + else: + layer_ret = layer( + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache + ) + hidden_states, kv_cache = layer_ret + if use_cache: + presents = presents + (kv_cache,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. 
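+
+    Editor's note: subclasses inherit the causal-mask helper get_masks and
+    the position-id helper get_position_ids defined below.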
+ """ + + is_parallelizable = False + supports_gradient_checkpointing = True + config_class = ChatGLMConfig + base_model_prefix = "transformer" + _no_split_modules = ["GLMBlock"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + return + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) + full_attention_mask.tril_() + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, + device=input_ids.device), full_attention_mask), dim=-1) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) + if not past_length and padding_mask is not None: + full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + def get_position_ids(self, input_ids, device): + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + return position_ids + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GLMTransformer): + module.gradient_checkpointing = value + + +class Embedding(torch.nn.Module): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + # Word embeddings (parallel). + self.word_embeddings = nn.Embedding( + config.padded_vocab_size, + self.hidden_size, + dtype=config.torch_dtype, + device=device + ) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + # If the input flag for fp32 residual connection is set, convert for float. 
+ if self.fp32_residual_connection: + embeddings = embeddings.float() + return embeddings + + +class ChatGLMModel(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): + super().__init__(config) + if empty_init: + init_method = skip_init + else: + init_method = default_init + init_kwargs = {} + if device is not None: + init_kwargs["device"] = device + self.embedding = init_method(Embedding, config, **init_kwargs) + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + + # Rotary positional embeddings + self.seq_length = config.seq_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, + dtype=config.torch_dtype) + self.encoder = init_method(GLMTransformer, config, **init_kwargs) + self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, + dtype=config.torch_dtype, **init_kwargs) + self.pre_seq_len = config.pre_seq_len + self.prefix_projection = config.prefix_projection + if self.pre_seq_len is not None: + # Prefix tuning (P-tuning v2): freeze every base-model weight; only the + # prefix encoder constructed below receives gradients. + for param in self.parameters(): + param.requires_grad = False + self.prefix_tokens = torch.arange(self.pre_seq_len).long() + self.prefix_encoder = PrefixEncoder(config) + self.dropout = torch.nn.Dropout(0.1) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def get_prompt(self, batch_size, device, dtype=torch.half): + prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) + past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) + past_key_values = past_key_values.view( + batch_size, + self.pre_seq_len, + self.num_layers * 2, + self.multi_query_group_num, + self.kv_channels + ) + # Permute to [num_layers * 2, pre_seq_len, batch, groups, kv_channels] and + # split into per-layer (key, value) pairs. + past_key_values = self.dropout(past_key_values) + past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) + return past_key_values + + def forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if self.pre_seq_len is not None: + if past_key_values is None: + past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, + dtype=inputs_embeds.dtype) + if attention_mask is not None: + attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), + attention_mask], dim=-1) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary
positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() + + # Run encoder. + hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def quantize(self, weight_bit_width: int): + from .quantization import quantize + quantize(self.encoder, weight_bit_width) + return self + + +class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.max_sequence_length = config.max_length + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + self.config = config + self.quantized = False + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat( + [position_ids, new_position_id], dim=-1 + ) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids, device=input_ids.device) + if not is_first_forward: + if past_key_values is not None: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": use_cache + } + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: 
Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + if return_last_logit: + hidden_states = hidden_states[-1:] + lm_logits = self.transformer.output_layer(hidden_states) + lm_logits = lm_logits.transpose(0, 1).contiguous() + + loss = None + if labels is not None: + lm_logits = lm_logits.to(torch.float32) + + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + lm_logits = lm_logits.to(hidden_states.dtype) + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache( + past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. 
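+ + Example (illustrative sketch, not part of the original docstring; assumes + the cache layout [seq_len, batch * num_beams, heads, head_dim] used here): + with num_beams=4 and beam_idx = torch.tensor([2, 2, 0, 1]), each layer's + key and value are re-gathered along dim 1, so the new beam 0 continues the + old beam 2, and so on.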
+ """ + return tuple( + ( + layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), + layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), + ) + for layer_past in past + ) + + def process_response(self, output, history): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content, history + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + inputs = tokenizer.build_chat_input(query, history=history, role=role) + inputs = inputs.to(self.device) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + history.append({"role": role, "content": query}) + response, history = self.process_response(response, history) + return response, history + + @torch.inference_mode() + def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, + logits_processor=None, return_past_key_values=False, **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + if past_key_values is None: + inputs = tokenizer.build_chat_input(query, history=history, role=role) + else: + inputs = tokenizer.build_chat_input(query, role=role) + inputs = inputs.to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[0] + if self.transformer.pre_seq_len is not None: + past_length -= self.transformer.pre_seq_len + inputs.position_ids += past_length + attention_mask = inputs.attention_mask + attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) + inputs['attention_mask'] = attention_mask + history.append({"role": role, "content": query}) + for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **gen_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + if response and response[-1] != "�": # skip yields ending in a partially decoded UTF-8 character + response, new_history = self.process_response(response, history) + if return_past_key_values: + yield response, new_history, past_key_values + else: + yield response, new_history + + @torch.inference_mode() + def stream_generate( + self, + input_ids, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values=False, + **kwargs, + ): + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + + if generation_config is None: + generation_config = self.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + model_kwargs["use_cache"] = generation_config.use_cache + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if has_default_max_length and generation_config.max_new_tokens is None: + warnings.warn( + f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " + "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + " recommend using `max_new_tokens` to control the maximum length of the generation.", + UserWarning, + ) + elif generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + if not has_default_max_length: + # logger.warning (not the deprecated logger.warn) takes no warning + # category, so the stray UserWarning argument was dropped here. + logger.warning( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" + ) + + if input_ids_seq_length >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + + # 2.
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + logits_warper = self._get_logits_warper(generation_config) + + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + while True: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break + + def quantize(self, bits: int, empty_init=False, device=None, **kwargs): + if bits == 0: + return + + from .quantization import quantize + + if self.quantized: + logger.info("Already quantized.") + return self + + self.quantized = True + + self.config.quantization_bit = bits + + self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, + **kwargs) + return self + + +class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.num_labels = config.num_labels + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + + self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) + if config.classifier_dropout is not None: + self.dropout = nn.Dropout(config.classifier_dropout) + else: + self.dropout = None + self.config = config + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: 
Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + pooled_hidden_states = hidden_states[-1] + if self.dropout is not None: + pooled_hidden_states = self.dropout(pooled_hidden_states) + logits = self.classifier_head(pooled_hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze().float(), labels.squeeze()) + else: + loss = loss_fct(logits.float(), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/optimizer.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad44085e9bc8966822377ccaefaf97559b0caa8e --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dd1da17d3b6b12fe5b24290a2d06569fb2d3550d90f5e90e2eb102ea5fe310b +size 14682210 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/pytorch_model.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..53f07f8d3e9b4b3da941a4648f77fe1b33e4afd3 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b13806436bde90f40badb60acfb6ae15d876df2742640232e36e8b5d07fab9b0 +size 7341306 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/quantization.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..cb95bfe82b203ff6a2aa962326d2c7a438d6a52f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/quantization.py @@ -0,0 
+1,188 @@ +from torch.nn import Linear +from torch.nn.parameter import Parameter + +import bz2 +import torch +import base64 +import ctypes +from transformers.utils import logging + +from typing import List +from functools import partial + +logger = logging.get_logger(__name__) + +try: + from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up + + class Kernel: + def __init__(self, code: bytes, function_names: List[str]): + self.code = code + self._function_names = function_names + self._cmodule = LazyKernelCModule(self.code) + + for name in self._function_names: + setattr(self, name, KernelFunction(self._cmodule, name)) + + quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl
4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzf
ZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ" + + kernels = Kernel( + bz2.decompress(base64.b64decode(quantization_code)), + [ + "int4WeightCompression", + "int4WeightExtractionFloat", + "int4WeightExtractionHalf", + "int8WeightExtractionFloat", + "int8WeightExtractionHalf", + ], + ) +except Exception as exception: + kernels = None + logger.warning("Failed to load cpm_kernels:" + str(exception)) + + +class W8A16Linear(torch.autograd.Function): + @staticmethod + def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): + ctx.inp_shape = inp.size() + ctx.weight_bit_width = weight_bit_width + out_features = quant_w.size(0) + inp = inp.contiguous().view(-1, inp.size(-1)) + weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) + ctx.weight_shape = weight.size() + output = inp.mm(weight.t()) + ctx.save_for_backward(inp, quant_w, scale_w) + return output.view(*(ctx.inp_shape[:-1] + (out_features,))) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + inp, quant_w, scale_w = ctx.saved_tensors + weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) + grad_output = grad_output.contiguous().view(-1, weight.size(0)) + grad_input = grad_output.mm(weight) + grad_weight = grad_output.t().mm(inp) + return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None + + +def compress_int4_weight(weight: torch.Tensor): # (n, m) + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + assert m % 2 == 0 + m = m // 2 + out = torch.empty(n, m, dtype=torch.int8, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + kernels.int4WeightCompression( + gridDim, + blockDim, + 0, + stream, + [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], + ) + return out + + +def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): + assert scale_list.dtype in [torch.half, torch.bfloat16] + assert weight.dtype in [torch.int8] + if source_bit_width == 8: + return weight.to(scale_list.dtype) * scale_list[:, None] + elif source_bit_width == 4: + func = ( + kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16 + ) + else: + assert False, "Unsupported bit-width" + + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = 
(min(round_up(m, 32), 1024), 1, 1) + + func( + gridDim, + blockDim, + 0, + stream, + [ + ctypes.c_void_p(weight.data_ptr()), + ctypes.c_void_p(scale_list.data_ptr()), + ctypes.c_void_p(out.data_ptr()), + ctypes.c_int32(n), + ctypes.c_int32(m), + ], + ) + return out + + +class QuantizedLinear(torch.nn.Module): + def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args, + **kwargs): + super().__init__() + self.weight_bit_width = weight_bit_width + + # NOTE: `weight` must be a real tensor even for empty_init, since its shape + # defines the int8 buffer layout; the former "weight is None" guard was + # unreachable because reading weight.shape here would already have raised. + shape = weight.shape + + if empty_init: + self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device) + self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device) + else: + # Symmetric per-output-row quantization: scale each row by its abs-max + # over the signed integer range, then round to int8. + self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1) + self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) + if weight_bit_width == 4: + self.weight = compress_int4_weight(self.weight) + + self.weight = Parameter(self.weight.to(device), requires_grad=False) + self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False) + self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None + + def forward(self, input): + output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width) + if self.bias is not None: + output = output + self.bias + return output + + +def quantize(model, weight_bit_width, empty_init=False, device=None): + """Replace fp16 linear with quantized linear""" + for layer in model.layers: + layer.self_attention.query_key_value = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()), + bias=layer.self_attention.query_key_value.bias, + dtype=layer.self_attention.query_key_value.weight.dtype, + device=layer.self_attention.query_key_value.weight.device if device is None else device, + empty_init=empty_init + ) + layer.self_attention.dense = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()), + bias=layer.self_attention.dense.bias, + dtype=layer.self_attention.dense.weight.dtype, + device=layer.self_attention.dense.weight.device if device is None else device, + empty_init=empty_init + ) + layer.mlp.dense_h_to_4h = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()), + bias=layer.mlp.dense_h_to_4h.bias, + dtype=layer.mlp.dense_h_to_4h.weight.dtype, + device=layer.mlp.dense_h_to_4h.weight.device if device is None else device, + empty_init=empty_init + ) + layer.mlp.dense_4h_to_h = QuantizedLinear( + weight_bit_width=weight_bit_width, + weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()), + bias=layer.mlp.dense_4h_to_h.bias, + dtype=layer.mlp.dense_4h_to_h.weight.dtype, + device=layer.mlp.dense_4h_to_h.weight.device if device is None else device, + empty_init=empty_init + ) + + return model diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/rng_state.pth b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8d79580df3289aeba2c49bc7ba0545698f615dc9 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9a63e97433d3fef7b3a60f533854fd6ddf541e22f57319249a84c8c03349901 +size 
14244 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/scheduler.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..84975cb4556df93d442cc53ad9a0422ccb68dfa3 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03807c669b0036b32d733f45cd6dd9532812cb5c07571d10756d30ca0c75581c +size 1064 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/special_tokens_map.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenization_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += 
self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) to an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) to a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "<bos>": self.tokenizer.bos_id, + "<eos>": self.tokenizer.eos_id, + "<pad>": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "<unk>" + + @property + def pad_token(self) -> str: + return "<unk>" + + @property + def pad_token_id(self): + return self.get_command("<pad>") + + @property + def eos_token(self) -> str: + return "</s>" + + @property + def eos_token_id(self): + return self.get_command("<eos>") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) to an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) to a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the names of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved.
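+ + Example (illustrative, not part of the original docstring; "./export" is + a hypothetical directory): + + tokenizer.save_vocabulary("./export") # copies tokenizer.model verbatim + # -> ("./export/tokenizer.model",)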
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer.model b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5faafd38f2e2934804feb0e7d71ebf08b0839bf5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer_config.json @@ -0,0 +1,18 @@ +{ + "added_tokens_decoder": {}, + "additional_special_tokens": [], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/trainer_state.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ce58b9e749f072c05c203cd33107a6fd6b1ef57c --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/trainer_state.json @@ -0,0 +1,1219 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 18.823529411764707, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "learning_rate": 0.009985714285714285, + "loss": 2.6971, + "step": 1 + }, + { + 
"epoch": 0.19, + "learning_rate": 0.009971428571428572, + "loss": 2.3927, + "step": 2 + }, + { + "epoch": 0.28, + "learning_rate": 0.009957142857142857, + "loss": 2.2539, + "step": 3 + }, + { + "epoch": 0.38, + "learning_rate": 0.009942857142857144, + "loss": 2.1408, + "step": 4 + }, + { + "epoch": 0.47, + "learning_rate": 0.009928571428571429, + "loss": 2.2672, + "step": 5 + }, + { + "epoch": 0.56, + "learning_rate": 0.009914285714285714, + "loss": 1.6433, + "step": 6 + }, + { + "epoch": 0.66, + "learning_rate": 0.0099, + "loss": 2.1405, + "step": 7 + }, + { + "epoch": 0.75, + "learning_rate": 0.009885714285714286, + "loss": 2.1464, + "step": 8 + }, + { + "epoch": 0.85, + "learning_rate": 0.009871428571428571, + "loss": 1.8498, + "step": 9 + }, + { + "epoch": 0.94, + "learning_rate": 0.009857142857142858, + "loss": 1.6896, + "step": 10 + }, + { + "epoch": 1.04, + "learning_rate": 0.009842857142857143, + "loss": 2.1932, + "step": 11 + }, + { + "epoch": 1.13, + "learning_rate": 0.00982857142857143, + "loss": 1.8236, + "step": 12 + }, + { + "epoch": 1.22, + "learning_rate": 0.009814285714285715, + "loss": 1.735, + "step": 13 + }, + { + "epoch": 1.32, + "learning_rate": 0.0098, + "loss": 1.7488, + "step": 14 + }, + { + "epoch": 1.41, + "learning_rate": 0.009785714285714285, + "loss": 1.8336, + "step": 15 + }, + { + "epoch": 1.51, + "learning_rate": 0.009771428571428572, + "loss": 1.9438, + "step": 16 + }, + { + "epoch": 1.6, + "learning_rate": 0.009757142857142858, + "loss": 1.7178, + "step": 17 + }, + { + "epoch": 1.69, + "learning_rate": 0.009742857142857143, + "loss": 1.5714, + "step": 18 + }, + { + "epoch": 1.79, + "learning_rate": 0.009728571428571428, + "loss": 1.537, + "step": 19 + }, + { + "epoch": 1.88, + "learning_rate": 0.009714285714285715, + "loss": 1.6764, + "step": 20 + }, + { + "epoch": 1.98, + "learning_rate": 0.0097, + "loss": 1.8919, + "step": 21 + }, + { + "epoch": 2.07, + "learning_rate": 0.009685714285714285, + "loss": 1.346, + "step": 22 + }, + { + "epoch": 2.16, + "learning_rate": 0.009671428571428572, + "loss": 1.5036, + "step": 23 + }, + { + "epoch": 2.26, + "learning_rate": 0.009657142857142857, + "loss": 1.6788, + "step": 24 + }, + { + "epoch": 2.35, + "learning_rate": 0.009642857142857144, + "loss": 1.6667, + "step": 25 + }, + { + "epoch": 2.45, + "learning_rate": 0.009628571428571429, + "loss": 1.7153, + "step": 26 + }, + { + "epoch": 2.54, + "learning_rate": 0.009614285714285714, + "loss": 1.601, + "step": 27 + }, + { + "epoch": 2.64, + "learning_rate": 0.0096, + "loss": 1.3002, + "step": 28 + }, + { + "epoch": 2.73, + "learning_rate": 0.009585714285714286, + "loss": 1.3294, + "step": 29 + }, + { + "epoch": 2.82, + "learning_rate": 0.009571428571428573, + "loss": 1.7477, + "step": 30 + }, + { + "epoch": 2.92, + "learning_rate": 0.009557142857142858, + "loss": 1.7961, + "step": 31 + }, + { + "epoch": 3.01, + "learning_rate": 0.009542857142857143, + "loss": 1.4954, + "step": 32 + }, + { + "epoch": 3.11, + "learning_rate": 0.009528571428571428, + "loss": 1.6452, + "step": 33 + }, + { + "epoch": 3.2, + "learning_rate": 0.009514285714285715, + "loss": 1.3528, + "step": 34 + }, + { + "epoch": 3.29, + "learning_rate": 0.0095, + "loss": 1.4811, + "step": 35 + }, + { + "epoch": 3.39, + "learning_rate": 0.009485714285714287, + "loss": 1.4738, + "step": 36 + }, + { + "epoch": 3.48, + "learning_rate": 0.009471428571428572, + "loss": 1.174, + "step": 37 + }, + { + "epoch": 3.58, + "learning_rate": 0.009457142857142857, + "loss": 1.2346, + "step": 38 + }, + { + "epoch": 3.67, 
+ "learning_rate": 0.009442857142857143, + "loss": 1.5327, + "step": 39 + }, + { + "epoch": 3.76, + "learning_rate": 0.009428571428571429, + "loss": 1.5249, + "step": 40 + }, + { + "epoch": 3.86, + "learning_rate": 0.009414285714285714, + "loss": 1.5086, + "step": 41 + }, + { + "epoch": 3.95, + "learning_rate": 0.0094, + "loss": 1.8425, + "step": 42 + }, + { + "epoch": 4.05, + "learning_rate": 0.009385714285714287, + "loss": 1.1943, + "step": 43 + }, + { + "epoch": 4.14, + "learning_rate": 0.009371428571428572, + "loss": 1.6835, + "step": 44 + }, + { + "epoch": 4.24, + "learning_rate": 0.009357142857142857, + "loss": 1.75, + "step": 45 + }, + { + "epoch": 4.33, + "learning_rate": 0.009342857142857142, + "loss": 1.2561, + "step": 46 + }, + { + "epoch": 4.42, + "learning_rate": 0.009328571428571429, + "loss": 1.3784, + "step": 47 + }, + { + "epoch": 4.52, + "learning_rate": 0.009314285714285714, + "loss": 1.2538, + "step": 48 + }, + { + "epoch": 4.61, + "learning_rate": 0.009300000000000001, + "loss": 1.4429, + "step": 49 + }, + { + "epoch": 4.71, + "learning_rate": 0.009285714285714286, + "loss": 1.3687, + "step": 50 + }, + { + "epoch": 4.8, + "learning_rate": 0.009271428571428571, + "loss": 1.1511, + "step": 51 + }, + { + "epoch": 4.89, + "learning_rate": 0.009257142857142858, + "loss": 1.181, + "step": 52 + }, + { + "epoch": 4.99, + "learning_rate": 0.009242857142857143, + "loss": 1.1753, + "step": 53 + }, + { + "epoch": 5.08, + "learning_rate": 0.009228571428571428, + "loss": 1.1562, + "step": 54 + }, + { + "epoch": 5.18, + "learning_rate": 0.009214285714285715, + "loss": 1.2936, + "step": 55 + }, + { + "epoch": 5.27, + "learning_rate": 0.0092, + "loss": 1.3591, + "step": 56 + }, + { + "epoch": 5.36, + "learning_rate": 0.009185714285714287, + "loss": 1.1376, + "step": 57 + }, + { + "epoch": 5.46, + "learning_rate": 0.009171428571428572, + "loss": 1.372, + "step": 58 + }, + { + "epoch": 5.55, + "learning_rate": 0.009157142857142857, + "loss": 1.5141, + "step": 59 + }, + { + "epoch": 5.65, + "learning_rate": 0.009142857142857144, + "loss": 1.2087, + "step": 60 + }, + { + "epoch": 5.74, + "learning_rate": 0.009128571428571429, + "loss": 1.136, + "step": 61 + }, + { + "epoch": 5.84, + "learning_rate": 0.009114285714285715, + "loss": 1.2948, + "step": 62 + }, + { + "epoch": 5.93, + "learning_rate": 0.0091, + "loss": 1.0592, + "step": 63 + }, + { + "epoch": 6.02, + "learning_rate": 0.009085714285714286, + "loss": 1.2321, + "step": 64 + }, + { + "epoch": 6.12, + "learning_rate": 0.009071428571428572, + "loss": 1.0827, + "step": 65 + }, + { + "epoch": 6.21, + "learning_rate": 0.009057142857142857, + "loss": 1.1136, + "step": 66 + }, + { + "epoch": 6.31, + "learning_rate": 0.009042857142857142, + "loss": 1.475, + "step": 67 + }, + { + "epoch": 6.4, + "learning_rate": 0.009028571428571427, + "loss": 1.1316, + "step": 68 + }, + { + "epoch": 6.49, + "learning_rate": 0.009014285714285714, + "loss": 1.1688, + "step": 69 + }, + { + "epoch": 6.59, + "learning_rate": 0.009000000000000001, + "loss": 1.0882, + "step": 70 + }, + { + "epoch": 6.68, + "learning_rate": 0.008985714285714286, + "loss": 1.1085, + "step": 71 + }, + { + "epoch": 6.78, + "learning_rate": 0.008971428571428571, + "loss": 1.2029, + "step": 72 + }, + { + "epoch": 6.87, + "learning_rate": 0.008957142857142856, + "loss": 1.098, + "step": 73 + }, + { + "epoch": 6.96, + "learning_rate": 0.008942857142857143, + "loss": 1.219, + "step": 74 + }, + { + "epoch": 7.06, + "learning_rate": 0.00892857142857143, + "loss": 1.0092, + "step": 75 + }, + 
{ + "epoch": 7.15, + "learning_rate": 0.008914285714285715, + "loss": 1.0112, + "step": 76 + }, + { + "epoch": 7.25, + "learning_rate": 0.0089, + "loss": 1.1481, + "step": 77 + }, + { + "epoch": 7.34, + "learning_rate": 0.008885714285714287, + "loss": 0.9873, + "step": 78 + }, + { + "epoch": 7.44, + "learning_rate": 0.008871428571428572, + "loss": 1.0586, + "step": 79 + }, + { + "epoch": 7.53, + "learning_rate": 0.008857142857142857, + "loss": 1.1177, + "step": 80 + }, + { + "epoch": 7.62, + "learning_rate": 0.008842857142857142, + "loss": 0.7814, + "step": 81 + }, + { + "epoch": 7.72, + "learning_rate": 0.008828571428571429, + "loss": 1.2043, + "step": 82 + }, + { + "epoch": 7.81, + "learning_rate": 0.008814285714285715, + "loss": 1.0062, + "step": 83 + }, + { + "epoch": 7.91, + "learning_rate": 0.0088, + "loss": 1.0831, + "step": 84 + }, + { + "epoch": 8.0, + "learning_rate": 0.008785714285714286, + "loss": 0.9554, + "step": 85 + }, + { + "epoch": 8.09, + "learning_rate": 0.00877142857142857, + "loss": 1.1674, + "step": 86 + }, + { + "epoch": 8.19, + "learning_rate": 0.008757142857142857, + "loss": 0.8226, + "step": 87 + }, + { + "epoch": 8.28, + "learning_rate": 0.008742857142857144, + "loss": 0.9166, + "step": 88 + }, + { + "epoch": 8.38, + "learning_rate": 0.00872857142857143, + "loss": 0.734, + "step": 89 + }, + { + "epoch": 8.47, + "learning_rate": 0.008714285714285714, + "loss": 0.8641, + "step": 90 + }, + { + "epoch": 8.56, + "learning_rate": 0.0087, + "loss": 0.9517, + "step": 91 + }, + { + "epoch": 8.66, + "learning_rate": 0.008685714285714286, + "loss": 0.9995, + "step": 92 + }, + { + "epoch": 8.75, + "learning_rate": 0.008671428571428571, + "loss": 0.763, + "step": 93 + }, + { + "epoch": 8.85, + "learning_rate": 0.008657142857142858, + "loss": 1.0712, + "step": 94 + }, + { + "epoch": 8.94, + "learning_rate": 0.008642857142857143, + "loss": 1.1111, + "step": 95 + }, + { + "epoch": 9.04, + "learning_rate": 0.008628571428571428, + "loss": 0.9626, + "step": 96 + }, + { + "epoch": 9.13, + "learning_rate": 0.008614285714285715, + "loss": 0.6385, + "step": 97 + }, + { + "epoch": 9.22, + "learning_rate": 0.0086, + "loss": 0.8147, + "step": 98 + }, + { + "epoch": 9.32, + "learning_rate": 0.008585714285714285, + "loss": 0.8109, + "step": 99 + }, + { + "epoch": 9.41, + "learning_rate": 0.008571428571428572, + "loss": 1.0953, + "step": 100 + }, + { + "epoch": 9.51, + "learning_rate": 0.008557142857142859, + "loss": 0.7104, + "step": 101 + }, + { + "epoch": 9.6, + "learning_rate": 0.008542857142857144, + "loss": 0.9672, + "step": 102 + }, + { + "epoch": 9.69, + "learning_rate": 0.008528571428571429, + "loss": 0.7593, + "step": 103 + }, + { + "epoch": 9.79, + "learning_rate": 0.008514285714285714, + "loss": 1.0186, + "step": 104 + }, + { + "epoch": 9.88, + "learning_rate": 0.0085, + "loss": 0.7898, + "step": 105 + }, + { + "epoch": 9.98, + "learning_rate": 0.008485714285714286, + "loss": 0.7392, + "step": 106 + }, + { + "epoch": 10.07, + "learning_rate": 0.008471428571428572, + "loss": 0.7295, + "step": 107 + }, + { + "epoch": 10.16, + "learning_rate": 0.008457142857142858, + "loss": 0.7211, + "step": 108 + }, + { + "epoch": 10.26, + "learning_rate": 0.008442857142857143, + "loss": 0.769, + "step": 109 + }, + { + "epoch": 10.35, + "learning_rate": 0.00842857142857143, + "loss": 0.718, + "step": 110 + }, + { + "epoch": 10.45, + "learning_rate": 0.008414285714285714, + "loss": 0.6411, + "step": 111 + }, + { + "epoch": 10.54, + "learning_rate": 0.0084, + "loss": 0.8016, + "step": 112 + }, + { 
+ "epoch": 10.64, + "learning_rate": 0.008385714285714286, + "loss": 0.6633, + "step": 113 + }, + { + "epoch": 10.73, + "learning_rate": 0.008371428571428571, + "loss": 0.7257, + "step": 114 + }, + { + "epoch": 10.82, + "learning_rate": 0.008357142857142858, + "loss": 0.7785, + "step": 115 + }, + { + "epoch": 10.92, + "learning_rate": 0.008342857142857143, + "loss": 0.8927, + "step": 116 + }, + { + "epoch": 11.01, + "learning_rate": 0.008328571428571428, + "loss": 0.7242, + "step": 117 + }, + { + "epoch": 11.11, + "learning_rate": 0.008314285714285715, + "loss": 0.8297, + "step": 118 + }, + { + "epoch": 11.2, + "learning_rate": 0.0083, + "loss": 0.6761, + "step": 119 + }, + { + "epoch": 11.29, + "learning_rate": 0.008285714285714287, + "loss": 0.6699, + "step": 120 + }, + { + "epoch": 11.39, + "learning_rate": 0.008271428571428572, + "loss": 0.5365, + "step": 121 + }, + { + "epoch": 11.48, + "learning_rate": 0.008257142857142857, + "loss": 0.9045, + "step": 122 + }, + { + "epoch": 11.58, + "learning_rate": 0.008242857142857144, + "loss": 0.5071, + "step": 123 + }, + { + "epoch": 11.67, + "learning_rate": 0.008228571428571429, + "loss": 0.6472, + "step": 124 + }, + { + "epoch": 11.76, + "learning_rate": 0.008214285714285714, + "loss": 0.6232, + "step": 125 + }, + { + "epoch": 11.86, + "learning_rate": 0.008199999999999999, + "loss": 0.4905, + "step": 126 + }, + { + "epoch": 11.95, + "learning_rate": 0.008185714285714286, + "loss": 0.557, + "step": 127 + }, + { + "epoch": 12.05, + "learning_rate": 0.008171428571428573, + "loss": 0.5517, + "step": 128 + }, + { + "epoch": 12.14, + "learning_rate": 0.008157142857142858, + "loss": 0.6321, + "step": 129 + }, + { + "epoch": 12.24, + "learning_rate": 0.008142857142857143, + "loss": 0.6619, + "step": 130 + }, + { + "epoch": 12.33, + "learning_rate": 0.008128571428571428, + "loss": 0.5524, + "step": 131 + }, + { + "epoch": 12.42, + "learning_rate": 0.008114285714285715, + "loss": 0.4688, + "step": 132 + }, + { + "epoch": 12.52, + "learning_rate": 0.008100000000000001, + "loss": 0.3717, + "step": 133 + }, + { + "epoch": 12.61, + "learning_rate": 0.008085714285714286, + "loss": 0.5118, + "step": 134 + }, + { + "epoch": 12.71, + "learning_rate": 0.008071428571428571, + "loss": 0.4521, + "step": 135 + }, + { + "epoch": 12.8, + "learning_rate": 0.008057142857142856, + "loss": 0.5865, + "step": 136 + }, + { + "epoch": 12.89, + "learning_rate": 0.008042857142857143, + "loss": 0.5977, + "step": 137 + }, + { + "epoch": 12.99, + "learning_rate": 0.008028571428571428, + "loss": 0.6977, + "step": 138 + }, + { + "epoch": 13.08, + "learning_rate": 0.008014285714285713, + "loss": 0.5625, + "step": 139 + }, + { + "epoch": 13.18, + "learning_rate": 0.008, + "loss": 0.3611, + "step": 140 + }, + { + "epoch": 13.27, + "learning_rate": 0.007985714285714287, + "loss": 0.5168, + "step": 141 + }, + { + "epoch": 13.36, + "learning_rate": 0.007971428571428572, + "loss": 0.4429, + "step": 142 + }, + { + "epoch": 13.46, + "learning_rate": 0.007957142857142857, + "loss": 0.4998, + "step": 143 + }, + { + "epoch": 13.55, + "learning_rate": 0.007942857142857142, + "loss": 0.4437, + "step": 144 + }, + { + "epoch": 13.65, + "learning_rate": 0.007928571428571429, + "loss": 0.4958, + "step": 145 + }, + { + "epoch": 13.74, + "learning_rate": 0.007914285714285716, + "loss": 0.4021, + "step": 146 + }, + { + "epoch": 13.84, + "learning_rate": 0.0079, + "loss": 0.6163, + "step": 147 + }, + { + "epoch": 13.93, + "learning_rate": 0.007885714285714286, + "loss": 0.406, + "step": 148 + }, + { + 
"epoch": 14.02, + "learning_rate": 0.007871428571428571, + "loss": 0.4905, + "step": 149 + }, + { + "epoch": 14.12, + "learning_rate": 0.007857142857142858, + "loss": 0.3824, + "step": 150 + }, + { + "epoch": 14.21, + "learning_rate": 0.007842857142857143, + "loss": 0.3591, + "step": 151 + }, + { + "epoch": 14.31, + "learning_rate": 0.007828571428571428, + "loss": 0.342, + "step": 152 + }, + { + "epoch": 14.4, + "learning_rate": 0.007814285714285715, + "loss": 0.4565, + "step": 153 + }, + { + "epoch": 14.49, + "learning_rate": 0.0078000000000000005, + "loss": 0.3287, + "step": 154 + }, + { + "epoch": 14.59, + "learning_rate": 0.007785714285714286, + "loss": 0.4179, + "step": 155 + }, + { + "epoch": 14.68, + "learning_rate": 0.0077714285714285715, + "loss": 0.3586, + "step": 156 + }, + { + "epoch": 14.78, + "learning_rate": 0.007757142857142857, + "loss": 0.4618, + "step": 157 + }, + { + "epoch": 14.87, + "learning_rate": 0.0077428571428571425, + "loss": 0.4133, + "step": 158 + }, + { + "epoch": 14.96, + "learning_rate": 0.007728571428571429, + "loss": 0.4326, + "step": 159 + }, + { + "epoch": 15.06, + "learning_rate": 0.007714285714285715, + "loss": 0.3838, + "step": 160 + }, + { + "epoch": 15.15, + "learning_rate": 0.0077, + "loss": 0.2978, + "step": 161 + }, + { + "epoch": 15.25, + "learning_rate": 0.007685714285714286, + "loss": 0.3993, + "step": 162 + }, + { + "epoch": 15.34, + "learning_rate": 0.007671428571428571, + "loss": 0.3249, + "step": 163 + }, + { + "epoch": 15.44, + "learning_rate": 0.007657142857142857, + "loss": 0.2796, + "step": 164 + }, + { + "epoch": 15.53, + "learning_rate": 0.007642857142857142, + "loss": 0.3918, + "step": 165 + }, + { + "epoch": 15.62, + "learning_rate": 0.007628571428571429, + "loss": 0.4122, + "step": 166 + }, + { + "epoch": 15.72, + "learning_rate": 0.007614285714285715, + "loss": 0.3403, + "step": 167 + }, + { + "epoch": 15.81, + "learning_rate": 0.0076, + "loss": 0.3759, + "step": 168 + }, + { + "epoch": 15.91, + "learning_rate": 0.007585714285714286, + "loss": 0.3621, + "step": 169 + }, + { + "epoch": 16.0, + "learning_rate": 0.007571428571428571, + "loss": 0.2991, + "step": 170 + }, + { + "epoch": 16.09, + "learning_rate": 0.007557142857142857, + "loss": 0.3039, + "step": 171 + }, + { + "epoch": 16.19, + "learning_rate": 0.007542857142857144, + "loss": 0.4571, + "step": 172 + }, + { + "epoch": 16.28, + "learning_rate": 0.007528571428571429, + "loss": 0.2759, + "step": 173 + }, + { + "epoch": 16.38, + "learning_rate": 0.007514285714285715, + "loss": 0.2835, + "step": 174 + }, + { + "epoch": 16.47, + "learning_rate": 0.0075, + "loss": 0.3221, + "step": 175 + }, + { + "epoch": 16.56, + "learning_rate": 0.007485714285714286, + "loss": 0.3072, + "step": 176 + }, + { + "epoch": 16.66, + "learning_rate": 0.007471428571428572, + "loss": 0.2852, + "step": 177 + }, + { + "epoch": 16.75, + "learning_rate": 0.007457142857142857, + "loss": 0.2559, + "step": 178 + }, + { + "epoch": 16.85, + "learning_rate": 0.007442857142857143, + "loss": 0.2787, + "step": 179 + }, + { + "epoch": 16.94, + "learning_rate": 0.007428571428571429, + "loss": 0.3331, + "step": 180 + }, + { + "epoch": 17.04, + "learning_rate": 0.007414285714285714, + "loss": 0.1929, + "step": 181 + }, + { + "epoch": 17.13, + "learning_rate": 0.0074, + "loss": 0.2065, + "step": 182 + }, + { + "epoch": 17.22, + "learning_rate": 0.007385714285714285, + "loss": 0.2868, + "step": 183 + }, + { + "epoch": 17.32, + "learning_rate": 0.007371428571428571, + "loss": 0.2206, + "step": 184 + }, + { + "epoch": 
17.41, + "learning_rate": 0.007357142857142858, + "loss": 0.2355, + "step": 185 + }, + { + "epoch": 17.51, + "learning_rate": 0.007342857142857143, + "loss": 0.3041, + "step": 186 + }, + { + "epoch": 17.6, + "learning_rate": 0.007328571428571429, + "loss": 0.3028, + "step": 187 + }, + { + "epoch": 17.69, + "learning_rate": 0.007314285714285714, + "loss": 0.2435, + "step": 188 + }, + { + "epoch": 17.79, + "learning_rate": 0.0073, + "loss": 0.1869, + "step": 189 + }, + { + "epoch": 17.88, + "learning_rate": 0.007285714285714285, + "loss": 0.3036, + "step": 190 + }, + { + "epoch": 17.98, + "learning_rate": 0.007271428571428571, + "loss": 0.246, + "step": 191 + }, + { + "epoch": 18.07, + "learning_rate": 0.007257142857142858, + "loss": 0.2316, + "step": 192 + }, + { + "epoch": 18.16, + "learning_rate": 0.007242857142857143, + "loss": 0.186, + "step": 193 + }, + { + "epoch": 18.26, + "learning_rate": 0.007228571428571429, + "loss": 0.2616, + "step": 194 + }, + { + "epoch": 18.35, + "learning_rate": 0.007214285714285715, + "loss": 0.2824, + "step": 195 + }, + { + "epoch": 18.45, + "learning_rate": 0.0072, + "loss": 0.2, + "step": 196 + }, + { + "epoch": 18.54, + "learning_rate": 0.007185714285714286, + "loss": 0.1978, + "step": 197 + }, + { + "epoch": 18.64, + "learning_rate": 0.007171428571428572, + "loss": 0.1897, + "step": 198 + }, + { + "epoch": 18.73, + "learning_rate": 0.007157142857142858, + "loss": 0.1958, + "step": 199 + }, + { + "epoch": 18.82, + "learning_rate": 0.0071428571428571435, + "loss": 0.203, + "step": 200 + } + ], + "logging_steps": 1.0, + "max_steps": 700, + "num_train_epochs": 70, + "save_steps": 100, + "total_flos": 2.350348643598336e+17, + "trial_name": null, + "trial_params": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/training_args.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17f9bfbf1a7cdd9e0e808e0672d55ad9ad4efb5f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00669a32a6ddac0a3243bbc04d3f1f70ffc8f89f2626c1fdafa93ce68c311aa0 +size 4664 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..50d927dc68b4eaa40bd4812b7417b3f2bd61f599 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm3-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": 
"chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 8192, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/configuration_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/configuration_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..35600185f5a26951081de0f3a41a913eaf06af99 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/configuration_chatglm.py @@ -0,0 +1,61 @@ +from transformers import PretrainedConfig + + +class ChatGLMConfig(PretrainedConfig): + model_type = "chatglm" + def __init__( + self, + num_layers=28, + padded_vocab_size=65024, + hidden_size=4096, + ffn_hidden_size=13696, + kv_channels=128, + num_attention_heads=32, + seq_length=2048, + hidden_dropout=0.0, + classifier_dropout=None, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + rmsnorm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, + add_bias_linear=False, + add_qkv_bias=False, + bias_dropout_fusion=True, + multi_query_attention=False, + multi_query_group_num=1, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=True, + fp32_residual_connection=False, + quantization_bit=0, + pre_seq_len=None, + prefix_projection=False, + **kwargs + ): + self.num_layers = num_layers + self.vocab_size = padded_vocab_size + self.padded_vocab_size = padded_vocab_size + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.kv_channels = kv_channels + self.num_attention_heads = num_attention_heads + self.seq_length = seq_length + self.hidden_dropout = hidden_dropout + self.classifier_dropout = classifier_dropout + self.attention_dropout = attention_dropout + self.layernorm_epsilon = layernorm_epsilon + self.rmsnorm = rmsnorm + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.post_layer_norm = post_layer_norm + self.add_bias_linear = add_bias_linear + self.add_qkv_bias = add_qkv_bias + self.bias_dropout_fusion = bias_dropout_fusion + self.multi_query_attention = multi_query_attention + self.multi_query_group_num = multi_query_group_num + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.fp32_residual_connection = fp32_residual_connection + self.quantization_bit = quantization_bit + self.pre_seq_len = pre_seq_len + self.prefix_projection = prefix_projection + super().__init__(**kwargs) \ No newline at end of file diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/generation_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31d22ad9418a1daba6b2bbf472ac3762cd5ce643 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.34.0" +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/modeling_chatglm.py 
b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/modeling_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b5027587016090a377f25289284b6e4f829cb4 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/modeling_chatglm.py @@ -0,0 +1,1293 @@ +""" PyTorch ChatGLM model. """ + +import math +import copy +import warnings +import re +import sys + +import torch +import torch.utils.checkpoint +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss +from torch.nn.utils import skip_init +from typing import Optional, Tuple, Union, List, Callable, Dict, Any +from copy import deepcopy + +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.generation.logits_process import LogitsProcessor +from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput + +from .configuration_chatglm import ChatGLMConfig + +# flags required to enable jit fusion kernels + +if sys.platform != 'darwin': + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" +_CONFIG_FOR_DOC = "ChatGLMConfig" + +CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "THUDM/chatglm3-6b", + # See all ChatGLM models at https://huggingface.co/models?filter=chatglm +] + + +def default_init(cls, *args, **kwargs): + return cls(*args, **kwargs) + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class PrefixEncoder(torch.nn.Module): + """ + The torch.nn model to encode the prefix + Input shape: (batch-size, prefix-length) + Output shape: (batch-size, prefix-length, 2*layers*hidden) + """ + + def __init__(self, config: ChatGLMConfig): + super().__init__() + self.prefix_projection = config.prefix_projection + if self.prefix_projection: + # Use a two-layer MLP to encode the prefix + kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 + self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) + self.trans = torch.nn.Sequential( + torch.nn.Linear(kv_size, config.hidden_size), + torch.nn.Tanh(), + torch.nn.Linear(config.hidden_size, kv_size) + ) + else: + self.embedding = torch.nn.Embedding(config.pre_seq_len, + config.num_layers * config.kv_channels * config.multi_query_group_num * 2) + + def forward(self, prefix: torch.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.trans(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. 
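+ + Example (mirroring the non-multi-query path in SelfAttention.forward below, where the + fused [sq, b, np, 3 * hn] QKV projection is split into three [sq, b, np, hn] views): + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)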
+ + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, original_impl=False, device=None, dtype=None): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + + def forward_impl( + self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 + ): + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. + """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).float() + + cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if dtype in (torch.float16, torch.bfloat16, torch.int8): + cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl( + max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [sq, b, np, hn] + sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:sq] + xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) + rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +class RMSNorm(torch.nn.Module): + def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + return (self.weight * hidden_states).to(input_dtype) + + +class CoreAttention(torch.nn.Module): + def __init__(self, config: ChatGLMConfig, layer_number): + super(CoreAttention, self).__init__() + + self.apply_query_key_layer_scaling = 
config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_partition = projection_size + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + self.coeff = coeff + + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + pytorch_major_version = int(torch.__version__.split('.')[0]) + if pytorch_major_version >= 2: + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + is_causal=True) + else: + if attention_mask is not None: + attention_mask = ~attention_mask + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + attention_mask) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + else: + # Raw attention scores + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = torch.empty( + output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, + device=query_layer.device + ) + + # Raw attention scores. 
[b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + if self.attention_softmax_in_fp32: + attention_scores = attention_scores.float() + if self.coeff is not None: + attention_scores = attention_scores * self.coeff + if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: + attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], + device=attention_scores.device, dtype=torch.bool) + attention_mask.tril_() + attention_mask = ~attention_mask + if attention_mask is not None: + attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = attention_probs.type_as(value_layer) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class SelfAttention(torch.nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + + self.projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. 
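+ # With this checkpoint's config.json (hidden_size=4096, num_attention_heads=32, + # kv_channels=128, multi_query_group_num=2) each head is 4096 // 32 = 128 dims wide, + # so the fused QKV projection below has 4096 + 2 * 128 * 2 = 4608 output columns: + # full-width queries plus two shared key/value groups for multi-query attention.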
+ self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + self.multi_query_attention = config.multi_query_attention + self.qkv_hidden_size = 3 * self.projection_size + if self.multi_query_attention: + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.qkv_hidden_size = ( + self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num + ) + self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, + bias=config.add_bias_linear or config.add_qkv_bias, + device=device, **_config_to_kwargs(config) + ) + + self.core_attention = CoreAttention(config, self.layer_number) + + # Output. + self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, + device=device, **_config_to_kwargs(config) + ) + + def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): + if self.multi_query_attention: + num_attention_heads = self.num_multi_query_groups_per_partition + else: + num_attention_heads = self.num_attention_heads_per_partition + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=dtype, + device=device, + ) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True + ): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = torch.cat((cache_k, key_layer), dim=0) + value_layer = torch.cat((cache_v, value_layer), dim=0) + if use_cache: + kv_cache = (key_layer, value_layer) + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = 
key_layer.unsqueeze(-2) + key_layer = key_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.unsqueeze(-2) + value_layer = value_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def _config_to_kwargs(args): + common_kwargs = { + "dtype": args.torch_dtype, + } + return common_kwargs + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMConfig, device=None): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear( + config.hidden_size, + config.ffn_hidden_size * 2, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.activation_func = swiglu + + # Project back to h. + self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(torch.nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # Self attention. 
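+ # Block layout (see forward below): RMSNorm -> self-attention -> dropout + residual add, + # then RMSNorm -> SwiGLU MLP -> dropout + residual add. With the checkpoint's + # apply_residual_connection_post_layernorm=False this is a standard pre-LN block.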
+ self.self_attention = SelfAttention(config, layer_number, device=device) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # MLP + self.mlp = MLP(config, device=device) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, + attention_mask, + rotary_pos_emb, + kv_cache=kv_cache, + use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + + return output, kv_cache + + +class GLMTransformer(torch.nn.Module): + """Transformer class.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(GLMTransformer, self).__init__() + + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. + def build_layer(layer_number): + return GLMBlock(config, layer_number, device=device) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + self.gradient_checkpointing = False + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_layers)] + presents = () if use_cache else None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + for index in range(self.num_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + if self.gradient_checkpointing and self.training: + layer_ret = torch.utils.checkpoint.checkpoint( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches[index], + use_cache + ) + else: + layer_ret = layer( + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache + ) + hidden_states, kv_cache = layer_ret + if use_cache: + presents = presents + (kv_cache,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + + is_parallelizable = False + supports_gradient_checkpointing = True + config_class = ChatGLMConfig + base_model_prefix = "transformer" + _no_split_modules = ["GLMBlock"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + return + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) + full_attention_mask.tril_() + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, + device=input_ids.device), full_attention_mask), dim=-1) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) + if not past_length and padding_mask is not None: + full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + def get_position_ids(self, input_ids, device): + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + return position_ids + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GLMTransformer): + module.gradient_checkpointing = value + + +class Embedding(torch.nn.Module): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + # Word embeddings (parallel). + self.word_embeddings = nn.Embedding( + config.padded_vocab_size, + self.hidden_size, + dtype=config.torch_dtype, + device=device + ) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + # If the input flag for fp32 residual connection is set, convert for float. 
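+ # (fp32_residual_connection is false in this checkpoint's config.json; when enabled, + # the residual stream is kept in float32, which should limit accumulation error + # when training or running inference in fp16.)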
+ if self.fp32_residual_connection: + embeddings = embeddings.float() + return embeddings + + +class ChatGLMModel(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): + super().__init__(config) + if empty_init: + init_method = skip_init + else: + init_method = default_init + init_kwargs = {} + if device is not None: + init_kwargs["device"] = device + self.embedding = init_method(Embedding, config, **init_kwargs) + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + + # Rotary positional embeddings + self.seq_length = config.seq_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, + dtype=config.torch_dtype) + self.encoder = init_method(GLMTransformer, config, **init_kwargs) + self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, + dtype=config.torch_dtype, **init_kwargs) + self.pre_seq_len = config.pre_seq_len + self.prefix_projection = config.prefix_projection + if self.pre_seq_len is not None: + for param in self.parameters(): + param.requires_grad = False + self.prefix_tokens = torch.arange(self.pre_seq_len).long() + self.prefix_encoder = PrefixEncoder(config) + self.dropout = torch.nn.Dropout(0.1) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def get_prompt(self, batch_size, device, dtype=torch.half): + prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) + past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) + past_key_values = past_key_values.view( + batch_size, + self.pre_seq_len, + self.num_layers * 2, + self.multi_query_group_num, + self.kv_channels + ) + # seq_len, b, nh, hidden_size + past_key_values = self.dropout(past_key_values) + past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) + return past_key_values + + def forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if self.pre_seq_len is not None: + if past_key_values is None: + past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, + dtype=inputs_embeds.dtype) + if attention_mask is not None: + attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), + attention_mask], dim=-1) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary 
positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() + + # Run encoder. + hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def quantize(self, weight_bit_width: int): + from .quantization import quantize + quantize(self.encoder, weight_bit_width) + return self + + +class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.max_sequence_length = config.max_length + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + self.config = config + self.quantized = False + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat( + [position_ids, new_position_id], dim=-1 + ) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids, device=input_ids.device) + if not is_first_forward: + if past_key_values is not None: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": use_cache + } + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: 
Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + if return_last_logit: + hidden_states = hidden_states[-1:] + lm_logits = self.transformer.output_layer(hidden_states) + lm_logits = lm_logits.transpose(0, 1).contiguous() + + loss = None + if labels is not None: + lm_logits = lm_logits.to(torch.float32) + + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + lm_logits = lm_logits.to(hidden_states.dtype) + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache( + past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. 
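+ + ChatGLM caches each layer's keys/values as [seq_len, batch, heads, head_dim] + (sequence-first), so the beam index is applied along dim 1, the batch dimension, + rather than dim 0 as in batch-first implementations.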
+ """ + return tuple( + ( + layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), + layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), + ) + for layer_past in past + ) + + def process_response(self, output, history): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content, history + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + inputs = tokenizer.build_chat_input(query, history=history, role=role) + inputs = inputs.to(self.device) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + history.append({"role": role, "content": query}) + response, history = self.process_response(response, history) + return response, history + + @torch.inference_mode() + def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, + logits_processor=None, return_past_key_values=False, **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + if past_key_values is None: + inputs = tokenizer.build_chat_input(query, history=history, role=role) + else: + inputs = tokenizer.build_chat_input(query, role=role) + inputs = inputs.to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[0] + if self.transformer.pre_seq_len is not None: + past_length -= self.transformer.pre_seq_len + inputs.position_ids += past_length + attention_mask = inputs.attention_mask + attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) + inputs['attention_mask'] = attention_mask + history.append({"role": role, "content": query}) + for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **gen_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + if response and response[-1] != "�": + response, new_history = self.process_response(response, history) + if return_past_key_values: + yield response, new_history, past_key_values + else: + yield response, new_history + + @torch.inference_mode() + def stream_generate( + self, + input_ids, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values=False, + **kwargs, + ): + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + + if generation_config is None: + generation_config = self.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + model_kwargs["use_cache"] = generation_config.use_cache + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if has_default_max_length and generation_config.max_new_tokens is None: + warnings.warn( + f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " + "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + " recommend using `max_new_tokens` to control the maximum length of the generation.", + UserWarning, + ) + elif generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + if not has_default_max_length: + logger.warn( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", + UserWarning, + ) + + if input_ids_seq_length >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + + # 2. 
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + logits_warper = self._get_logits_warper(generation_config) + + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + while True: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break + + def quantize(self, bits: int, empty_init=False, device=None, **kwargs): + if bits == 0: + return + + from .quantization import quantize + + if self.quantized: + logger.info("Already quantized.") + return self + + self.quantized = True + + self.config.quantization_bit = bits + + self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, + **kwargs) + return self + + +class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.num_labels = config.num_labels + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + + self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) + if config.classifier_dropout is not None: + self.dropout = nn.Dropout(config.classifier_dropout) + else: + self.dropout = None + self.config = config + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: 
Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + pooled_hidden_states = hidden_states[-1] + if self.dropout is not None: + pooled_hidden_states = self.dropout(pooled_hidden_states) + logits = self.classifier_head(pooled_hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze().float(), labels.squeeze()) + else: + loss = loss_fct(logits.float(), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/optimizer.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..23cca78a3dc5181762a4f56f3a4526b43c3cbaea --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24c11e7f9276f0075ac4e17159b9565cfe78bdc2e5a89771088b58390ea40548 +size 14682210 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/pytorch_model.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..11f3421760601f92c3784e4a9cc53a327e7a6227 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac5a740f837528ecd48badb09ffa12ad042c01bcce3e6473d2332e85e841ecd4 +size 7341306 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/quantization.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..cb95bfe82b203ff6a2aa962326d2c7a438d6a52f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/quantization.py @@ -0,0 
+1,188 @@ +from torch.nn import Linear +from torch.nn.parameter import Parameter + +import bz2 +import torch +import base64 +import ctypes +from transformers.utils import logging + +from typing import List +from functools import partial + +logger = logging.get_logger(__name__) + +try: + from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up + + class Kernel: + def __init__(self, code: bytes, function_names: List[str]): + self.code = code + self._function_names = function_names + self._cmodule = LazyKernelCModule(self.code) + + for name in self._function_names: + setattr(self, name, KernelFunction(self._cmodule, name)) + + quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl
4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzf
ZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ" + + kernels = Kernel( + bz2.decompress(base64.b64decode(quantization_code)), + [ + "int4WeightCompression", + "int4WeightExtractionFloat", + "int4WeightExtractionHalf", + "int8WeightExtractionFloat", + "int8WeightExtractionHalf", + ], + ) +except Exception as exception: + kernels = None + logger.warning("Failed to load cpm_kernels:" + str(exception)) + + +class W8A16Linear(torch.autograd.Function): + @staticmethod + def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): + ctx.inp_shape = inp.size() + ctx.weight_bit_width = weight_bit_width + out_features = quant_w.size(0) + inp = inp.contiguous().view(-1, inp.size(-1)) + weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) + ctx.weight_shape = weight.size() + output = inp.mm(weight.t()) + ctx.save_for_backward(inp, quant_w, scale_w) + return output.view(*(ctx.inp_shape[:-1] + (out_features,))) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + inp, quant_w, scale_w = ctx.saved_tensors + weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) + grad_output = grad_output.contiguous().view(-1, weight.size(0)) + grad_input = grad_output.mm(weight) + grad_weight = grad_output.t().mm(inp) + return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None + + +def compress_int4_weight(weight: torch.Tensor): # (n, m) + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + assert m % 2 == 0 + m = m // 2 + out = torch.empty(n, m, dtype=torch.int8, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + kernels.int4WeightCompression( + gridDim, + blockDim, + 0, + stream, + [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], + ) + return out + + +def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): + assert scale_list.dtype in [torch.half, torch.bfloat16] + assert weight.dtype in [torch.int8] + if source_bit_width == 8: + return weight.to(scale_list.dtype) * scale_list[:, None] + elif source_bit_width == 4: + func = ( + kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16 + ) + else: + assert False, "Unsupported bit-width" + + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = 
(min(round_up(m, 32), 1024), 1, 1)
+
+        func(
+            gridDim,
+            blockDim,
+            0,
+            stream,
+            [
+                ctypes.c_void_p(weight.data_ptr()),
+                ctypes.c_void_p(scale_list.data_ptr()),
+                ctypes.c_void_p(out.data_ptr()),
+                ctypes.c_int32(n),
+                ctypes.c_int32(m),
+            ],
+        )
+    return out
+
+
+class QuantizedLinear(torch.nn.Module):
+    def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
+                 **kwargs):
+        super().__init__()
+        self.weight_bit_width = weight_bit_width
+
+        # `weight` must be a real tensor even for empty_init: its shape sizes the
+        # int8 buffers below, so it is read unconditionally.
+        shape = weight.shape
+
+        if empty_init:
+            self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
+            self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
+        else:
+            # Symmetric absmax quantization per output channel: the largest |w| in
+            # each row maps to the top of the signed integer range.
+            self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
+            self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
+            if weight_bit_width == 4:
+                self.weight = compress_int4_weight(self.weight)
+
+        self.weight = Parameter(self.weight.to(device), requires_grad=False)
+        self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
+        self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None
+
+    def forward(self, input):
+        output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
+        if self.bias is not None:
+            output = output + self.bias
+        return output
+
+
+def quantize(model, weight_bit_width, empty_init=False, device=None):
+    """Replace fp16 linear with quantized linear (requires CUDA for the packing kernels)"""
+    for layer in model.layers:
+        layer.self_attention.query_key_value = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
+            bias=layer.self_attention.query_key_value.bias,
+            dtype=layer.self_attention.query_key_value.weight.dtype,
+            device=layer.self_attention.query_key_value.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.self_attention.dense = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
+            bias=layer.self_attention.dense.bias,
+            dtype=layer.self_attention.dense.weight.dtype,
+            device=layer.self_attention.dense.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.mlp.dense_h_to_4h = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
+            bias=layer.mlp.dense_h_to_4h.bias,
+            dtype=layer.mlp.dense_h_to_4h.weight.dtype,
+            device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.mlp.dense_4h_to_h = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
+            bias=layer.mlp.dense_4h_to_h.bias,
+            dtype=layer.mlp.dense_4h_to_h.weight.dtype,
+            device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+
+    return model
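The int8 path above is plain symmetric absmax quantization per output channel; only the int4 packing and extraction rely on the embedded CUDA kernels. A minimal CPU-only sketch of the same round trip (illustrative only, not part of the checkpoint files):

```python
import torch

# Per-output-channel absmax scale, as in QuantizedLinear.__init__: the largest
# |w| in each row maps to the top of the signed 8-bit range (127).
weight = torch.randn(4, 8)
scale = weight.abs().max(dim=-1).values / ((2 ** (8 - 1)) - 1)
q = torch.round(weight / scale[:, None]).to(torch.int8)

# Dequantize, as extract_weight_to_half does for source_bit_width == 8.
w_hat = q.to(weight.dtype) * scale[:, None]

# Round-off error is bounded by half a quantization step per channel.
print((weight - w_hat).abs().max() <= scale.max() / 2)  # True
```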
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/rng_state.pth b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a4a75bee3df49fe6a8702b423e777413cdbb34ab
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b5c072ed3027d8e49ce6492bc0cac52571156d2630e130d487672c01375ed18
+size 14244
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/scheduler.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..55013a617a7007e3aa12cef92fa465f45a0777d4
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c8d3eda235544386868d7d3c8b8ed7efcab36f20814c55292344c7ad9f51c2c
+size 1064
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/special_tokens_map.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/special_tokens_map.json
@@ -0,0 +1 @@
+{}
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenization_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenization_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenization_chatglm.py
@@ -0,0 +1,300 @@
+import json
+import os
+import re
+from typing import List, Optional, Union, Dict
+from sentencepiece import SentencePieceProcessor
+from transformers import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+
+
+class SPTokenizer:
+    def __init__(self, model_path: str):
+        # reload tokenizer
+        assert os.path.isfile(model_path), model_path
+        self.sp_model = SentencePieceProcessor(model_file=model_path)
+
+        # BOS / EOS token IDs
+        self.n_words: int = self.sp_model.vocab_size()
+        self.bos_id: int = self.sp_model.bos_id()
+        self.eos_id: int = self.sp_model.eos_id()
+        self.pad_id: int = self.sp_model.unk_id()
+        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
+
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        self.special_tokens = {}
+        self.index_special_tokens = {}
+        for token in special_tokens:
+            self.special_tokens[token] = self.n_words
+            self.index_special_tokens[self.n_words] = token
+            self.n_words += 1
+        self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens])
+
+    def tokenize(self, s: str, encode_special_tokens=False):
+        if encode_special_tokens:
+            last_index = 0
+            t = []
+            for match in re.finditer(self.role_special_token_expression, s):
+                if last_index < match.start():
+                    t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
+                t.append(s[match.start():match.end()])
+                last_index = match.end()
+            if last_index < len(s):
+                t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
+            return t
+        else:
+            return self.sp_model.EncodeAsPieces(s)
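Since the role markers must survive as single tokens, `tokenize` splits on them before handing the remaining text to sentencepiece. A self-contained illustration of that splitting strategy (the sample string is made up; no tokenizer.model required):

```python
import re

role_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
role_expr = "|".join(re.escape(t) for t in role_tokens)

text = "<|user|> hello<|assistant|> hi there"
pieces, last = [], 0
for m in re.finditer(role_expr, text):
    if last < m.start():
        pieces.append(("sp", text[last:m.start()]))  # would go to EncodeAsPieces
    pieces.append(("special", m.group()))            # role marker kept atomic
    last = m.end()
if last < len(text):
    pieces.append(("sp", text[last:]))

print(pieces)
# [('special', '<|user|>'), ('sp', ' hello'), ('special', '<|assistant|>'), ('sp', ' hi there')]
```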
+    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
+        assert type(s) is str
+        t = self.sp_model.encode(s)
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+        return t
+
+    def decode(self, t: List[int]) -> str:
+        text, buffer = "", []
+        for token in t:
+            if token in self.index_special_tokens:
+                if buffer:
+                    text += self.sp_model.decode(buffer)
+                    buffer = []
+                text += self.index_special_tokens[token]
+            else:
+                buffer.append(token)
+        if buffer:
+            text += self.sp_model.decode(buffer)
+        return text
+
+    def decode_tokens(self, tokens: List[str]) -> str:
+        text = self.sp_model.DecodePieces(tokens)
+        return text
+
+    def convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        return self.sp_model.PieceToId(token)
+
+    def convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if index in self.index_special_tokens:
+            return self.index_special_tokens[index]
+        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
+            return ""
+        return self.sp_model.IdToPiece(index)
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "tokenizer.model"}
+
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
+                 **kwargs):
+        self.name = "GLMTokenizer"
+
+        self.vocab_file = vocab_file
+        self.tokenizer = SPTokenizer(vocab_file)
+        self.special_tokens = {
+            "<bos>": self.tokenizer.bos_id,
+            "<eos>": self.tokenizer.eos_id,
+            "<pad>": self.tokenizer.pad_id
+        }
+        self.encode_special_tokens = encode_special_tokens
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                         encode_special_tokens=encode_special_tokens,
+                         **kwargs)
+
+    def get_command(self, token):
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
+        return self.tokenizer.special_tokens[token]
+
+    @property
+    def unk_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token_id(self):
+        return self.get_command("<pad>")
+
+    @property
+    def eos_token(self) -> str:
+        return "</s>"
+
+    @property
+    def eos_token_id(self):
+        return self.get_command("<eos>")
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_words
+
+    def get_vocab(self):
+        """ Returns vocab as a dict """
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text, **kwargs):
+        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        return self.tokenizer.convert_token_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return self.tokenizer.decode_tokens(tokens)
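A quick sanity check of the special-token plumbing above — a minimal sketch, assuming a local checkpoint directory that ships this tokenization code together with tokenizer.model (the path below is a placeholder):

```python
from transformers import AutoTokenizer

# Placeholder path: any directory holding this code plus tokenizer.model works.
tokenizer = AutoTokenizer.from_pretrained("./chatglm3-6b", trust_remote_code=True)

# "<pad>"/"<eos>" resolve through the special_tokens dict above; role markers
# such as "<|assistant|>" fall through get_command() to SPTokenizer's extra ids.
print(tokenizer.pad_token, tokenizer.pad_token_id)   # "<unk>", id of "<pad>"
print(tokenizer.eos_token, tokenizer.eos_token_id)   # "</s>", id of "<eos>"
print(tokenizer.get_command("<|assistant|>"))        # id past the sentencepiece vocab
```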
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+    def _pad(
+            self,
+            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+            max_length: Optional[int] = None,
+            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+            pad_to_multiple_of: Optional[int] = None,
+            return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                - 'left': pads on the left of the sequences
+                - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # This tokenizer only supports left padding.
+        assert self.padding_side == "left"
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+        seq_length = len(required_input)
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * seq_length
+
+        if "position_ids" not in encoded_inputs:
+            encoded_inputs["position_ids"] = list(range(seq_length))
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+            if "position_ids" in encoded_inputs:
+                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
+            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+
+        return encoded_inputs
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer.model b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
+size 1018370
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5faafd38f2e2934804feb0e7d71ebf08b0839bf5
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer_config.json
@@ -0,0 +1,18 @@
+{
+  "added_tokens_decoder": {},
+  "additional_special_tokens": [],
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_chatglm.ChatGLMTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": false,
+  "do_lower_case": false,
+  "encode_special_tokens": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "padding_side": "left",
+  "remove_space": false,
+  "tokenizer_class": "ChatGLMTokenizer",
+  "tokenizer_file": null
+}
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/trainer_state.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4ac36a582c86f3f51f90527761f8bdd771ed73ef
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/trainer_state.json
@@ -0,0 +1,1819 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 28.235294117647058,
+  "eval_steps": 500,
+  "global_step": 300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09,
+      "learning_rate": 0.009985714285714285,
+      "loss": 2.6971,
+      "step": 1
+    },
+    {
"epoch": 0.19, + "learning_rate": 0.009971428571428572, + "loss": 2.3927, + "step": 2 + }, + { + "epoch": 0.28, + "learning_rate": 0.009957142857142857, + "loss": 2.2539, + "step": 3 + }, + { + "epoch": 0.38, + "learning_rate": 0.009942857142857144, + "loss": 2.1408, + "step": 4 + }, + { + "epoch": 0.47, + "learning_rate": 0.009928571428571429, + "loss": 2.2672, + "step": 5 + }, + { + "epoch": 0.56, + "learning_rate": 0.009914285714285714, + "loss": 1.6433, + "step": 6 + }, + { + "epoch": 0.66, + "learning_rate": 0.0099, + "loss": 2.1405, + "step": 7 + }, + { + "epoch": 0.75, + "learning_rate": 0.009885714285714286, + "loss": 2.1464, + "step": 8 + }, + { + "epoch": 0.85, + "learning_rate": 0.009871428571428571, + "loss": 1.8498, + "step": 9 + }, + { + "epoch": 0.94, + "learning_rate": 0.009857142857142858, + "loss": 1.6896, + "step": 10 + }, + { + "epoch": 1.04, + "learning_rate": 0.009842857142857143, + "loss": 2.1932, + "step": 11 + }, + { + "epoch": 1.13, + "learning_rate": 0.00982857142857143, + "loss": 1.8236, + "step": 12 + }, + { + "epoch": 1.22, + "learning_rate": 0.009814285714285715, + "loss": 1.735, + "step": 13 + }, + { + "epoch": 1.32, + "learning_rate": 0.0098, + "loss": 1.7488, + "step": 14 + }, + { + "epoch": 1.41, + "learning_rate": 0.009785714285714285, + "loss": 1.8336, + "step": 15 + }, + { + "epoch": 1.51, + "learning_rate": 0.009771428571428572, + "loss": 1.9438, + "step": 16 + }, + { + "epoch": 1.6, + "learning_rate": 0.009757142857142858, + "loss": 1.7178, + "step": 17 + }, + { + "epoch": 1.69, + "learning_rate": 0.009742857142857143, + "loss": 1.5714, + "step": 18 + }, + { + "epoch": 1.79, + "learning_rate": 0.009728571428571428, + "loss": 1.537, + "step": 19 + }, + { + "epoch": 1.88, + "learning_rate": 0.009714285714285715, + "loss": 1.6764, + "step": 20 + }, + { + "epoch": 1.98, + "learning_rate": 0.0097, + "loss": 1.8919, + "step": 21 + }, + { + "epoch": 2.07, + "learning_rate": 0.009685714285714285, + "loss": 1.346, + "step": 22 + }, + { + "epoch": 2.16, + "learning_rate": 0.009671428571428572, + "loss": 1.5036, + "step": 23 + }, + { + "epoch": 2.26, + "learning_rate": 0.009657142857142857, + "loss": 1.6788, + "step": 24 + }, + { + "epoch": 2.35, + "learning_rate": 0.009642857142857144, + "loss": 1.6667, + "step": 25 + }, + { + "epoch": 2.45, + "learning_rate": 0.009628571428571429, + "loss": 1.7153, + "step": 26 + }, + { + "epoch": 2.54, + "learning_rate": 0.009614285714285714, + "loss": 1.601, + "step": 27 + }, + { + "epoch": 2.64, + "learning_rate": 0.0096, + "loss": 1.3002, + "step": 28 + }, + { + "epoch": 2.73, + "learning_rate": 0.009585714285714286, + "loss": 1.3294, + "step": 29 + }, + { + "epoch": 2.82, + "learning_rate": 0.009571428571428573, + "loss": 1.7477, + "step": 30 + }, + { + "epoch": 2.92, + "learning_rate": 0.009557142857142858, + "loss": 1.7961, + "step": 31 + }, + { + "epoch": 3.01, + "learning_rate": 0.009542857142857143, + "loss": 1.4954, + "step": 32 + }, + { + "epoch": 3.11, + "learning_rate": 0.009528571428571428, + "loss": 1.6452, + "step": 33 + }, + { + "epoch": 3.2, + "learning_rate": 0.009514285714285715, + "loss": 1.3528, + "step": 34 + }, + { + "epoch": 3.29, + "learning_rate": 0.0095, + "loss": 1.4811, + "step": 35 + }, + { + "epoch": 3.39, + "learning_rate": 0.009485714285714287, + "loss": 1.4738, + "step": 36 + }, + { + "epoch": 3.48, + "learning_rate": 0.009471428571428572, + "loss": 1.174, + "step": 37 + }, + { + "epoch": 3.58, + "learning_rate": 0.009457142857142857, + "loss": 1.2346, + "step": 38 + }, + { + "epoch": 3.67, 
+ "learning_rate": 0.009442857142857143, + "loss": 1.5327, + "step": 39 + }, + { + "epoch": 3.76, + "learning_rate": 0.009428571428571429, + "loss": 1.5249, + "step": 40 + }, + { + "epoch": 3.86, + "learning_rate": 0.009414285714285714, + "loss": 1.5086, + "step": 41 + }, + { + "epoch": 3.95, + "learning_rate": 0.0094, + "loss": 1.8425, + "step": 42 + }, + { + "epoch": 4.05, + "learning_rate": 0.009385714285714287, + "loss": 1.1943, + "step": 43 + }, + { + "epoch": 4.14, + "learning_rate": 0.009371428571428572, + "loss": 1.6835, + "step": 44 + }, + { + "epoch": 4.24, + "learning_rate": 0.009357142857142857, + "loss": 1.75, + "step": 45 + }, + { + "epoch": 4.33, + "learning_rate": 0.009342857142857142, + "loss": 1.2561, + "step": 46 + }, + { + "epoch": 4.42, + "learning_rate": 0.009328571428571429, + "loss": 1.3784, + "step": 47 + }, + { + "epoch": 4.52, + "learning_rate": 0.009314285714285714, + "loss": 1.2538, + "step": 48 + }, + { + "epoch": 4.61, + "learning_rate": 0.009300000000000001, + "loss": 1.4429, + "step": 49 + }, + { + "epoch": 4.71, + "learning_rate": 0.009285714285714286, + "loss": 1.3687, + "step": 50 + }, + { + "epoch": 4.8, + "learning_rate": 0.009271428571428571, + "loss": 1.1511, + "step": 51 + }, + { + "epoch": 4.89, + "learning_rate": 0.009257142857142858, + "loss": 1.181, + "step": 52 + }, + { + "epoch": 4.99, + "learning_rate": 0.009242857142857143, + "loss": 1.1753, + "step": 53 + }, + { + "epoch": 5.08, + "learning_rate": 0.009228571428571428, + "loss": 1.1562, + "step": 54 + }, + { + "epoch": 5.18, + "learning_rate": 0.009214285714285715, + "loss": 1.2936, + "step": 55 + }, + { + "epoch": 5.27, + "learning_rate": 0.0092, + "loss": 1.3591, + "step": 56 + }, + { + "epoch": 5.36, + "learning_rate": 0.009185714285714287, + "loss": 1.1376, + "step": 57 + }, + { + "epoch": 5.46, + "learning_rate": 0.009171428571428572, + "loss": 1.372, + "step": 58 + }, + { + "epoch": 5.55, + "learning_rate": 0.009157142857142857, + "loss": 1.5141, + "step": 59 + }, + { + "epoch": 5.65, + "learning_rate": 0.009142857142857144, + "loss": 1.2087, + "step": 60 + }, + { + "epoch": 5.74, + "learning_rate": 0.009128571428571429, + "loss": 1.136, + "step": 61 + }, + { + "epoch": 5.84, + "learning_rate": 0.009114285714285715, + "loss": 1.2948, + "step": 62 + }, + { + "epoch": 5.93, + "learning_rate": 0.0091, + "loss": 1.0592, + "step": 63 + }, + { + "epoch": 6.02, + "learning_rate": 0.009085714285714286, + "loss": 1.2321, + "step": 64 + }, + { + "epoch": 6.12, + "learning_rate": 0.009071428571428572, + "loss": 1.0827, + "step": 65 + }, + { + "epoch": 6.21, + "learning_rate": 0.009057142857142857, + "loss": 1.1136, + "step": 66 + }, + { + "epoch": 6.31, + "learning_rate": 0.009042857142857142, + "loss": 1.475, + "step": 67 + }, + { + "epoch": 6.4, + "learning_rate": 0.009028571428571427, + "loss": 1.1316, + "step": 68 + }, + { + "epoch": 6.49, + "learning_rate": 0.009014285714285714, + "loss": 1.1688, + "step": 69 + }, + { + "epoch": 6.59, + "learning_rate": 0.009000000000000001, + "loss": 1.0882, + "step": 70 + }, + { + "epoch": 6.68, + "learning_rate": 0.008985714285714286, + "loss": 1.1085, + "step": 71 + }, + { + "epoch": 6.78, + "learning_rate": 0.008971428571428571, + "loss": 1.2029, + "step": 72 + }, + { + "epoch": 6.87, + "learning_rate": 0.008957142857142856, + "loss": 1.098, + "step": 73 + }, + { + "epoch": 6.96, + "learning_rate": 0.008942857142857143, + "loss": 1.219, + "step": 74 + }, + { + "epoch": 7.06, + "learning_rate": 0.00892857142857143, + "loss": 1.0092, + "step": 75 + }, + 
{ + "epoch": 7.15, + "learning_rate": 0.008914285714285715, + "loss": 1.0112, + "step": 76 + }, + { + "epoch": 7.25, + "learning_rate": 0.0089, + "loss": 1.1481, + "step": 77 + }, + { + "epoch": 7.34, + "learning_rate": 0.008885714285714287, + "loss": 0.9873, + "step": 78 + }, + { + "epoch": 7.44, + "learning_rate": 0.008871428571428572, + "loss": 1.0586, + "step": 79 + }, + { + "epoch": 7.53, + "learning_rate": 0.008857142857142857, + "loss": 1.1177, + "step": 80 + }, + { + "epoch": 7.62, + "learning_rate": 0.008842857142857142, + "loss": 0.7814, + "step": 81 + }, + { + "epoch": 7.72, + "learning_rate": 0.008828571428571429, + "loss": 1.2043, + "step": 82 + }, + { + "epoch": 7.81, + "learning_rate": 0.008814285714285715, + "loss": 1.0062, + "step": 83 + }, + { + "epoch": 7.91, + "learning_rate": 0.0088, + "loss": 1.0831, + "step": 84 + }, + { + "epoch": 8.0, + "learning_rate": 0.008785714285714286, + "loss": 0.9554, + "step": 85 + }, + { + "epoch": 8.09, + "learning_rate": 0.00877142857142857, + "loss": 1.1674, + "step": 86 + }, + { + "epoch": 8.19, + "learning_rate": 0.008757142857142857, + "loss": 0.8226, + "step": 87 + }, + { + "epoch": 8.28, + "learning_rate": 0.008742857142857144, + "loss": 0.9166, + "step": 88 + }, + { + "epoch": 8.38, + "learning_rate": 0.00872857142857143, + "loss": 0.734, + "step": 89 + }, + { + "epoch": 8.47, + "learning_rate": 0.008714285714285714, + "loss": 0.8641, + "step": 90 + }, + { + "epoch": 8.56, + "learning_rate": 0.0087, + "loss": 0.9517, + "step": 91 + }, + { + "epoch": 8.66, + "learning_rate": 0.008685714285714286, + "loss": 0.9995, + "step": 92 + }, + { + "epoch": 8.75, + "learning_rate": 0.008671428571428571, + "loss": 0.763, + "step": 93 + }, + { + "epoch": 8.85, + "learning_rate": 0.008657142857142858, + "loss": 1.0712, + "step": 94 + }, + { + "epoch": 8.94, + "learning_rate": 0.008642857142857143, + "loss": 1.1111, + "step": 95 + }, + { + "epoch": 9.04, + "learning_rate": 0.008628571428571428, + "loss": 0.9626, + "step": 96 + }, + { + "epoch": 9.13, + "learning_rate": 0.008614285714285715, + "loss": 0.6385, + "step": 97 + }, + { + "epoch": 9.22, + "learning_rate": 0.0086, + "loss": 0.8147, + "step": 98 + }, + { + "epoch": 9.32, + "learning_rate": 0.008585714285714285, + "loss": 0.8109, + "step": 99 + }, + { + "epoch": 9.41, + "learning_rate": 0.008571428571428572, + "loss": 1.0953, + "step": 100 + }, + { + "epoch": 9.51, + "learning_rate": 0.008557142857142859, + "loss": 0.7104, + "step": 101 + }, + { + "epoch": 9.6, + "learning_rate": 0.008542857142857144, + "loss": 0.9672, + "step": 102 + }, + { + "epoch": 9.69, + "learning_rate": 0.008528571428571429, + "loss": 0.7593, + "step": 103 + }, + { + "epoch": 9.79, + "learning_rate": 0.008514285714285714, + "loss": 1.0186, + "step": 104 + }, + { + "epoch": 9.88, + "learning_rate": 0.0085, + "loss": 0.7898, + "step": 105 + }, + { + "epoch": 9.98, + "learning_rate": 0.008485714285714286, + "loss": 0.7392, + "step": 106 + }, + { + "epoch": 10.07, + "learning_rate": 0.008471428571428572, + "loss": 0.7295, + "step": 107 + }, + { + "epoch": 10.16, + "learning_rate": 0.008457142857142858, + "loss": 0.7211, + "step": 108 + }, + { + "epoch": 10.26, + "learning_rate": 0.008442857142857143, + "loss": 0.769, + "step": 109 + }, + { + "epoch": 10.35, + "learning_rate": 0.00842857142857143, + "loss": 0.718, + "step": 110 + }, + { + "epoch": 10.45, + "learning_rate": 0.008414285714285714, + "loss": 0.6411, + "step": 111 + }, + { + "epoch": 10.54, + "learning_rate": 0.0084, + "loss": 0.8016, + "step": 112 + }, + { 
+ "epoch": 10.64, + "learning_rate": 0.008385714285714286, + "loss": 0.6633, + "step": 113 + }, + { + "epoch": 10.73, + "learning_rate": 0.008371428571428571, + "loss": 0.7257, + "step": 114 + }, + { + "epoch": 10.82, + "learning_rate": 0.008357142857142858, + "loss": 0.7785, + "step": 115 + }, + { + "epoch": 10.92, + "learning_rate": 0.008342857142857143, + "loss": 0.8927, + "step": 116 + }, + { + "epoch": 11.01, + "learning_rate": 0.008328571428571428, + "loss": 0.7242, + "step": 117 + }, + { + "epoch": 11.11, + "learning_rate": 0.008314285714285715, + "loss": 0.8297, + "step": 118 + }, + { + "epoch": 11.2, + "learning_rate": 0.0083, + "loss": 0.6761, + "step": 119 + }, + { + "epoch": 11.29, + "learning_rate": 0.008285714285714287, + "loss": 0.6699, + "step": 120 + }, + { + "epoch": 11.39, + "learning_rate": 0.008271428571428572, + "loss": 0.5365, + "step": 121 + }, + { + "epoch": 11.48, + "learning_rate": 0.008257142857142857, + "loss": 0.9045, + "step": 122 + }, + { + "epoch": 11.58, + "learning_rate": 0.008242857142857144, + "loss": 0.5071, + "step": 123 + }, + { + "epoch": 11.67, + "learning_rate": 0.008228571428571429, + "loss": 0.6472, + "step": 124 + }, + { + "epoch": 11.76, + "learning_rate": 0.008214285714285714, + "loss": 0.6232, + "step": 125 + }, + { + "epoch": 11.86, + "learning_rate": 0.008199999999999999, + "loss": 0.4905, + "step": 126 + }, + { + "epoch": 11.95, + "learning_rate": 0.008185714285714286, + "loss": 0.557, + "step": 127 + }, + { + "epoch": 12.05, + "learning_rate": 0.008171428571428573, + "loss": 0.5517, + "step": 128 + }, + { + "epoch": 12.14, + "learning_rate": 0.008157142857142858, + "loss": 0.6321, + "step": 129 + }, + { + "epoch": 12.24, + "learning_rate": 0.008142857142857143, + "loss": 0.6619, + "step": 130 + }, + { + "epoch": 12.33, + "learning_rate": 0.008128571428571428, + "loss": 0.5524, + "step": 131 + }, + { + "epoch": 12.42, + "learning_rate": 0.008114285714285715, + "loss": 0.4688, + "step": 132 + }, + { + "epoch": 12.52, + "learning_rate": 0.008100000000000001, + "loss": 0.3717, + "step": 133 + }, + { + "epoch": 12.61, + "learning_rate": 0.008085714285714286, + "loss": 0.5118, + "step": 134 + }, + { + "epoch": 12.71, + "learning_rate": 0.008071428571428571, + "loss": 0.4521, + "step": 135 + }, + { + "epoch": 12.8, + "learning_rate": 0.008057142857142856, + "loss": 0.5865, + "step": 136 + }, + { + "epoch": 12.89, + "learning_rate": 0.008042857142857143, + "loss": 0.5977, + "step": 137 + }, + { + "epoch": 12.99, + "learning_rate": 0.008028571428571428, + "loss": 0.6977, + "step": 138 + }, + { + "epoch": 13.08, + "learning_rate": 0.008014285714285713, + "loss": 0.5625, + "step": 139 + }, + { + "epoch": 13.18, + "learning_rate": 0.008, + "loss": 0.3611, + "step": 140 + }, + { + "epoch": 13.27, + "learning_rate": 0.007985714285714287, + "loss": 0.5168, + "step": 141 + }, + { + "epoch": 13.36, + "learning_rate": 0.007971428571428572, + "loss": 0.4429, + "step": 142 + }, + { + "epoch": 13.46, + "learning_rate": 0.007957142857142857, + "loss": 0.4998, + "step": 143 + }, + { + "epoch": 13.55, + "learning_rate": 0.007942857142857142, + "loss": 0.4437, + "step": 144 + }, + { + "epoch": 13.65, + "learning_rate": 0.007928571428571429, + "loss": 0.4958, + "step": 145 + }, + { + "epoch": 13.74, + "learning_rate": 0.007914285714285716, + "loss": 0.4021, + "step": 146 + }, + { + "epoch": 13.84, + "learning_rate": 0.0079, + "loss": 0.6163, + "step": 147 + }, + { + "epoch": 13.93, + "learning_rate": 0.007885714285714286, + "loss": 0.406, + "step": 148 + }, + { + 
"epoch": 14.02, + "learning_rate": 0.007871428571428571, + "loss": 0.4905, + "step": 149 + }, + { + "epoch": 14.12, + "learning_rate": 0.007857142857142858, + "loss": 0.3824, + "step": 150 + }, + { + "epoch": 14.21, + "learning_rate": 0.007842857142857143, + "loss": 0.3591, + "step": 151 + }, + { + "epoch": 14.31, + "learning_rate": 0.007828571428571428, + "loss": 0.342, + "step": 152 + }, + { + "epoch": 14.4, + "learning_rate": 0.007814285714285715, + "loss": 0.4565, + "step": 153 + }, + { + "epoch": 14.49, + "learning_rate": 0.0078000000000000005, + "loss": 0.3287, + "step": 154 + }, + { + "epoch": 14.59, + "learning_rate": 0.007785714285714286, + "loss": 0.4179, + "step": 155 + }, + { + "epoch": 14.68, + "learning_rate": 0.0077714285714285715, + "loss": 0.3586, + "step": 156 + }, + { + "epoch": 14.78, + "learning_rate": 0.007757142857142857, + "loss": 0.4618, + "step": 157 + }, + { + "epoch": 14.87, + "learning_rate": 0.0077428571428571425, + "loss": 0.4133, + "step": 158 + }, + { + "epoch": 14.96, + "learning_rate": 0.007728571428571429, + "loss": 0.4326, + "step": 159 + }, + { + "epoch": 15.06, + "learning_rate": 0.007714285714285715, + "loss": 0.3838, + "step": 160 + }, + { + "epoch": 15.15, + "learning_rate": 0.0077, + "loss": 0.2978, + "step": 161 + }, + { + "epoch": 15.25, + "learning_rate": 0.007685714285714286, + "loss": 0.3993, + "step": 162 + }, + { + "epoch": 15.34, + "learning_rate": 0.007671428571428571, + "loss": 0.3249, + "step": 163 + }, + { + "epoch": 15.44, + "learning_rate": 0.007657142857142857, + "loss": 0.2796, + "step": 164 + }, + { + "epoch": 15.53, + "learning_rate": 0.007642857142857142, + "loss": 0.3918, + "step": 165 + }, + { + "epoch": 15.62, + "learning_rate": 0.007628571428571429, + "loss": 0.4122, + "step": 166 + }, + { + "epoch": 15.72, + "learning_rate": 0.007614285714285715, + "loss": 0.3403, + "step": 167 + }, + { + "epoch": 15.81, + "learning_rate": 0.0076, + "loss": 0.3759, + "step": 168 + }, + { + "epoch": 15.91, + "learning_rate": 0.007585714285714286, + "loss": 0.3621, + "step": 169 + }, + { + "epoch": 16.0, + "learning_rate": 0.007571428571428571, + "loss": 0.2991, + "step": 170 + }, + { + "epoch": 16.09, + "learning_rate": 0.007557142857142857, + "loss": 0.3039, + "step": 171 + }, + { + "epoch": 16.19, + "learning_rate": 0.007542857142857144, + "loss": 0.4571, + "step": 172 + }, + { + "epoch": 16.28, + "learning_rate": 0.007528571428571429, + "loss": 0.2759, + "step": 173 + }, + { + "epoch": 16.38, + "learning_rate": 0.007514285714285715, + "loss": 0.2835, + "step": 174 + }, + { + "epoch": 16.47, + "learning_rate": 0.0075, + "loss": 0.3221, + "step": 175 + }, + { + "epoch": 16.56, + "learning_rate": 0.007485714285714286, + "loss": 0.3072, + "step": 176 + }, + { + "epoch": 16.66, + "learning_rate": 0.007471428571428572, + "loss": 0.2852, + "step": 177 + }, + { + "epoch": 16.75, + "learning_rate": 0.007457142857142857, + "loss": 0.2559, + "step": 178 + }, + { + "epoch": 16.85, + "learning_rate": 0.007442857142857143, + "loss": 0.2787, + "step": 179 + }, + { + "epoch": 16.94, + "learning_rate": 0.007428571428571429, + "loss": 0.3331, + "step": 180 + }, + { + "epoch": 17.04, + "learning_rate": 0.007414285714285714, + "loss": 0.1929, + "step": 181 + }, + { + "epoch": 17.13, + "learning_rate": 0.0074, + "loss": 0.2065, + "step": 182 + }, + { + "epoch": 17.22, + "learning_rate": 0.007385714285714285, + "loss": 0.2868, + "step": 183 + }, + { + "epoch": 17.32, + "learning_rate": 0.007371428571428571, + "loss": 0.2206, + "step": 184 + }, + { + "epoch": 
17.41, + "learning_rate": 0.007357142857142858, + "loss": 0.2355, + "step": 185 + }, + { + "epoch": 17.51, + "learning_rate": 0.007342857142857143, + "loss": 0.3041, + "step": 186 + }, + { + "epoch": 17.6, + "learning_rate": 0.007328571428571429, + "loss": 0.3028, + "step": 187 + }, + { + "epoch": 17.69, + "learning_rate": 0.007314285714285714, + "loss": 0.2435, + "step": 188 + }, + { + "epoch": 17.79, + "learning_rate": 0.0073, + "loss": 0.1869, + "step": 189 + }, + { + "epoch": 17.88, + "learning_rate": 0.007285714285714285, + "loss": 0.3036, + "step": 190 + }, + { + "epoch": 17.98, + "learning_rate": 0.007271428571428571, + "loss": 0.246, + "step": 191 + }, + { + "epoch": 18.07, + "learning_rate": 0.007257142857142858, + "loss": 0.2316, + "step": 192 + }, + { + "epoch": 18.16, + "learning_rate": 0.007242857142857143, + "loss": 0.186, + "step": 193 + }, + { + "epoch": 18.26, + "learning_rate": 0.007228571428571429, + "loss": 0.2616, + "step": 194 + }, + { + "epoch": 18.35, + "learning_rate": 0.007214285714285715, + "loss": 0.2824, + "step": 195 + }, + { + "epoch": 18.45, + "learning_rate": 0.0072, + "loss": 0.2, + "step": 196 + }, + { + "epoch": 18.54, + "learning_rate": 0.007185714285714286, + "loss": 0.1978, + "step": 197 + }, + { + "epoch": 18.64, + "learning_rate": 0.007171428571428572, + "loss": 0.1897, + "step": 198 + }, + { + "epoch": 18.73, + "learning_rate": 0.007157142857142858, + "loss": 0.1958, + "step": 199 + }, + { + "epoch": 18.82, + "learning_rate": 0.0071428571428571435, + "loss": 0.203, + "step": 200 + }, + { + "epoch": 18.92, + "learning_rate": 0.0071285714285714286, + "loss": 0.2451, + "step": 201 + }, + { + "epoch": 19.01, + "learning_rate": 0.0071142857142857145, + "loss": 0.2045, + "step": 202 + }, + { + "epoch": 19.11, + "learning_rate": 0.0070999999999999995, + "loss": 0.1937, + "step": 203 + }, + { + "epoch": 19.2, + "learning_rate": 0.0070857142857142855, + "loss": 0.1814, + "step": 204 + }, + { + "epoch": 19.29, + "learning_rate": 0.007071428571428572, + "loss": 0.1869, + "step": 205 + }, + { + "epoch": 19.39, + "learning_rate": 0.007057142857142857, + "loss": 0.2089, + "step": 206 + }, + { + "epoch": 19.48, + "learning_rate": 0.007042857142857143, + "loss": 0.1924, + "step": 207 + }, + { + "epoch": 19.58, + "learning_rate": 0.007028571428571428, + "loss": 0.1512, + "step": 208 + }, + { + "epoch": 19.67, + "learning_rate": 0.007014285714285714, + "loss": 0.1375, + "step": 209 + }, + { + "epoch": 19.76, + "learning_rate": 0.006999999999999999, + "loss": 0.187, + "step": 210 + }, + { + "epoch": 19.86, + "learning_rate": 0.006985714285714286, + "loss": 0.2488, + "step": 211 + }, + { + "epoch": 19.95, + "learning_rate": 0.006971428571428572, + "loss": 0.1864, + "step": 212 + }, + { + "epoch": 20.05, + "learning_rate": 0.006957142857142857, + "loss": 0.1984, + "step": 213 + }, + { + "epoch": 20.14, + "learning_rate": 0.006942857142857143, + "loss": 0.156, + "step": 214 + }, + { + "epoch": 20.24, + "learning_rate": 0.006928571428571429, + "loss": 0.2082, + "step": 215 + }, + { + "epoch": 20.33, + "learning_rate": 0.006914285714285714, + "loss": 0.094, + "step": 216 + }, + { + "epoch": 20.42, + "learning_rate": 0.0069, + "loss": 0.1784, + "step": 217 + }, + { + "epoch": 20.52, + "learning_rate": 0.006885714285714287, + "loss": 0.1293, + "step": 218 + }, + { + "epoch": 20.61, + "learning_rate": 0.006871428571428572, + "loss": 0.1635, + "step": 219 + }, + { + "epoch": 20.71, + "learning_rate": 0.006857142857142858, + "loss": 0.1668, + "step": 220 + }, + { + "epoch": 
20.8, + "learning_rate": 0.006842857142857143, + "loss": 0.1946, + "step": 221 + }, + { + "epoch": 20.89, + "learning_rate": 0.006828571428571429, + "loss": 0.2347, + "step": 222 + }, + { + "epoch": 20.99, + "learning_rate": 0.006814285714285714, + "loss": 0.1523, + "step": 223 + }, + { + "epoch": 21.08, + "learning_rate": 0.0068000000000000005, + "loss": 0.1337, + "step": 224 + }, + { + "epoch": 21.18, + "learning_rate": 0.006785714285714286, + "loss": 0.1511, + "step": 225 + }, + { + "epoch": 21.27, + "learning_rate": 0.0067714285714285715, + "loss": 0.1058, + "step": 226 + }, + { + "epoch": 21.36, + "learning_rate": 0.006757142857142857, + "loss": 0.172, + "step": 227 + }, + { + "epoch": 21.46, + "learning_rate": 0.0067428571428571425, + "loss": 0.1077, + "step": 228 + }, + { + "epoch": 21.55, + "learning_rate": 0.006728571428571428, + "loss": 0.1993, + "step": 229 + }, + { + "epoch": 21.65, + "learning_rate": 0.006714285714285714, + "loss": 0.1414, + "step": 230 + }, + { + "epoch": 21.74, + "learning_rate": 0.0067, + "loss": 0.126, + "step": 231 + }, + { + "epoch": 21.84, + "learning_rate": 0.006685714285714286, + "loss": 0.1528, + "step": 232 + }, + { + "epoch": 21.93, + "learning_rate": 0.006671428571428571, + "loss": 0.1316, + "step": 233 + }, + { + "epoch": 22.02, + "learning_rate": 0.006657142857142857, + "loss": 0.1565, + "step": 234 + }, + { + "epoch": 22.12, + "learning_rate": 0.006642857142857143, + "loss": 0.1088, + "step": 235 + }, + { + "epoch": 22.21, + "learning_rate": 0.006628571428571428, + "loss": 0.088, + "step": 236 + }, + { + "epoch": 22.31, + "learning_rate": 0.006614285714285715, + "loss": 0.1348, + "step": 237 + }, + { + "epoch": 22.4, + "learning_rate": 0.006600000000000001, + "loss": 0.1702, + "step": 238 + }, + { + "epoch": 22.49, + "learning_rate": 0.006585714285714286, + "loss": 0.132, + "step": 239 + }, + { + "epoch": 22.59, + "learning_rate": 0.006571428571428572, + "loss": 0.1115, + "step": 240 + }, + { + "epoch": 22.68, + "learning_rate": 0.006557142857142857, + "loss": 0.1173, + "step": 241 + }, + { + "epoch": 22.78, + "learning_rate": 0.006542857142857143, + "loss": 0.0967, + "step": 242 + }, + { + "epoch": 22.87, + "learning_rate": 0.006528571428571428, + "loss": 0.1484, + "step": 243 + }, + { + "epoch": 22.96, + "learning_rate": 0.006514285714285715, + "loss": 0.1566, + "step": 244 + }, + { + "epoch": 23.06, + "learning_rate": 0.006500000000000001, + "loss": 0.162, + "step": 245 + }, + { + "epoch": 23.15, + "learning_rate": 0.006485714285714286, + "loss": 0.1099, + "step": 246 + }, + { + "epoch": 23.25, + "learning_rate": 0.0064714285714285716, + "loss": 0.1087, + "step": 247 + }, + { + "epoch": 23.34, + "learning_rate": 0.006457142857142857, + "loss": 0.116, + "step": 248 + }, + { + "epoch": 23.44, + "learning_rate": 0.0064428571428571425, + "loss": 0.1096, + "step": 249 + }, + { + "epoch": 23.53, + "learning_rate": 0.006428571428571429, + "loss": 0.0972, + "step": 250 + }, + { + "epoch": 23.62, + "learning_rate": 0.006414285714285714, + "loss": 0.0889, + "step": 251 + }, + { + "epoch": 23.72, + "learning_rate": 0.0064, + "loss": 0.1199, + "step": 252 + }, + { + "epoch": 23.81, + "learning_rate": 0.006385714285714286, + "loss": 0.1337, + "step": 253 + }, + { + "epoch": 23.91, + "learning_rate": 0.006371428571428571, + "loss": 0.0977, + "step": 254 + }, + { + "epoch": 24.0, + "learning_rate": 0.006357142857142857, + "loss": 0.146, + "step": 255 + }, + { + "epoch": 24.09, + "learning_rate": 0.006342857142857142, + "loss": 0.1102, + "step": 256 + }, + 
{ + "epoch": 24.19, + "learning_rate": 0.006328571428571429, + "loss": 0.1025, + "step": 257 + }, + { + "epoch": 24.28, + "learning_rate": 0.006314285714285715, + "loss": 0.09, + "step": 258 + }, + { + "epoch": 24.38, + "learning_rate": 0.0063, + "loss": 0.1302, + "step": 259 + }, + { + "epoch": 24.47, + "learning_rate": 0.006285714285714286, + "loss": 0.0739, + "step": 260 + }, + { + "epoch": 24.56, + "learning_rate": 0.006271428571428571, + "loss": 0.1172, + "step": 261 + }, + { + "epoch": 24.66, + "learning_rate": 0.006257142857142857, + "loss": 0.1048, + "step": 262 + }, + { + "epoch": 24.75, + "learning_rate": 0.006242857142857144, + "loss": 0.0977, + "step": 263 + }, + { + "epoch": 24.85, + "learning_rate": 0.006228571428571429, + "loss": 0.1056, + "step": 264 + }, + { + "epoch": 24.94, + "learning_rate": 0.006214285714285715, + "loss": 0.1252, + "step": 265 + }, + { + "epoch": 25.04, + "learning_rate": 0.0062, + "loss": 0.1107, + "step": 266 + }, + { + "epoch": 25.13, + "learning_rate": 0.006185714285714286, + "loss": 0.0887, + "step": 267 + }, + { + "epoch": 25.22, + "learning_rate": 0.006171428571428571, + "loss": 0.0836, + "step": 268 + }, + { + "epoch": 25.32, + "learning_rate": 0.0061571428571428576, + "loss": 0.0957, + "step": 269 + }, + { + "epoch": 25.41, + "learning_rate": 0.0061428571428571435, + "loss": 0.1165, + "step": 270 + }, + { + "epoch": 25.51, + "learning_rate": 0.0061285714285714285, + "loss": 0.1135, + "step": 271 + }, + { + "epoch": 25.6, + "learning_rate": 0.0061142857142857145, + "loss": 0.0901, + "step": 272 + }, + { + "epoch": 25.69, + "learning_rate": 0.0061, + "loss": 0.0751, + "step": 273 + }, + { + "epoch": 25.79, + "learning_rate": 0.0060857142857142854, + "loss": 0.109, + "step": 274 + }, + { + "epoch": 25.88, + "learning_rate": 0.006071428571428571, + "loss": 0.102, + "step": 275 + }, + { + "epoch": 25.98, + "learning_rate": 0.006057142857142858, + "loss": 0.0916, + "step": 276 + }, + { + "epoch": 26.07, + "learning_rate": 0.006042857142857143, + "loss": 0.0821, + "step": 277 + }, + { + "epoch": 26.16, + "learning_rate": 0.006028571428571429, + "loss": 0.0797, + "step": 278 + }, + { + "epoch": 26.26, + "learning_rate": 0.006014285714285714, + "loss": 0.0804, + "step": 279 + }, + { + "epoch": 26.35, + "learning_rate": 0.006, + "loss": 0.0987, + "step": 280 + }, + { + "epoch": 26.45, + "learning_rate": 0.005985714285714285, + "loss": 0.1192, + "step": 281 + }, + { + "epoch": 26.54, + "learning_rate": 0.005971428571428572, + "loss": 0.0699, + "step": 282 + }, + { + "epoch": 26.64, + "learning_rate": 0.005957142857142858, + "loss": 0.0902, + "step": 283 + }, + { + "epoch": 26.73, + "learning_rate": 0.005942857142857143, + "loss": 0.0916, + "step": 284 + }, + { + "epoch": 26.82, + "learning_rate": 0.005928571428571429, + "loss": 0.0753, + "step": 285 + }, + { + "epoch": 26.92, + "learning_rate": 0.005914285714285714, + "loss": 0.0964, + "step": 286 + }, + { + "epoch": 27.01, + "learning_rate": 0.0059, + "loss": 0.1108, + "step": 287 + }, + { + "epoch": 27.11, + "learning_rate": 0.005885714285714286, + "loss": 0.1062, + "step": 288 + }, + { + "epoch": 27.2, + "learning_rate": 0.005871428571428572, + "loss": 0.0846, + "step": 289 + }, + { + "epoch": 27.29, + "learning_rate": 0.005857142857142858, + "loss": 0.0986, + "step": 290 + }, + { + "epoch": 27.39, + "learning_rate": 0.005842857142857143, + "loss": 0.0713, + "step": 291 + }, + { + "epoch": 27.48, + "learning_rate": 0.005828571428571429, + "loss": 0.0829, + "step": 292 + }, + { + "epoch": 27.58, + 
"learning_rate": 0.0058142857142857145, + "loss": 0.1026, + "step": 293 + }, + { + "epoch": 27.67, + "learning_rate": 0.0058, + "loss": 0.0785, + "step": 294 + }, + { + "epoch": 27.76, + "learning_rate": 0.005785714285714286, + "loss": 0.0729, + "step": 295 + }, + { + "epoch": 27.86, + "learning_rate": 0.005771428571428572, + "loss": 0.0738, + "step": 296 + }, + { + "epoch": 27.95, + "learning_rate": 0.005757142857142857, + "loss": 0.079, + "step": 297 + }, + { + "epoch": 28.05, + "learning_rate": 0.005742857142857143, + "loss": 0.0761, + "step": 298 + }, + { + "epoch": 28.14, + "learning_rate": 0.005728571428571428, + "loss": 0.0792, + "step": 299 + }, + { + "epoch": 28.24, + "learning_rate": 0.005714285714285714, + "loss": 0.0881, + "step": 300 + } + ], + "logging_steps": 1.0, + "max_steps": 700, + "num_train_epochs": 70, + "save_steps": 100, + "total_flos": 3.525522965397504e+17, + "trial_name": null, + "trial_params": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/training_args.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17f9bfbf1a7cdd9e0e808e0672d55ad9ad4efb5f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00669a32a6ddac0a3243bbc04d3f1f70ffc8f89f2626c1fdafa93ce68c311aa0 +size 4664 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..363fcab7ed6e9634e198cf5555ceb88932c9a245 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/config-checkpoint.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/config-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..50d927dc68b4eaa40bd4812b7417b3f2bd61f599 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/.ipynb_checkpoints/config-checkpoint.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm3-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + 
"padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 8192, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/Untitled.ipynb b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/Untitled.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c639653712eb0c035e8a63da023f415678f56a25 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/Untitled.ipynb @@ -0,0 +1,99 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8be7ec39-c93d-4529-bef3-6b65b66a8bcd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.17.3)\n", + "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.9.0)\n", + "Requirement already satisfied: fsspec in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.10.0)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.31.0)\n", + "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.65.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.8.0)\n", + "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.7.22)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install huggingface_hub" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a34bc1d4-4e94-4fa8-9c6d-778ea504b70b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5e2dc9023df04cf390302198d09374c1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='
torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class PrefixEncoder(torch.nn.Module): + """ + The torch.nn model to encode the prefix + Input shape: (batch-size, prefix-length) + Output shape: (batch-size, prefix-length, 2*layers*hidden) + """ + + def __init__(self, config: ChatGLMConfig): + super().__init__() + self.prefix_projection = config.prefix_projection + if self.prefix_projection: + # Use a two-layer MLP to encode the prefix + kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 + self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) + self.trans = torch.nn.Sequential( + torch.nn.Linear(kv_size, config.hidden_size), + torch.nn.Tanh(), + torch.nn.Linear(config.hidden_size, kv_size) + ) + else: + self.embedding = torch.nn.Embedding(config.pre_seq_len, + config.num_layers * config.kv_channels * config.multi_query_group_num * 2) + + def forward(self, prefix: torch.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.trans(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, original_impl=False, device=None, dtype=None): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + + def forward_impl( + self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 + ): + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
+ """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).float() + + cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if dtype in (torch.float16, torch.bfloat16, torch.int8): + cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl( + max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [sq, b, np, hn] + sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:sq] + xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) + rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +class RMSNorm(torch.nn.Module): + def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + return (self.weight * hidden_states).to(input_dtype) + + +class CoreAttention(torch.nn.Module): + def __init__(self, config: ChatGLMConfig, layer_number): + super(CoreAttention, self).__init__() + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. 
+ self.hidden_size_per_partition = projection_size + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + self.coeff = coeff + + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + pytorch_major_version = int(torch.__version__.split('.')[0]) + if pytorch_major_version >= 2: + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + is_causal=True) + else: + if attention_mask is not None: + attention_mask = ~attention_mask + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + attention_mask) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + else: + # Raw attention scores + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocating input tensor: [b * np, sq, sk] + matmul_input_buffer = torch.empty( + output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, + device=query_layer.device + ) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + if self.attention_softmax_in_fp32: + attention_scores = attention_scores.float() + if self.coeff is not None: + attention_scores = attention_scores * self.coeff + if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: + attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], + device=attention_scores.device, dtype=torch.bool) + attention_mask.tril_() + attention_mask = ~attention_mask + if attention_mask is not None: + attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = attention_probs.type_as(value_layer) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer.
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class SelfAttention(torch.nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + + self.projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + self.multi_query_attention = config.multi_query_attention + self.qkv_hidden_size = 3 * self.projection_size + if self.multi_query_attention: + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.qkv_hidden_size = ( + self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num + ) + self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, + bias=config.add_bias_linear or config.add_qkv_bias, + device=device, **_config_to_kwargs(config) + ) + + self.core_attention = CoreAttention(config, self.layer_number) + + # Output. + self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, + device=device, **_config_to_kwargs(config) + ) + + def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): + if self.multi_query_attention: + num_attention_heads = self.num_multi_query_groups_per_partition + else: + num_attention_heads = self.num_attention_heads_per_partition + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=dtype, + device=device, + ) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True + ): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. 
+ # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = torch.cat((cache_k, key_layer), dim=0) + value_layer = torch.cat((cache_v, value_layer), dim=0) + if use_cache: + kv_cache = (key_layer, value_layer) + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = key_layer.unsqueeze(-2) + key_layer = key_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.unsqueeze(-2) + value_layer = value_layer.expand( + -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def _config_to_kwargs(args): + common_kwargs = { + "dtype": args.torch_dtype, + } + return common_kwargs + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMConfig, device=None): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear( + config.hidden_size, + config.ffn_hidden_size * 2, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.activation_func = swiglu + + # Project back to h. + self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(torch.nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # Self attention. + self.self_attention = SelfAttention(config, layer_number, device=device) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # MLP + self.mlp = MLP(config, device=device) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, + attention_mask, + rotary_pos_emb, + kv_cache=kv_cache, + use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + + return output, kv_cache + + +class GLMTransformer(torch.nn.Module): + """Transformer class.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(GLMTransformer, self).__init__() + + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. 
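+ # [Illustrative aside, not in the upstream file] Layers are numbered from 1 so that the + # query-key layer scaling coefficient in CoreAttention equals the layer index; the kv_caches + # argument of forward() is likewise expected to hold one (key, value) pair per layer, in the + # same order as self.layers.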
+ def build_layer(layer_number): + return GLMBlock(config, layer_number, device=device) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + self.gradient_checkpointing = False + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_layers)] + presents = () if use_cache else None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + for index in range(self.num_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + if self.gradient_checkpointing and self.training: + layer_ret = torch.utils.checkpoint.checkpoint( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches[index], + use_cache + ) + else: + layer_ret = layer( + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache + ) + hidden_states, kv_cache = layer_ret + if use_cache: + presents = presents + (kv_cache,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. 
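+ (Illustrative note, not in the upstream file: _init_weights below is intentionally a no-op, + so every parameter is expected to come from the checkpoint rather than fresh initialization.)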
+ """ + + is_parallelizable = False + supports_gradient_checkpointing = True + config_class = ChatGLMConfig + base_model_prefix = "transformer" + _no_split_modules = ["GLMBlock"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + return + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) + full_attention_mask.tril_() + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, + device=input_ids.device), full_attention_mask), dim=-1) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) + if not past_length and padding_mask is not None: + full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + def get_position_ids(self, input_ids, device): + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + return position_ids + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GLMTransformer): + module.gradient_checkpointing = value + + +class Embedding(torch.nn.Module): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + # Word embeddings (parallel). + self.word_embeddings = nn.Embedding( + config.padded_vocab_size, + self.hidden_size, + dtype=config.torch_dtype, + device=device + ) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + # If the input flag for fp32 residual connection is set, convert for float. 
+ if self.fp32_residual_connection: + embeddings = embeddings.float() + return embeddings + + +class ChatGLMModel(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): + super().__init__(config) + if empty_init: + init_method = skip_init + else: + init_method = default_init + init_kwargs = {} + if device is not None: + init_kwargs["device"] = device + self.embedding = init_method(Embedding, config, **init_kwargs) + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + + # Rotary positional embeddings + self.seq_length = config.seq_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, + dtype=config.torch_dtype) + self.encoder = init_method(GLMTransformer, config, **init_kwargs) + self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, + dtype=config.torch_dtype, **init_kwargs) + self.pre_seq_len = config.pre_seq_len + self.prefix_projection = config.prefix_projection + if self.pre_seq_len is not None: + for param in self.parameters(): + param.requires_grad = False + self.prefix_tokens = torch.arange(self.pre_seq_len).long() + self.prefix_encoder = PrefixEncoder(config) + self.dropout = torch.nn.Dropout(0.1) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def get_prompt(self, batch_size, device, dtype=torch.half): + prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) + past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) + past_key_values = past_key_values.view( + batch_size, + self.pre_seq_len, + self.num_layers * 2, + self.multi_query_group_num, + self.kv_channels + ) + # seq_len, b, nh, hidden_size + past_key_values = self.dropout(past_key_values) + past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) + return past_key_values + + def forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if self.pre_seq_len is not None: + if past_key_values is None: + past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, + dtype=inputs_embeds.dtype) + if attention_mask is not None: + attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), + attention_mask], dim=-1) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary 
positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() + + # Run encoder. + hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def quantize(self, weight_bit_width: int): + from .quantization import quantize + quantize(self.encoder, weight_bit_width) + return self + + +class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.max_sequence_length = config.max_length + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + self.config = config + self.quantized = False + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat( + [position_ids, new_position_id], dim=-1 + ) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids, device=input_ids.device) + if not is_first_forward: + if past_key_values is not None: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": use_cache + } + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: 
Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + if return_last_logit: + hidden_states = hidden_states[-1:] + lm_logits = self.transformer.output_layer(hidden_states) + lm_logits = lm_logits.transpose(0, 1).contiguous() + + loss = None + if labels is not None: + lm_logits = lm_logits.to(torch.float32) + + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + lm_logits = lm_logits.to(hidden_states.dtype) + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache( + past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. 
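+ (Illustrative note, not in the upstream file: the caches here are laid out sequence-first as + [sq, b, nh, hn], which is why index_select runs over dim 1, the batch/beam dimension, rather + than dim 0.)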
+ """ + return tuple( + ( + layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), + layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), + ) + for layer_past in past + ) + + def process_response(self, output, history): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content, history + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + inputs = tokenizer.build_chat_input(query, history=history, role=role) + inputs = inputs.to(self.device) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + history.append({"role": role, "content": query}) + response, history = self.process_response(response, history) + return response, history + + @torch.inference_mode() + def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, + logits_processor=None, return_past_key_values=False, **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), + tokenizer.get_command("<|observation|>")] + gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + if past_key_values is None: + inputs = tokenizer.build_chat_input(query, history=history, role=role) + else: + inputs = tokenizer.build_chat_input(query, role=role) + inputs = inputs.to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[0] + if self.transformer.pre_seq_len is not None: + past_length -= self.transformer.pre_seq_len + inputs.position_ids += past_length + attention_mask = inputs.attention_mask + attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) + inputs['attention_mask'] = attention_mask + history.append({"role": role, "content": query}) + for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **gen_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + if response and response[-1] != "�": + response, new_history = self.process_response(response, history) + if return_past_key_values: + yield response, new_history, past_key_values + else: + yield response, new_history + + @torch.inference_mode() + def stream_generate( + self, + input_ids, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values=False, + **kwargs, + ): + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + + if generation_config is None: + generation_config = self.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + model_kwargs["use_cache"] = generation_config.use_cache + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if has_default_max_length and generation_config.max_new_tokens is None: + warnings.warn( + f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " + "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + " recommend using `max_new_tokens` to control the maximum length of the generation.", + UserWarning, + ) + elif generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + if not has_default_max_length: + logger.warn( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", + UserWarning, + ) + + if input_ids_seq_length >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + + # 2. 
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + logits_warper = self._get_logits_warper(generation_config) + + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + while True: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break + + def quantize(self, bits: int, empty_init=False, device=None, **kwargs): + if bits == 0: + return + + from .quantization import quantize + + if self.quantized: + logger.info("Already quantized.") + return self + + self.quantized = True + + self.config.quantization_bit = bits + + self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, + **kwargs) + return self + + +class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.num_labels = config.num_labels + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + + self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) + if config.classifier_dropout is not None: + self.dropout = nn.Dropout(config.classifier_dropout) + else: + self.dropout = None + self.config = config + + if self.config.quantization_bit: + self.quantize(self.config.quantization_bit, empty_init=True) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: 
Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + pooled_hidden_states = hidden_states[-1] + if self.dropout is not None: + pooled_hidden_states = self.dropout(pooled_hidden_states) + logits = self.classifier_head(pooled_hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze().float(), labels.squeeze()) + else: + loss = loss_fct(logits.float(), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/optimizer.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcf315d03d4eaeff2d678e8793973659a8ad1855 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:400e8a5e274a768dc0b9682a0501d298708c312857f56088f2a4a2def65fc62e +size 14682210 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/pytorch_model.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e3d93d3ca55b0927dee612ec601af9dbca54237b --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06951d06cd2537b8b6927e793abc5f91f918d00518df2a8282449b078a3a4a11 +size 7341306 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/quantization.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..cb95bfe82b203ff6a2aa962326d2c7a438d6a52f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/quantization.py @@ -0,0 
+1,188 @@ +from torch.nn import Linear +from torch.nn.parameter import Parameter + +import bz2 +import torch +import base64 +import ctypes +from transformers.utils import logging + +from typing import List +from functools import partial + +logger = logging.get_logger(__name__) + +try: + from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up + + class Kernel: + def __init__(self, code: bytes, function_names: List[str]): + self.code = code + self._function_names = function_names + self._cmodule = LazyKernelCModule(self.code) + + for name in self._function_names: + setattr(self, name, KernelFunction(self._cmodule, name)) + + quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl
4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzf
ZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ" + + kernels = Kernel( + bz2.decompress(base64.b64decode(quantization_code)), + [ + "int4WeightCompression", + "int4WeightExtractionFloat", + "int4WeightExtractionHalf", + "int8WeightExtractionFloat", + "int8WeightExtractionHalf", + ], + ) +except Exception as exception: + kernels = None + logger.warning("Failed to load cpm_kernels:" + str(exception)) + + +class W8A16Linear(torch.autograd.Function): + @staticmethod + def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): + ctx.inp_shape = inp.size() + ctx.weight_bit_width = weight_bit_width + out_features = quant_w.size(0) + inp = inp.contiguous().view(-1, inp.size(-1)) + weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) + ctx.weight_shape = weight.size() + output = inp.mm(weight.t()) + ctx.save_for_backward(inp, quant_w, scale_w) + return output.view(*(ctx.inp_shape[:-1] + (out_features,))) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + inp, quant_w, scale_w = ctx.saved_tensors + weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) + grad_output = grad_output.contiguous().view(-1, weight.size(0)) + grad_input = grad_output.mm(weight) + grad_weight = grad_output.t().mm(inp) + return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None + + +def compress_int4_weight(weight: torch.Tensor): # (n, m) + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + assert m % 2 == 0 + m = m // 2 + out = torch.empty(n, m, dtype=torch.int8, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + kernels.int4WeightCompression( + gridDim, + blockDim, + 0, + stream, + [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], + ) + return out + + +def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): + assert scale_list.dtype in [torch.half, torch.bfloat16] + assert weight.dtype in [torch.int8] + if source_bit_width == 8: + return weight.to(scale_list.dtype) * scale_list[:, None] + elif source_bit_width == 4: + func = ( + kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16 + ) + else: + assert False, "Unsupported bit-width" + + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = 
(min(round_up(m, 32), 1024), 1, 1)
+
+        # NOTE: the int4 bfloat16 branch above selects `int4WeightExtractionBFloat16`,
+        # which is not among the kernel names compiled into `kernels`; only the
+        # half/float extraction kernels are registered.
+        func(
+            gridDim,
+            blockDim,
+            0,
+            stream,
+            [
+                ctypes.c_void_p(weight.data_ptr()),
+                ctypes.c_void_p(scale_list.data_ptr()),
+                ctypes.c_void_p(out.data_ptr()),
+                ctypes.c_int32(n),
+                ctypes.c_int32(m),
+            ],
+        )
+    return out
+
+
+class QuantizedLinear(torch.nn.Module):
+    def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
+                 **kwargs):
+        super().__init__()
+        self.weight_bit_width = weight_bit_width
+
+        # Buffer shapes are derived from the reference weight, so a weight tensor
+        # must be supplied even when empty_init is requested (the former
+        # `weight is None` branch dereferenced `weight.shape` and could never run).
+        assert weight is not None, "QuantizedLinear requires a reference weight tensor"
+        shape = weight.shape
+
+        if empty_init:
+            self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
+            self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
+        else:
+            # Symmetric per-row quantization: scale each row by its absolute maximum.
+            self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
+            self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
+            if weight_bit_width == 4:
+                self.weight = compress_int4_weight(self.weight)
+
+        self.weight = Parameter(self.weight.to(device), requires_grad=False)
+        self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
+        self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None
+
+    def forward(self, input):
+        output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
+        if self.bias is not None:
+            output = output + self.bias
+        return output
+
+
+def quantize(model, weight_bit_width, empty_init=False, device=None):
+    """Replace fp16 linear layers with quantized linear layers."""
+    for layer in model.layers:
+        layer.self_attention.query_key_value = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
+            bias=layer.self_attention.query_key_value.bias,
+            dtype=layer.self_attention.query_key_value.weight.dtype,
+            device=layer.self_attention.query_key_value.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.self_attention.dense = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
+            bias=layer.self_attention.dense.bias,
+            dtype=layer.self_attention.dense.weight.dtype,
+            device=layer.self_attention.dense.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.mlp.dense_h_to_4h = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
+            bias=layer.mlp.dense_h_to_4h.bias,
+            dtype=layer.mlp.dense_h_to_4h.weight.dtype,
+            device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.mlp.dense_4h_to_h = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
+            bias=layer.mlp.dense_4h_to_h.bias,
+            dtype=layer.mlp.dense_4h_to_h.weight.dtype,
+            device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+
+    return model
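+
+# Usage sketch (illustrative; the paths and attribute names here are assumptions,
+# not part of this file): `quantize()` expects the module that owns the `.layers`
+# list, which for ChatGLM3 is the transformer encoder:
+#
+#     model = AutoModel.from_pretrained("chatglm3-6b", trust_remote_code=True).half().cuda()
+#     model.transformer.encoder = quantize(model.transformer.encoder, weight_bit_width=4)
+#
+# The modeling code normally wraps this as `model.quantize(4)`. With
+# weight_bit_width=4 the packed weight stores two nibbles per int8 byte, so it
+# ends up with shape (out_features, in_features // 2).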
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/rng_state.pth b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..5367c96ab2f1fef1e6f78de4c3cb38f6b50f37d5
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc0cb6255ebbd22879226b5a84d22302be0dc04a17df5c3e33192cc7f59bf84e
+size 14244
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/scheduler.pt b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7d467cfe25a20f7e6079f110ede561db40957ece
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acca6ccdfe33e0ab0bbeb7e3423cfda45eccc967cbf8152c875aec962ac04588
+size 1064
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/special_tokens_map.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/special_tokens_map.json
@@ -0,0 +1 @@
+{}
diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenization_chatglm.py b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenization_chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5
--- /dev/null
+++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenization_chatglm.py
@@ -0,0 +1,300 @@
+import json
+import os
+import re
+from typing import List, Optional, Union, Dict
+from sentencepiece import SentencePieceProcessor
+from transformers import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+
+
+class SPTokenizer:
+    def __init__(self, model_path: str):
+        # reload tokenizer
+        assert os.path.isfile(model_path), model_path
+        self.sp_model = SentencePieceProcessor(model_file=model_path)
+
+        # BOS / EOS token IDs
+        self.n_words: int = self.sp_model.vocab_size()
+        self.bos_id: int = self.sp_model.bos_id()
+        self.eos_id: int = self.sp_model.eos_id()
+        self.pad_id: int = self.sp_model.unk_id()
+        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
+
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        self.special_tokens = {}
+        self.index_special_tokens = {}
+        for token in special_tokens:
+            self.special_tokens[token] = self.n_words
+            self.index_special_tokens[self.n_words] = token
+            self.n_words += 1
+        self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens])
+
+    def tokenize(self, s: str, encode_special_tokens=False):
+        if encode_special_tokens:
+            last_index = 0
+            t = []
+            for match in re.finditer(self.role_special_token_expression, s):
+                if last_index < match.start():
+                    t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
+                t.append(s[match.start():match.end()])
+                last_index = match.end()
+            if last_index < len(s):
+                t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
+            return t
+        else:
+            return self.sp_model.EncodeAsPieces(s)
+
+    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
+        assert type(s) is str
+        t = self.sp_model.encode(s)
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+        return t
+
+    def decode(self, t: List[int]) -> str:
+        text, buffer = "", []
+        for token in t:
+            if token in self.index_special_tokens:
+                if buffer:
+                    text += self.sp_model.decode(buffer)
+                    buffer = []
+                text += self.index_special_tokens[token]
+            else:
+                buffer.append(token)
+        if buffer:
+            text += self.sp_model.decode(buffer)
+        return text
+
+    def decode_tokens(self, tokens: List[str]) -> str:
+        text = self.sp_model.DecodePieces(tokens)
+        return text
+
+    def convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        return self.sp_model.PieceToId(token)
+
+    def convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        if index in self.index_special_tokens:
+            return self.index_special_tokens[index]
+        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
+            return ""
+        return self.sp_model.IdToPiece(index)
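+
+# Example (illustrative): with encode_special_tokens=True, role markers are kept
+# as single pieces instead of being split apart by SentencePiece:
+#
+#     sp = SPTokenizer("tokenizer.model")  # model path is an assumption
+#     sp.tokenize("<|user|>hi", encode_special_tokens=True)
+#     # -> ["<|user|>", "▁hi"]; the role marker later maps to one reserved id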
+
+
+class ChatGLMTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "tokenizer.model"}
+
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+
+    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
+                 **kwargs):
+        self.name = "GLMTokenizer"
+
+        self.vocab_file = vocab_file
+        self.tokenizer = SPTokenizer(vocab_file)
+        self.special_tokens = {
+            "<bos>": self.tokenizer.bos_id,
+            "<eos>": self.tokenizer.eos_id,
+            "<pad>": self.tokenizer.pad_id
+        }
+        self.encode_special_tokens = encode_special_tokens
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                         encode_special_tokens=encode_special_tokens,
+                         **kwargs)
+
+    def get_command(self, token):
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
+        return self.tokenizer.special_tokens[token]
+
+    @property
+    def unk_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token(self) -> str:
+        return "<unk>"
+
+    @property
+    def pad_token_id(self):
+        return self.get_command("<pad>")
+
+    @property
+    def eos_token(self) -> str:
+        return "</s>"
+
+    @property
+    def eos_token_id(self):
+        return self.get_command("<eos>")
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_words
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text, **kwargs):
+        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        return self.tokenizer.convert_token_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        return self.tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return self.tokenizer.decode_tokens(tokens)
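+
+    # Example (illustrative): round-tripping tokens through this tokenizer:
+    #
+    #     tok.get_command("[gMASK]")  # reserved id just above the SentencePiece vocab
+    #     tok.convert_tokens_to_string(["▁Hello", ",", "▁world"])  # -> "Hello, world"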
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer.model b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer_config.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5faafd38f2e2934804feb0e7d71ebf08b0839bf5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer_config.json @@ -0,0 +1,18 @@ +{ + "added_tokens_decoder": {}, + "additional_special_tokens": [], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer", + "tokenizer_file": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/trainer_state.json b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e119522202901d2c6817509eb52a0b5aec492e91 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/trainer_state.json @@ -0,0 +1,2419 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 37.64705882352941, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "learning_rate": 0.009985714285714285, + "loss": 2.6971, + "step": 1 + }, + { + "epoch": 
0.19, + "learning_rate": 0.009971428571428572, + "loss": 2.3927, + "step": 2 + }, + { + "epoch": 0.28, + "learning_rate": 0.009957142857142857, + "loss": 2.2539, + "step": 3 + }, + { + "epoch": 0.38, + "learning_rate": 0.009942857142857144, + "loss": 2.1408, + "step": 4 + }, + { + "epoch": 0.47, + "learning_rate": 0.009928571428571429, + "loss": 2.2672, + "step": 5 + }, + { + "epoch": 0.56, + "learning_rate": 0.009914285714285714, + "loss": 1.6433, + "step": 6 + }, + { + "epoch": 0.66, + "learning_rate": 0.0099, + "loss": 2.1405, + "step": 7 + }, + { + "epoch": 0.75, + "learning_rate": 0.009885714285714286, + "loss": 2.1464, + "step": 8 + }, + { + "epoch": 0.85, + "learning_rate": 0.009871428571428571, + "loss": 1.8498, + "step": 9 + }, + { + "epoch": 0.94, + "learning_rate": 0.009857142857142858, + "loss": 1.6896, + "step": 10 + }, + { + "epoch": 1.04, + "learning_rate": 0.009842857142857143, + "loss": 2.1932, + "step": 11 + }, + { + "epoch": 1.13, + "learning_rate": 0.00982857142857143, + "loss": 1.8236, + "step": 12 + }, + { + "epoch": 1.22, + "learning_rate": 0.009814285714285715, + "loss": 1.735, + "step": 13 + }, + { + "epoch": 1.32, + "learning_rate": 0.0098, + "loss": 1.7488, + "step": 14 + }, + { + "epoch": 1.41, + "learning_rate": 0.009785714285714285, + "loss": 1.8336, + "step": 15 + }, + { + "epoch": 1.51, + "learning_rate": 0.009771428571428572, + "loss": 1.9438, + "step": 16 + }, + { + "epoch": 1.6, + "learning_rate": 0.009757142857142858, + "loss": 1.7178, + "step": 17 + }, + { + "epoch": 1.69, + "learning_rate": 0.009742857142857143, + "loss": 1.5714, + "step": 18 + }, + { + "epoch": 1.79, + "learning_rate": 0.009728571428571428, + "loss": 1.537, + "step": 19 + }, + { + "epoch": 1.88, + "learning_rate": 0.009714285714285715, + "loss": 1.6764, + "step": 20 + }, + { + "epoch": 1.98, + "learning_rate": 0.0097, + "loss": 1.8919, + "step": 21 + }, + { + "epoch": 2.07, + "learning_rate": 0.009685714285714285, + "loss": 1.346, + "step": 22 + }, + { + "epoch": 2.16, + "learning_rate": 0.009671428571428572, + "loss": 1.5036, + "step": 23 + }, + { + "epoch": 2.26, + "learning_rate": 0.009657142857142857, + "loss": 1.6788, + "step": 24 + }, + { + "epoch": 2.35, + "learning_rate": 0.009642857142857144, + "loss": 1.6667, + "step": 25 + }, + { + "epoch": 2.45, + "learning_rate": 0.009628571428571429, + "loss": 1.7153, + "step": 26 + }, + { + "epoch": 2.54, + "learning_rate": 0.009614285714285714, + "loss": 1.601, + "step": 27 + }, + { + "epoch": 2.64, + "learning_rate": 0.0096, + "loss": 1.3002, + "step": 28 + }, + { + "epoch": 2.73, + "learning_rate": 0.009585714285714286, + "loss": 1.3294, + "step": 29 + }, + { + "epoch": 2.82, + "learning_rate": 0.009571428571428573, + "loss": 1.7477, + "step": 30 + }, + { + "epoch": 2.92, + "learning_rate": 0.009557142857142858, + "loss": 1.7961, + "step": 31 + }, + { + "epoch": 3.01, + "learning_rate": 0.009542857142857143, + "loss": 1.4954, + "step": 32 + }, + { + "epoch": 3.11, + "learning_rate": 0.009528571428571428, + "loss": 1.6452, + "step": 33 + }, + { + "epoch": 3.2, + "learning_rate": 0.009514285714285715, + "loss": 1.3528, + "step": 34 + }, + { + "epoch": 3.29, + "learning_rate": 0.0095, + "loss": 1.4811, + "step": 35 + }, + { + "epoch": 3.39, + "learning_rate": 0.009485714285714287, + "loss": 1.4738, + "step": 36 + }, + { + "epoch": 3.48, + "learning_rate": 0.009471428571428572, + "loss": 1.174, + "step": 37 + }, + { + "epoch": 3.58, + "learning_rate": 0.009457142857142857, + "loss": 1.2346, + "step": 38 + }, + { + "epoch": 3.67, + 
"learning_rate": 0.009442857142857143, + "loss": 1.5327, + "step": 39 + }, + { + "epoch": 3.76, + "learning_rate": 0.009428571428571429, + "loss": 1.5249, + "step": 40 + }, + { + "epoch": 3.86, + "learning_rate": 0.009414285714285714, + "loss": 1.5086, + "step": 41 + }, + { + "epoch": 3.95, + "learning_rate": 0.0094, + "loss": 1.8425, + "step": 42 + }, + { + "epoch": 4.05, + "learning_rate": 0.009385714285714287, + "loss": 1.1943, + "step": 43 + }, + { + "epoch": 4.14, + "learning_rate": 0.009371428571428572, + "loss": 1.6835, + "step": 44 + }, + { + "epoch": 4.24, + "learning_rate": 0.009357142857142857, + "loss": 1.75, + "step": 45 + }, + { + "epoch": 4.33, + "learning_rate": 0.009342857142857142, + "loss": 1.2561, + "step": 46 + }, + { + "epoch": 4.42, + "learning_rate": 0.009328571428571429, + "loss": 1.3784, + "step": 47 + }, + { + "epoch": 4.52, + "learning_rate": 0.009314285714285714, + "loss": 1.2538, + "step": 48 + }, + { + "epoch": 4.61, + "learning_rate": 0.009300000000000001, + "loss": 1.4429, + "step": 49 + }, + { + "epoch": 4.71, + "learning_rate": 0.009285714285714286, + "loss": 1.3687, + "step": 50 + }, + { + "epoch": 4.8, + "learning_rate": 0.009271428571428571, + "loss": 1.1511, + "step": 51 + }, + { + "epoch": 4.89, + "learning_rate": 0.009257142857142858, + "loss": 1.181, + "step": 52 + }, + { + "epoch": 4.99, + "learning_rate": 0.009242857142857143, + "loss": 1.1753, + "step": 53 + }, + { + "epoch": 5.08, + "learning_rate": 0.009228571428571428, + "loss": 1.1562, + "step": 54 + }, + { + "epoch": 5.18, + "learning_rate": 0.009214285714285715, + "loss": 1.2936, + "step": 55 + }, + { + "epoch": 5.27, + "learning_rate": 0.0092, + "loss": 1.3591, + "step": 56 + }, + { + "epoch": 5.36, + "learning_rate": 0.009185714285714287, + "loss": 1.1376, + "step": 57 + }, + { + "epoch": 5.46, + "learning_rate": 0.009171428571428572, + "loss": 1.372, + "step": 58 + }, + { + "epoch": 5.55, + "learning_rate": 0.009157142857142857, + "loss": 1.5141, + "step": 59 + }, + { + "epoch": 5.65, + "learning_rate": 0.009142857142857144, + "loss": 1.2087, + "step": 60 + }, + { + "epoch": 5.74, + "learning_rate": 0.009128571428571429, + "loss": 1.136, + "step": 61 + }, + { + "epoch": 5.84, + "learning_rate": 0.009114285714285715, + "loss": 1.2948, + "step": 62 + }, + { + "epoch": 5.93, + "learning_rate": 0.0091, + "loss": 1.0592, + "step": 63 + }, + { + "epoch": 6.02, + "learning_rate": 0.009085714285714286, + "loss": 1.2321, + "step": 64 + }, + { + "epoch": 6.12, + "learning_rate": 0.009071428571428572, + "loss": 1.0827, + "step": 65 + }, + { + "epoch": 6.21, + "learning_rate": 0.009057142857142857, + "loss": 1.1136, + "step": 66 + }, + { + "epoch": 6.31, + "learning_rate": 0.009042857142857142, + "loss": 1.475, + "step": 67 + }, + { + "epoch": 6.4, + "learning_rate": 0.009028571428571427, + "loss": 1.1316, + "step": 68 + }, + { + "epoch": 6.49, + "learning_rate": 0.009014285714285714, + "loss": 1.1688, + "step": 69 + }, + { + "epoch": 6.59, + "learning_rate": 0.009000000000000001, + "loss": 1.0882, + "step": 70 + }, + { + "epoch": 6.68, + "learning_rate": 0.008985714285714286, + "loss": 1.1085, + "step": 71 + }, + { + "epoch": 6.78, + "learning_rate": 0.008971428571428571, + "loss": 1.2029, + "step": 72 + }, + { + "epoch": 6.87, + "learning_rate": 0.008957142857142856, + "loss": 1.098, + "step": 73 + }, + { + "epoch": 6.96, + "learning_rate": 0.008942857142857143, + "loss": 1.219, + "step": 74 + }, + { + "epoch": 7.06, + "learning_rate": 0.00892857142857143, + "loss": 1.0092, + "step": 75 + }, + { 
+ "epoch": 7.15, + "learning_rate": 0.008914285714285715, + "loss": 1.0112, + "step": 76 + }, + { + "epoch": 7.25, + "learning_rate": 0.0089, + "loss": 1.1481, + "step": 77 + }, + { + "epoch": 7.34, + "learning_rate": 0.008885714285714287, + "loss": 0.9873, + "step": 78 + }, + { + "epoch": 7.44, + "learning_rate": 0.008871428571428572, + "loss": 1.0586, + "step": 79 + }, + { + "epoch": 7.53, + "learning_rate": 0.008857142857142857, + "loss": 1.1177, + "step": 80 + }, + { + "epoch": 7.62, + "learning_rate": 0.008842857142857142, + "loss": 0.7814, + "step": 81 + }, + { + "epoch": 7.72, + "learning_rate": 0.008828571428571429, + "loss": 1.2043, + "step": 82 + }, + { + "epoch": 7.81, + "learning_rate": 0.008814285714285715, + "loss": 1.0062, + "step": 83 + }, + { + "epoch": 7.91, + "learning_rate": 0.0088, + "loss": 1.0831, + "step": 84 + }, + { + "epoch": 8.0, + "learning_rate": 0.008785714285714286, + "loss": 0.9554, + "step": 85 + }, + { + "epoch": 8.09, + "learning_rate": 0.00877142857142857, + "loss": 1.1674, + "step": 86 + }, + { + "epoch": 8.19, + "learning_rate": 0.008757142857142857, + "loss": 0.8226, + "step": 87 + }, + { + "epoch": 8.28, + "learning_rate": 0.008742857142857144, + "loss": 0.9166, + "step": 88 + }, + { + "epoch": 8.38, + "learning_rate": 0.00872857142857143, + "loss": 0.734, + "step": 89 + }, + { + "epoch": 8.47, + "learning_rate": 0.008714285714285714, + "loss": 0.8641, + "step": 90 + }, + { + "epoch": 8.56, + "learning_rate": 0.0087, + "loss": 0.9517, + "step": 91 + }, + { + "epoch": 8.66, + "learning_rate": 0.008685714285714286, + "loss": 0.9995, + "step": 92 + }, + { + "epoch": 8.75, + "learning_rate": 0.008671428571428571, + "loss": 0.763, + "step": 93 + }, + { + "epoch": 8.85, + "learning_rate": 0.008657142857142858, + "loss": 1.0712, + "step": 94 + }, + { + "epoch": 8.94, + "learning_rate": 0.008642857142857143, + "loss": 1.1111, + "step": 95 + }, + { + "epoch": 9.04, + "learning_rate": 0.008628571428571428, + "loss": 0.9626, + "step": 96 + }, + { + "epoch": 9.13, + "learning_rate": 0.008614285714285715, + "loss": 0.6385, + "step": 97 + }, + { + "epoch": 9.22, + "learning_rate": 0.0086, + "loss": 0.8147, + "step": 98 + }, + { + "epoch": 9.32, + "learning_rate": 0.008585714285714285, + "loss": 0.8109, + "step": 99 + }, + { + "epoch": 9.41, + "learning_rate": 0.008571428571428572, + "loss": 1.0953, + "step": 100 + }, + { + "epoch": 9.51, + "learning_rate": 0.008557142857142859, + "loss": 0.7104, + "step": 101 + }, + { + "epoch": 9.6, + "learning_rate": 0.008542857142857144, + "loss": 0.9672, + "step": 102 + }, + { + "epoch": 9.69, + "learning_rate": 0.008528571428571429, + "loss": 0.7593, + "step": 103 + }, + { + "epoch": 9.79, + "learning_rate": 0.008514285714285714, + "loss": 1.0186, + "step": 104 + }, + { + "epoch": 9.88, + "learning_rate": 0.0085, + "loss": 0.7898, + "step": 105 + }, + { + "epoch": 9.98, + "learning_rate": 0.008485714285714286, + "loss": 0.7392, + "step": 106 + }, + { + "epoch": 10.07, + "learning_rate": 0.008471428571428572, + "loss": 0.7295, + "step": 107 + }, + { + "epoch": 10.16, + "learning_rate": 0.008457142857142858, + "loss": 0.7211, + "step": 108 + }, + { + "epoch": 10.26, + "learning_rate": 0.008442857142857143, + "loss": 0.769, + "step": 109 + }, + { + "epoch": 10.35, + "learning_rate": 0.00842857142857143, + "loss": 0.718, + "step": 110 + }, + { + "epoch": 10.45, + "learning_rate": 0.008414285714285714, + "loss": 0.6411, + "step": 111 + }, + { + "epoch": 10.54, + "learning_rate": 0.0084, + "loss": 0.8016, + "step": 112 + }, + { + 
"epoch": 10.64, + "learning_rate": 0.008385714285714286, + "loss": 0.6633, + "step": 113 + }, + { + "epoch": 10.73, + "learning_rate": 0.008371428571428571, + "loss": 0.7257, + "step": 114 + }, + { + "epoch": 10.82, + "learning_rate": 0.008357142857142858, + "loss": 0.7785, + "step": 115 + }, + { + "epoch": 10.92, + "learning_rate": 0.008342857142857143, + "loss": 0.8927, + "step": 116 + }, + { + "epoch": 11.01, + "learning_rate": 0.008328571428571428, + "loss": 0.7242, + "step": 117 + }, + { + "epoch": 11.11, + "learning_rate": 0.008314285714285715, + "loss": 0.8297, + "step": 118 + }, + { + "epoch": 11.2, + "learning_rate": 0.0083, + "loss": 0.6761, + "step": 119 + }, + { + "epoch": 11.29, + "learning_rate": 0.008285714285714287, + "loss": 0.6699, + "step": 120 + }, + { + "epoch": 11.39, + "learning_rate": 0.008271428571428572, + "loss": 0.5365, + "step": 121 + }, + { + "epoch": 11.48, + "learning_rate": 0.008257142857142857, + "loss": 0.9045, + "step": 122 + }, + { + "epoch": 11.58, + "learning_rate": 0.008242857142857144, + "loss": 0.5071, + "step": 123 + }, + { + "epoch": 11.67, + "learning_rate": 0.008228571428571429, + "loss": 0.6472, + "step": 124 + }, + { + "epoch": 11.76, + "learning_rate": 0.008214285714285714, + "loss": 0.6232, + "step": 125 + }, + { + "epoch": 11.86, + "learning_rate": 0.008199999999999999, + "loss": 0.4905, + "step": 126 + }, + { + "epoch": 11.95, + "learning_rate": 0.008185714285714286, + "loss": 0.557, + "step": 127 + }, + { + "epoch": 12.05, + "learning_rate": 0.008171428571428573, + "loss": 0.5517, + "step": 128 + }, + { + "epoch": 12.14, + "learning_rate": 0.008157142857142858, + "loss": 0.6321, + "step": 129 + }, + { + "epoch": 12.24, + "learning_rate": 0.008142857142857143, + "loss": 0.6619, + "step": 130 + }, + { + "epoch": 12.33, + "learning_rate": 0.008128571428571428, + "loss": 0.5524, + "step": 131 + }, + { + "epoch": 12.42, + "learning_rate": 0.008114285714285715, + "loss": 0.4688, + "step": 132 + }, + { + "epoch": 12.52, + "learning_rate": 0.008100000000000001, + "loss": 0.3717, + "step": 133 + }, + { + "epoch": 12.61, + "learning_rate": 0.008085714285714286, + "loss": 0.5118, + "step": 134 + }, + { + "epoch": 12.71, + "learning_rate": 0.008071428571428571, + "loss": 0.4521, + "step": 135 + }, + { + "epoch": 12.8, + "learning_rate": 0.008057142857142856, + "loss": 0.5865, + "step": 136 + }, + { + "epoch": 12.89, + "learning_rate": 0.008042857142857143, + "loss": 0.5977, + "step": 137 + }, + { + "epoch": 12.99, + "learning_rate": 0.008028571428571428, + "loss": 0.6977, + "step": 138 + }, + { + "epoch": 13.08, + "learning_rate": 0.008014285714285713, + "loss": 0.5625, + "step": 139 + }, + { + "epoch": 13.18, + "learning_rate": 0.008, + "loss": 0.3611, + "step": 140 + }, + { + "epoch": 13.27, + "learning_rate": 0.007985714285714287, + "loss": 0.5168, + "step": 141 + }, + { + "epoch": 13.36, + "learning_rate": 0.007971428571428572, + "loss": 0.4429, + "step": 142 + }, + { + "epoch": 13.46, + "learning_rate": 0.007957142857142857, + "loss": 0.4998, + "step": 143 + }, + { + "epoch": 13.55, + "learning_rate": 0.007942857142857142, + "loss": 0.4437, + "step": 144 + }, + { + "epoch": 13.65, + "learning_rate": 0.007928571428571429, + "loss": 0.4958, + "step": 145 + }, + { + "epoch": 13.74, + "learning_rate": 0.007914285714285716, + "loss": 0.4021, + "step": 146 + }, + { + "epoch": 13.84, + "learning_rate": 0.0079, + "loss": 0.6163, + "step": 147 + }, + { + "epoch": 13.93, + "learning_rate": 0.007885714285714286, + "loss": 0.406, + "step": 148 + }, + { + 
"epoch": 14.02, + "learning_rate": 0.007871428571428571, + "loss": 0.4905, + "step": 149 + }, + { + "epoch": 14.12, + "learning_rate": 0.007857142857142858, + "loss": 0.3824, + "step": 150 + }, + { + "epoch": 14.21, + "learning_rate": 0.007842857142857143, + "loss": 0.3591, + "step": 151 + }, + { + "epoch": 14.31, + "learning_rate": 0.007828571428571428, + "loss": 0.342, + "step": 152 + }, + { + "epoch": 14.4, + "learning_rate": 0.007814285714285715, + "loss": 0.4565, + "step": 153 + }, + { + "epoch": 14.49, + "learning_rate": 0.0078000000000000005, + "loss": 0.3287, + "step": 154 + }, + { + "epoch": 14.59, + "learning_rate": 0.007785714285714286, + "loss": 0.4179, + "step": 155 + }, + { + "epoch": 14.68, + "learning_rate": 0.0077714285714285715, + "loss": 0.3586, + "step": 156 + }, + { + "epoch": 14.78, + "learning_rate": 0.007757142857142857, + "loss": 0.4618, + "step": 157 + }, + { + "epoch": 14.87, + "learning_rate": 0.0077428571428571425, + "loss": 0.4133, + "step": 158 + }, + { + "epoch": 14.96, + "learning_rate": 0.007728571428571429, + "loss": 0.4326, + "step": 159 + }, + { + "epoch": 15.06, + "learning_rate": 0.007714285714285715, + "loss": 0.3838, + "step": 160 + }, + { + "epoch": 15.15, + "learning_rate": 0.0077, + "loss": 0.2978, + "step": 161 + }, + { + "epoch": 15.25, + "learning_rate": 0.007685714285714286, + "loss": 0.3993, + "step": 162 + }, + { + "epoch": 15.34, + "learning_rate": 0.007671428571428571, + "loss": 0.3249, + "step": 163 + }, + { + "epoch": 15.44, + "learning_rate": 0.007657142857142857, + "loss": 0.2796, + "step": 164 + }, + { + "epoch": 15.53, + "learning_rate": 0.007642857142857142, + "loss": 0.3918, + "step": 165 + }, + { + "epoch": 15.62, + "learning_rate": 0.007628571428571429, + "loss": 0.4122, + "step": 166 + }, + { + "epoch": 15.72, + "learning_rate": 0.007614285714285715, + "loss": 0.3403, + "step": 167 + }, + { + "epoch": 15.81, + "learning_rate": 0.0076, + "loss": 0.3759, + "step": 168 + }, + { + "epoch": 15.91, + "learning_rate": 0.007585714285714286, + "loss": 0.3621, + "step": 169 + }, + { + "epoch": 16.0, + "learning_rate": 0.007571428571428571, + "loss": 0.2991, + "step": 170 + }, + { + "epoch": 16.09, + "learning_rate": 0.007557142857142857, + "loss": 0.3039, + "step": 171 + }, + { + "epoch": 16.19, + "learning_rate": 0.007542857142857144, + "loss": 0.4571, + "step": 172 + }, + { + "epoch": 16.28, + "learning_rate": 0.007528571428571429, + "loss": 0.2759, + "step": 173 + }, + { + "epoch": 16.38, + "learning_rate": 0.007514285714285715, + "loss": 0.2835, + "step": 174 + }, + { + "epoch": 16.47, + "learning_rate": 0.0075, + "loss": 0.3221, + "step": 175 + }, + { + "epoch": 16.56, + "learning_rate": 0.007485714285714286, + "loss": 0.3072, + "step": 176 + }, + { + "epoch": 16.66, + "learning_rate": 0.007471428571428572, + "loss": 0.2852, + "step": 177 + }, + { + "epoch": 16.75, + "learning_rate": 0.007457142857142857, + "loss": 0.2559, + "step": 178 + }, + { + "epoch": 16.85, + "learning_rate": 0.007442857142857143, + "loss": 0.2787, + "step": 179 + }, + { + "epoch": 16.94, + "learning_rate": 0.007428571428571429, + "loss": 0.3331, + "step": 180 + }, + { + "epoch": 17.04, + "learning_rate": 0.007414285714285714, + "loss": 0.1929, + "step": 181 + }, + { + "epoch": 17.13, + "learning_rate": 0.0074, + "loss": 0.2065, + "step": 182 + }, + { + "epoch": 17.22, + "learning_rate": 0.007385714285714285, + "loss": 0.2868, + "step": 183 + }, + { + "epoch": 17.32, + "learning_rate": 0.007371428571428571, + "loss": 0.2206, + "step": 184 + }, + { + "epoch": 
17.41, + "learning_rate": 0.007357142857142858, + "loss": 0.2355, + "step": 185 + }, + { + "epoch": 17.51, + "learning_rate": 0.007342857142857143, + "loss": 0.3041, + "step": 186 + }, + { + "epoch": 17.6, + "learning_rate": 0.007328571428571429, + "loss": 0.3028, + "step": 187 + }, + { + "epoch": 17.69, + "learning_rate": 0.007314285714285714, + "loss": 0.2435, + "step": 188 + }, + { + "epoch": 17.79, + "learning_rate": 0.0073, + "loss": 0.1869, + "step": 189 + }, + { + "epoch": 17.88, + "learning_rate": 0.007285714285714285, + "loss": 0.3036, + "step": 190 + }, + { + "epoch": 17.98, + "learning_rate": 0.007271428571428571, + "loss": 0.246, + "step": 191 + }, + { + "epoch": 18.07, + "learning_rate": 0.007257142857142858, + "loss": 0.2316, + "step": 192 + }, + { + "epoch": 18.16, + "learning_rate": 0.007242857142857143, + "loss": 0.186, + "step": 193 + }, + { + "epoch": 18.26, + "learning_rate": 0.007228571428571429, + "loss": 0.2616, + "step": 194 + }, + { + "epoch": 18.35, + "learning_rate": 0.007214285714285715, + "loss": 0.2824, + "step": 195 + }, + { + "epoch": 18.45, + "learning_rate": 0.0072, + "loss": 0.2, + "step": 196 + }, + { + "epoch": 18.54, + "learning_rate": 0.007185714285714286, + "loss": 0.1978, + "step": 197 + }, + { + "epoch": 18.64, + "learning_rate": 0.007171428571428572, + "loss": 0.1897, + "step": 198 + }, + { + "epoch": 18.73, + "learning_rate": 0.007157142857142858, + "loss": 0.1958, + "step": 199 + }, + { + "epoch": 18.82, + "learning_rate": 0.0071428571428571435, + "loss": 0.203, + "step": 200 + }, + { + "epoch": 18.92, + "learning_rate": 0.0071285714285714286, + "loss": 0.2451, + "step": 201 + }, + { + "epoch": 19.01, + "learning_rate": 0.0071142857142857145, + "loss": 0.2045, + "step": 202 + }, + { + "epoch": 19.11, + "learning_rate": 0.0070999999999999995, + "loss": 0.1937, + "step": 203 + }, + { + "epoch": 19.2, + "learning_rate": 0.0070857142857142855, + "loss": 0.1814, + "step": 204 + }, + { + "epoch": 19.29, + "learning_rate": 0.007071428571428572, + "loss": 0.1869, + "step": 205 + }, + { + "epoch": 19.39, + "learning_rate": 0.007057142857142857, + "loss": 0.2089, + "step": 206 + }, + { + "epoch": 19.48, + "learning_rate": 0.007042857142857143, + "loss": 0.1924, + "step": 207 + }, + { + "epoch": 19.58, + "learning_rate": 0.007028571428571428, + "loss": 0.1512, + "step": 208 + }, + { + "epoch": 19.67, + "learning_rate": 0.007014285714285714, + "loss": 0.1375, + "step": 209 + }, + { + "epoch": 19.76, + "learning_rate": 0.006999999999999999, + "loss": 0.187, + "step": 210 + }, + { + "epoch": 19.86, + "learning_rate": 0.006985714285714286, + "loss": 0.2488, + "step": 211 + }, + { + "epoch": 19.95, + "learning_rate": 0.006971428571428572, + "loss": 0.1864, + "step": 212 + }, + { + "epoch": 20.05, + "learning_rate": 0.006957142857142857, + "loss": 0.1984, + "step": 213 + }, + { + "epoch": 20.14, + "learning_rate": 0.006942857142857143, + "loss": 0.156, + "step": 214 + }, + { + "epoch": 20.24, + "learning_rate": 0.006928571428571429, + "loss": 0.2082, + "step": 215 + }, + { + "epoch": 20.33, + "learning_rate": 0.006914285714285714, + "loss": 0.094, + "step": 216 + }, + { + "epoch": 20.42, + "learning_rate": 0.0069, + "loss": 0.1784, + "step": 217 + }, + { + "epoch": 20.52, + "learning_rate": 0.006885714285714287, + "loss": 0.1293, + "step": 218 + }, + { + "epoch": 20.61, + "learning_rate": 0.006871428571428572, + "loss": 0.1635, + "step": 219 + }, + { + "epoch": 20.71, + "learning_rate": 0.006857142857142858, + "loss": 0.1668, + "step": 220 + }, + { + "epoch": 
20.8, + "learning_rate": 0.006842857142857143, + "loss": 0.1946, + "step": 221 + }, + { + "epoch": 20.89, + "learning_rate": 0.006828571428571429, + "loss": 0.2347, + "step": 222 + }, + { + "epoch": 20.99, + "learning_rate": 0.006814285714285714, + "loss": 0.1523, + "step": 223 + }, + { + "epoch": 21.08, + "learning_rate": 0.0068000000000000005, + "loss": 0.1337, + "step": 224 + }, + { + "epoch": 21.18, + "learning_rate": 0.006785714285714286, + "loss": 0.1511, + "step": 225 + }, + { + "epoch": 21.27, + "learning_rate": 0.0067714285714285715, + "loss": 0.1058, + "step": 226 + }, + { + "epoch": 21.36, + "learning_rate": 0.006757142857142857, + "loss": 0.172, + "step": 227 + }, + { + "epoch": 21.46, + "learning_rate": 0.0067428571428571425, + "loss": 0.1077, + "step": 228 + }, + { + "epoch": 21.55, + "learning_rate": 0.006728571428571428, + "loss": 0.1993, + "step": 229 + }, + { + "epoch": 21.65, + "learning_rate": 0.006714285714285714, + "loss": 0.1414, + "step": 230 + }, + { + "epoch": 21.74, + "learning_rate": 0.0067, + "loss": 0.126, + "step": 231 + }, + { + "epoch": 21.84, + "learning_rate": 0.006685714285714286, + "loss": 0.1528, + "step": 232 + }, + { + "epoch": 21.93, + "learning_rate": 0.006671428571428571, + "loss": 0.1316, + "step": 233 + }, + { + "epoch": 22.02, + "learning_rate": 0.006657142857142857, + "loss": 0.1565, + "step": 234 + }, + { + "epoch": 22.12, + "learning_rate": 0.006642857142857143, + "loss": 0.1088, + "step": 235 + }, + { + "epoch": 22.21, + "learning_rate": 0.006628571428571428, + "loss": 0.088, + "step": 236 + }, + { + "epoch": 22.31, + "learning_rate": 0.006614285714285715, + "loss": 0.1348, + "step": 237 + }, + { + "epoch": 22.4, + "learning_rate": 0.006600000000000001, + "loss": 0.1702, + "step": 238 + }, + { + "epoch": 22.49, + "learning_rate": 0.006585714285714286, + "loss": 0.132, + "step": 239 + }, + { + "epoch": 22.59, + "learning_rate": 0.006571428571428572, + "loss": 0.1115, + "step": 240 + }, + { + "epoch": 22.68, + "learning_rate": 0.006557142857142857, + "loss": 0.1173, + "step": 241 + }, + { + "epoch": 22.78, + "learning_rate": 0.006542857142857143, + "loss": 0.0967, + "step": 242 + }, + { + "epoch": 22.87, + "learning_rate": 0.006528571428571428, + "loss": 0.1484, + "step": 243 + }, + { + "epoch": 22.96, + "learning_rate": 0.006514285714285715, + "loss": 0.1566, + "step": 244 + }, + { + "epoch": 23.06, + "learning_rate": 0.006500000000000001, + "loss": 0.162, + "step": 245 + }, + { + "epoch": 23.15, + "learning_rate": 0.006485714285714286, + "loss": 0.1099, + "step": 246 + }, + { + "epoch": 23.25, + "learning_rate": 0.0064714285714285716, + "loss": 0.1087, + "step": 247 + }, + { + "epoch": 23.34, + "learning_rate": 0.006457142857142857, + "loss": 0.116, + "step": 248 + }, + { + "epoch": 23.44, + "learning_rate": 0.0064428571428571425, + "loss": 0.1096, + "step": 249 + }, + { + "epoch": 23.53, + "learning_rate": 0.006428571428571429, + "loss": 0.0972, + "step": 250 + }, + { + "epoch": 23.62, + "learning_rate": 0.006414285714285714, + "loss": 0.0889, + "step": 251 + }, + { + "epoch": 23.72, + "learning_rate": 0.0064, + "loss": 0.1199, + "step": 252 + }, + { + "epoch": 23.81, + "learning_rate": 0.006385714285714286, + "loss": 0.1337, + "step": 253 + }, + { + "epoch": 23.91, + "learning_rate": 0.006371428571428571, + "loss": 0.0977, + "step": 254 + }, + { + "epoch": 24.0, + "learning_rate": 0.006357142857142857, + "loss": 0.146, + "step": 255 + }, + { + "epoch": 24.09, + "learning_rate": 0.006342857142857142, + "loss": 0.1102, + "step": 256 + }, + 
{ + "epoch": 24.19, + "learning_rate": 0.006328571428571429, + "loss": 0.1025, + "step": 257 + }, + { + "epoch": 24.28, + "learning_rate": 0.006314285714285715, + "loss": 0.09, + "step": 258 + }, + { + "epoch": 24.38, + "learning_rate": 0.0063, + "loss": 0.1302, + "step": 259 + }, + { + "epoch": 24.47, + "learning_rate": 0.006285714285714286, + "loss": 0.0739, + "step": 260 + }, + { + "epoch": 24.56, + "learning_rate": 0.006271428571428571, + "loss": 0.1172, + "step": 261 + }, + { + "epoch": 24.66, + "learning_rate": 0.006257142857142857, + "loss": 0.1048, + "step": 262 + }, + { + "epoch": 24.75, + "learning_rate": 0.006242857142857144, + "loss": 0.0977, + "step": 263 + }, + { + "epoch": 24.85, + "learning_rate": 0.006228571428571429, + "loss": 0.1056, + "step": 264 + }, + { + "epoch": 24.94, + "learning_rate": 0.006214285714285715, + "loss": 0.1252, + "step": 265 + }, + { + "epoch": 25.04, + "learning_rate": 0.0062, + "loss": 0.1107, + "step": 266 + }, + { + "epoch": 25.13, + "learning_rate": 0.006185714285714286, + "loss": 0.0887, + "step": 267 + }, + { + "epoch": 25.22, + "learning_rate": 0.006171428571428571, + "loss": 0.0836, + "step": 268 + }, + { + "epoch": 25.32, + "learning_rate": 0.0061571428571428576, + "loss": 0.0957, + "step": 269 + }, + { + "epoch": 25.41, + "learning_rate": 0.0061428571428571435, + "loss": 0.1165, + "step": 270 + }, + { + "epoch": 25.51, + "learning_rate": 0.0061285714285714285, + "loss": 0.1135, + "step": 271 + }, + { + "epoch": 25.6, + "learning_rate": 0.0061142857142857145, + "loss": 0.0901, + "step": 272 + }, + { + "epoch": 25.69, + "learning_rate": 0.0061, + "loss": 0.0751, + "step": 273 + }, + { + "epoch": 25.79, + "learning_rate": 0.0060857142857142854, + "loss": 0.109, + "step": 274 + }, + { + "epoch": 25.88, + "learning_rate": 0.006071428571428571, + "loss": 0.102, + "step": 275 + }, + { + "epoch": 25.98, + "learning_rate": 0.006057142857142858, + "loss": 0.0916, + "step": 276 + }, + { + "epoch": 26.07, + "learning_rate": 0.006042857142857143, + "loss": 0.0821, + "step": 277 + }, + { + "epoch": 26.16, + "learning_rate": 0.006028571428571429, + "loss": 0.0797, + "step": 278 + }, + { + "epoch": 26.26, + "learning_rate": 0.006014285714285714, + "loss": 0.0804, + "step": 279 + }, + { + "epoch": 26.35, + "learning_rate": 0.006, + "loss": 0.0987, + "step": 280 + }, + { + "epoch": 26.45, + "learning_rate": 0.005985714285714285, + "loss": 0.1192, + "step": 281 + }, + { + "epoch": 26.54, + "learning_rate": 0.005971428571428572, + "loss": 0.0699, + "step": 282 + }, + { + "epoch": 26.64, + "learning_rate": 0.005957142857142858, + "loss": 0.0902, + "step": 283 + }, + { + "epoch": 26.73, + "learning_rate": 0.005942857142857143, + "loss": 0.0916, + "step": 284 + }, + { + "epoch": 26.82, + "learning_rate": 0.005928571428571429, + "loss": 0.0753, + "step": 285 + }, + { + "epoch": 26.92, + "learning_rate": 0.005914285714285714, + "loss": 0.0964, + "step": 286 + }, + { + "epoch": 27.01, + "learning_rate": 0.0059, + "loss": 0.1108, + "step": 287 + }, + { + "epoch": 27.11, + "learning_rate": 0.005885714285714286, + "loss": 0.1062, + "step": 288 + }, + { + "epoch": 27.2, + "learning_rate": 0.005871428571428572, + "loss": 0.0846, + "step": 289 + }, + { + "epoch": 27.29, + "learning_rate": 0.005857142857142858, + "loss": 0.0986, + "step": 290 + }, + { + "epoch": 27.39, + "learning_rate": 0.005842857142857143, + "loss": 0.0713, + "step": 291 + }, + { + "epoch": 27.48, + "learning_rate": 0.005828571428571429, + "loss": 0.0829, + "step": 292 + }, + { + "epoch": 27.58, + 
"learning_rate": 0.0058142857142857145, + "loss": 0.1026, + "step": 293 + }, + { + "epoch": 27.67, + "learning_rate": 0.0058, + "loss": 0.0785, + "step": 294 + }, + { + "epoch": 27.76, + "learning_rate": 0.005785714285714286, + "loss": 0.0729, + "step": 295 + }, + { + "epoch": 27.86, + "learning_rate": 0.005771428571428572, + "loss": 0.0738, + "step": 296 + }, + { + "epoch": 27.95, + "learning_rate": 0.005757142857142857, + "loss": 0.079, + "step": 297 + }, + { + "epoch": 28.05, + "learning_rate": 0.005742857142857143, + "loss": 0.0761, + "step": 298 + }, + { + "epoch": 28.14, + "learning_rate": 0.005728571428571428, + "loss": 0.0792, + "step": 299 + }, + { + "epoch": 28.24, + "learning_rate": 0.005714285714285714, + "loss": 0.0881, + "step": 300 + }, + { + "epoch": 28.33, + "learning_rate": 0.005699999999999999, + "loss": 0.1073, + "step": 301 + }, + { + "epoch": 28.42, + "learning_rate": 0.005685714285714286, + "loss": 0.0686, + "step": 302 + }, + { + "epoch": 28.52, + "learning_rate": 0.005671428571428572, + "loss": 0.0701, + "step": 303 + }, + { + "epoch": 28.61, + "learning_rate": 0.005657142857142857, + "loss": 0.1114, + "step": 304 + }, + { + "epoch": 28.71, + "learning_rate": 0.005642857142857143, + "loss": 0.0595, + "step": 305 + }, + { + "epoch": 28.8, + "learning_rate": 0.005628571428571428, + "loss": 0.086, + "step": 306 + }, + { + "epoch": 28.89, + "learning_rate": 0.005614285714285714, + "loss": 0.0877, + "step": 307 + }, + { + "epoch": 28.99, + "learning_rate": 0.005600000000000001, + "loss": 0.0582, + "step": 308 + }, + { + "epoch": 29.08, + "learning_rate": 0.005585714285714286, + "loss": 0.0645, + "step": 309 + }, + { + "epoch": 29.18, + "learning_rate": 0.005571428571428572, + "loss": 0.1025, + "step": 310 + }, + { + "epoch": 29.27, + "learning_rate": 0.005557142857142857, + "loss": 0.0612, + "step": 311 + }, + { + "epoch": 29.36, + "learning_rate": 0.005542857142857143, + "loss": 0.0706, + "step": 312 + }, + { + "epoch": 29.46, + "learning_rate": 0.005528571428571429, + "loss": 0.0636, + "step": 313 + }, + { + "epoch": 29.55, + "learning_rate": 0.005514285714285714, + "loss": 0.0721, + "step": 314 + }, + { + "epoch": 29.65, + "learning_rate": 0.0055000000000000005, + "loss": 0.1062, + "step": 315 + }, + { + "epoch": 29.74, + "learning_rate": 0.0054857142857142865, + "loss": 0.0739, + "step": 316 + }, + { + "epoch": 29.84, + "learning_rate": 0.0054714285714285715, + "loss": 0.0688, + "step": 317 + }, + { + "epoch": 29.93, + "learning_rate": 0.0054571428571428575, + "loss": 0.0715, + "step": 318 + }, + { + "epoch": 30.02, + "learning_rate": 0.0054428571428571425, + "loss": 0.0628, + "step": 319 + }, + { + "epoch": 30.12, + "learning_rate": 0.0054285714285714284, + "loss": 0.0831, + "step": 320 + }, + { + "epoch": 30.21, + "learning_rate": 0.005414285714285715, + "loss": 0.0833, + "step": 321 + }, + { + "epoch": 30.31, + "learning_rate": 0.0054, + "loss": 0.09, + "step": 322 + }, + { + "epoch": 30.4, + "learning_rate": 0.005385714285714286, + "loss": 0.0469, + "step": 323 + }, + { + "epoch": 30.49, + "learning_rate": 0.005371428571428571, + "loss": 0.0631, + "step": 324 + }, + { + "epoch": 30.59, + "learning_rate": 0.005357142857142857, + "loss": 0.0685, + "step": 325 + }, + { + "epoch": 30.68, + "learning_rate": 0.005342857142857142, + "loss": 0.0798, + "step": 326 + }, + { + "epoch": 30.78, + "learning_rate": 0.005328571428571428, + "loss": 0.0653, + "step": 327 + }, + { + "epoch": 30.87, + "learning_rate": 0.005314285714285715, + "loss": 0.0615, + "step": 328 + }, + { + 
"epoch": 30.96, + "learning_rate": 0.0053, + "loss": 0.0548, + "step": 329 + }, + { + "epoch": 31.06, + "learning_rate": 0.005285714285714286, + "loss": 0.0592, + "step": 330 + }, + { + "epoch": 31.15, + "learning_rate": 0.005271428571428572, + "loss": 0.0628, + "step": 331 + }, + { + "epoch": 31.25, + "learning_rate": 0.005257142857142857, + "loss": 0.0604, + "step": 332 + }, + { + "epoch": 31.34, + "learning_rate": 0.005242857142857143, + "loss": 0.0833, + "step": 333 + }, + { + "epoch": 31.44, + "learning_rate": 0.005228571428571429, + "loss": 0.0748, + "step": 334 + }, + { + "epoch": 31.53, + "learning_rate": 0.005214285714285715, + "loss": 0.0495, + "step": 335 + }, + { + "epoch": 31.62, + "learning_rate": 0.005200000000000001, + "loss": 0.0589, + "step": 336 + }, + { + "epoch": 31.72, + "learning_rate": 0.005185714285714286, + "loss": 0.0655, + "step": 337 + }, + { + "epoch": 31.81, + "learning_rate": 0.005171428571428572, + "loss": 0.0695, + "step": 338 + }, + { + "epoch": 31.91, + "learning_rate": 0.005157142857142857, + "loss": 0.0609, + "step": 339 + }, + { + "epoch": 32.0, + "learning_rate": 0.005142857142857143, + "loss": 0.0636, + "step": 340 + }, + { + "epoch": 32.09, + "learning_rate": 0.005128571428571429, + "loss": 0.0606, + "step": 341 + }, + { + "epoch": 32.19, + "learning_rate": 0.0051142857142857144, + "loss": 0.0739, + "step": 342 + }, + { + "epoch": 32.28, + "learning_rate": 0.0051, + "loss": 0.0535, + "step": 343 + }, + { + "epoch": 32.38, + "learning_rate": 0.005085714285714285, + "loss": 0.0598, + "step": 344 + }, + { + "epoch": 32.47, + "learning_rate": 0.005071428571428571, + "loss": 0.06, + "step": 345 + }, + { + "epoch": 32.56, + "learning_rate": 0.005057142857142856, + "loss": 0.0734, + "step": 346 + }, + { + "epoch": 32.66, + "learning_rate": 0.005042857142857143, + "loss": 0.078, + "step": 347 + }, + { + "epoch": 32.75, + "learning_rate": 0.005028571428571429, + "loss": 0.0618, + "step": 348 + }, + { + "epoch": 32.85, + "learning_rate": 0.005014285714285714, + "loss": 0.0655, + "step": 349 + }, + { + "epoch": 32.94, + "learning_rate": 0.005, + "loss": 0.0615, + "step": 350 + }, + { + "epoch": 33.04, + "learning_rate": 0.004985714285714286, + "loss": 0.0556, + "step": 351 + }, + { + "epoch": 33.13, + "learning_rate": 0.004971428571428572, + "loss": 0.0637, + "step": 352 + }, + { + "epoch": 33.22, + "learning_rate": 0.004957142857142857, + "loss": 0.0518, + "step": 353 + }, + { + "epoch": 33.32, + "learning_rate": 0.004942857142857143, + "loss": 0.0466, + "step": 354 + }, + { + "epoch": 33.41, + "learning_rate": 0.004928571428571429, + "loss": 0.0732, + "step": 355 + }, + { + "epoch": 33.51, + "learning_rate": 0.004914285714285715, + "loss": 0.0584, + "step": 356 + }, + { + "epoch": 33.6, + "learning_rate": 0.0049, + "loss": 0.0586, + "step": 357 + }, + { + "epoch": 33.69, + "learning_rate": 0.004885714285714286, + "loss": 0.0481, + "step": 358 + }, + { + "epoch": 33.79, + "learning_rate": 0.004871428571428572, + "loss": 0.0552, + "step": 359 + }, + { + "epoch": 33.88, + "learning_rate": 0.004857142857142858, + "loss": 0.0567, + "step": 360 + }, + { + "epoch": 33.98, + "learning_rate": 0.004842857142857143, + "loss": 0.0664, + "step": 361 + }, + { + "epoch": 34.07, + "learning_rate": 0.004828571428571429, + "loss": 0.0701, + "step": 362 + }, + { + "epoch": 34.16, + "learning_rate": 0.0048142857142857145, + "loss": 0.069, + "step": 363 + }, + { + "epoch": 34.26, + "learning_rate": 0.0048, + "loss": 0.066, + "step": 364 + }, + { + "epoch": 34.35, + 
"learning_rate": 0.004785714285714286, + "loss": 0.0546, + "step": 365 + }, + { + "epoch": 34.45, + "learning_rate": 0.004771428571428571, + "loss": 0.0616, + "step": 366 + }, + { + "epoch": 34.54, + "learning_rate": 0.004757142857142857, + "loss": 0.0374, + "step": 367 + }, + { + "epoch": 34.64, + "learning_rate": 0.004742857142857143, + "loss": 0.046, + "step": 368 + }, + { + "epoch": 34.73, + "learning_rate": 0.004728571428571428, + "loss": 0.0459, + "step": 369 + }, + { + "epoch": 34.82, + "learning_rate": 0.004714285714285714, + "loss": 0.0648, + "step": 370 + }, + { + "epoch": 34.92, + "learning_rate": 0.0047, + "loss": 0.0699, + "step": 371 + }, + { + "epoch": 35.01, + "learning_rate": 0.004685714285714286, + "loss": 0.0605, + "step": 372 + }, + { + "epoch": 35.11, + "learning_rate": 0.004671428571428571, + "loss": 0.0704, + "step": 373 + }, + { + "epoch": 35.2, + "learning_rate": 0.004657142857142857, + "loss": 0.0444, + "step": 374 + }, + { + "epoch": 35.29, + "learning_rate": 0.004642857142857143, + "loss": 0.062, + "step": 375 + }, + { + "epoch": 35.39, + "learning_rate": 0.004628571428571429, + "loss": 0.0464, + "step": 376 + }, + { + "epoch": 35.48, + "learning_rate": 0.004614285714285714, + "loss": 0.0548, + "step": 377 + }, + { + "epoch": 35.58, + "learning_rate": 0.0046, + "loss": 0.0555, + "step": 378 + }, + { + "epoch": 35.67, + "learning_rate": 0.004585714285714286, + "loss": 0.0654, + "step": 379 + }, + { + "epoch": 35.76, + "learning_rate": 0.004571428571428572, + "loss": 0.0592, + "step": 380 + }, + { + "epoch": 35.86, + "learning_rate": 0.004557142857142858, + "loss": 0.0521, + "step": 381 + }, + { + "epoch": 35.95, + "learning_rate": 0.004542857142857143, + "loss": 0.0633, + "step": 382 + }, + { + "epoch": 36.05, + "learning_rate": 0.004528571428571429, + "loss": 0.047, + "step": 383 + }, + { + "epoch": 36.14, + "learning_rate": 0.004514285714285714, + "loss": 0.0476, + "step": 384 + }, + { + "epoch": 36.24, + "learning_rate": 0.0045000000000000005, + "loss": 0.051, + "step": 385 + }, + { + "epoch": 36.33, + "learning_rate": 0.004485714285714286, + "loss": 0.064, + "step": 386 + }, + { + "epoch": 36.42, + "learning_rate": 0.0044714285714285715, + "loss": 0.0309, + "step": 387 + }, + { + "epoch": 36.52, + "learning_rate": 0.0044571428571428574, + "loss": 0.0632, + "step": 388 + }, + { + "epoch": 36.61, + "learning_rate": 0.004442857142857143, + "loss": 0.0583, + "step": 389 + }, + { + "epoch": 36.71, + "learning_rate": 0.004428571428571428, + "loss": 0.0524, + "step": 390 + }, + { + "epoch": 36.8, + "learning_rate": 0.004414285714285714, + "loss": 0.0574, + "step": 391 + }, + { + "epoch": 36.89, + "learning_rate": 0.0044, + "loss": 0.043, + "step": 392 + }, + { + "epoch": 36.99, + "learning_rate": 0.004385714285714285, + "loss": 0.0482, + "step": 393 + }, + { + "epoch": 37.08, + "learning_rate": 0.004371428571428572, + "loss": 0.0585, + "step": 394 + }, + { + "epoch": 37.18, + "learning_rate": 0.004357142857142857, + "loss": 0.0467, + "step": 395 + }, + { + "epoch": 37.27, + "learning_rate": 0.004342857142857143, + "loss": 0.0498, + "step": 396 + }, + { + "epoch": 37.36, + "learning_rate": 0.004328571428571429, + "loss": 0.0578, + "step": 397 + }, + { + "epoch": 37.46, + "learning_rate": 0.004314285714285714, + "loss": 0.0469, + "step": 398 + }, + { + "epoch": 37.55, + "learning_rate": 0.0043, + "loss": 0.0447, + "step": 399 + }, + { + "epoch": 37.65, + "learning_rate": 0.004285714285714286, + "loss": 0.0669, + "step": 400 + } + ], + "logging_steps": 1.0, + 
"max_steps": 700, + "num_train_epochs": 70, + "save_steps": 100, + "total_flos": 4.700697287196672e+17, + "trial_name": null, + "trial_params": null +} diff --git a/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/training_args.bin b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17f9bfbf1a7cdd9e0e808e0672d55ad9ad4efb5f --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00669a32a6ddac0a3243bbc04d3f1f70ffc8f89f2626c1fdafa93ce68c311aa0 +size 4664 diff --git a/linghua_pt-20231202-155337-128-1e-2/train.log b/linghua_pt-20231202-155337-128-1e-2/train.log new file mode 100644 index 0000000000000000000000000000000000000000..f0083873ad0e076c5283417ad46ac56382c053a5 --- /dev/null +++ b/linghua_pt-20231202-155337-128-1e-2/train.log @@ -0,0 +1,2712 @@ +[2023-12-02 15:53:38,497] torch.distributed.run: [WARNING] master_addr is only used for static rdzv_backend and when rdzv_endpoint is not specified. +12/02/2023 15:53:40 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +12/02/2023 15:53:40 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_tqdm=False, +dispatch_batches=None, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +generation_config=None, +generation_max_length=None, +generation_num_beams=None, +gradient_accumulation_steps=16, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=0.01, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=output/linghua_pt-20231202-155337-128-1e-2/runs/Dec02_15-53-40_2e1f45f46fdb, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_type=linear, +max_grad_norm=1.0, +max_steps=700, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=3.0, +optim=adamw_torch, +optim_args=None, +output_dir=output/linghua_pt-20231202-155337-128-1e-2, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +predict_with_generate=False, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, 
+ray_scope=last, +remove_unused_columns=True, +report_to=[], +resume_from_checkpoint=None, +run_name=output/linghua_pt-20231202-155337-128-1e-2, +save_on_each_node=False, +save_safetensors=False, +save_steps=100, +save_strategy=steps, +save_total_limit=None, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +sortish_sampler=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=0, +weight_decay=0.0, +) +[INFO|configuration_utils.py:713] 2023-12-02 15:53:40,603 >> loading configuration file chatglm3-6b/config.json +[INFO|configuration_utils.py:713] 2023-12-02 15:53:40,607 >> loading configuration file chatglm3-6b/config.json +[INFO|configuration_utils.py:775] 2023-12-02 15:53:40,608 >> Model config ChatGLMConfig { + "_name_or_path": "chatglm3-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMModel" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": null, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 8192, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.34.0", + "use_cache": true, + "vocab_size": 65024 +} + +[INFO|tokenization_utils_base.py:2041] 2023-12-02 15:53:40,612 >> loading file tokenizer.model +[INFO|tokenization_utils_base.py:2041] 2023-12-02 15:53:40,612 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2041] 2023-12-02 15:53:40,612 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2041] 2023-12-02 15:53:40,612 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2041] 2023-12-02 15:53:40,612 >> loading file tokenizer.json +[INFO|modeling_utils.py:2990] 2023-12-02 15:53:40,832 >> loading weights file chatglm3-6b/pytorch_model.bin.index.json +[INFO|configuration_utils.py:770] 2023-12-02 15:53:40,833 >> Generate config GenerationConfig { + "eos_token_id": 2, + "pad_token_id": 0 +} + + Loading checkpoint shards: 0%| | 0/7 [00:00> All model checkpoint weights were used when initializing ChatGLMForConditionalGeneration. 
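Note on the run below: only the prefix encoder is trained here, so the "newly initialized" warning that follows is expected for P-tuning v2, and the backbone stays frozen. A minimal sketch (assuming `prefix_projection=False`, and `pre_seq_len=128` taken from the "128" in the run name) of where the trainer's reported `Number of trainable parameters = 1,835,008` comes from:

```python
# Sketch: reproduce the "Number of trainable parameters = 1,835,008" figure
# reported by the trainer below. Assumes P-tuning v2 with prefix_projection=False,
# where the PrefixEncoder is a single embedding of shape
# (pre_seq_len, num_layers * kv_channels * multi_query_group_num * 2):
# one key and one value vector per layer, per KV group, per prefix position.
num_layers = 28              # from config.json
kv_channels = 128            # per-head dimension, from config.json
multi_query_group_num = 2    # KV groups under multi-query attention, from config.json
pre_seq_len = 128            # prefix length for this run (assumed from the run name)

kv_dim = num_layers * kv_channels * multi_query_group_num * 2   # 14336
print(pre_seq_len * kv_dim)                                     # 1835008
```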
+ +[WARNING|modeling_utils.py:3777] 2023-12-02 15:53:50,295 >> Some weights of ChatGLMForConditionalGeneration were not initialized from the model checkpoint at chatglm3-6b and are newly initialized: ['transformer.prefix_encoder.embedding.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +[INFO|modeling_utils.py:3352] 2023-12-02 15:53:50,297 >> Generation config file not found, using a generation config created from the model config. +Sanity Check >>>>>>>>>>>>> + '[gMASK]': 64790 -> -100 + 'sop': 64792 -> -100 + '<|system|>': 64794 -> -100 + '': 30910 -> -100 + '\n': 13 -> -100 + '': 30910 -> -100 + '你的': 31822 -> -100 + '名字': 32873 -> -100 + '是': 54532 -> -100 + '神': 54826 -> -100 + '里': 54662 -> -100 + '绫': 60309 -> -100 + '华': 54855 -> -100 + ',': 31123 -> -100 + '你是': 34607 -> -100 + '稻': 56929 -> -100 + '妻': 55769 -> -100 + '「': 31519 -> -100 + '社': 54731 -> -100 + '奉': 56053 -> -100 + '行': 54560 -> -100 + '」': 31522 -> -100 + '神': 54826 -> -100 + '里': 54662 -> -100 + '家': 54561 -> -100 + '的大': 31922 -> -100 + '小姐': 36028 -> -100 + '。': 31155 -> -100 + '请': 55073 -> -100 + '详细的': 42196 -> -100 + '回答': 33287 -> -100 + '用户': 32053 -> -100 + '的一切': 34688 -> -100 + '问题': 31639 -> -100 + '。': 31155 -> -100 + '<|user|>': 64795 -> -100 + '': 30910 -> -100 + '\n': 13 -> -100 + '你': 36474 -> -100 + '好': 54591 -> -100 + '呀': 56657 -> -100 + '!': 31404 -> -100 + '<|assistant|>': 64796 -> -100 + '': 30910 -> 30910 + '\n': 13 -> 13 + '你': 36474 -> 36474 + '好': 54591 -> 54591 + '呀': 56657 -> 56657 + ',': 31123 -> 31123 + '旅行': 33450 -> 33450 + '者': 54631 -> 54631 + '!': 31404 -> 31404 + '我是': 33030 -> 33030 + '神': 54826 -> 54826 + '里': 54662 -> 54662 + '绫': 60309 -> 60309 + '华': 54855 -> 54855 + ',': 31123 -> 31123 + '很高兴': 48895 -> 48895 + '认识': 32254 -> 32254 + '你': 54622 -> 54622 + '!': 31404 -> 31404 + '': 2 -> 2 + '': 0 -> -100 + '': 0 -> -100 + '': 0 -> -100 + [the pad-token mapping '': 0 -> -100 repeats identically for every remaining padding position] +<<<<<<<<<<<<< Sanity Check
+[INFO|trainer.py:576] 2023-12-02 15:53:52,690 >> max_steps is given, it will override any value given in num_train_epochs +[INFO|trainer.py:1760] 2023-12-02 15:53:53,665 >> ***** Running training ***** +[INFO|trainer.py:1761] 2023-12-02 15:53:53,665 >> Num examples = 170 +[INFO|trainer.py:1762] 2023-12-02 15:53:53,665 >> Num Epochs = 70 +[INFO|trainer.py:1763] 2023-12-02 15:53:53,666 >> Instantaneous batch size per device = 1 +[INFO|trainer.py:1766] 2023-12-02 15:53:53,666 >> Total train batch size (w. parallel, distributed & accumulation) = 16 +[INFO|trainer.py:1767] 2023-12-02 15:53:53,666 >> Gradient Accumulation steps = 16 +[INFO|trainer.py:1768] 2023-12-02 15:53:53,666 >> Total optimization steps = 700 +[INFO|trainer.py:1769] 2023-12-02 15:53:53,666 >> Number of trainable parameters = 1,835,008 + 0%| | 0/700 [00:00> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/config.json +[INFO|configuration_utils.py:544] 2023-12-02 16:14:12,099 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/generation_config.json +[INFO|modeling_utils.py:2118] 2023-12-02 16:14:12,111 >> Model weights saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/pytorch_model.bin +[INFO|tokenization_utils_base.py:2437] 2023-12-02 16:14:12,112 >> tokenizer config file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/tokenizer_config.json +[INFO|tokenization_utils_base.py:2446] 2023-12-02 16:14:12,112 >> Special tokens file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-100/special_tokens_map.json +/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly.
The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. + warnings.warn( + 14%|█████████████▊ | 101/700 [20:30<2:01:40, 12.19s/it] {'loss': 0.7104, 'learning_rate': 0.008557142857142859, 'epoch': 9.51} + 14%|█████████████▊ | 101/700 [20:30<2:01:40, 12.19s/it] 15%|█████████████▉ | 102/700 [20:42<2:01:26, 12.18s/it] {'loss': 0.9672, 'learning_rate': 0.008542857142857144, 'epoch': 9.6} + 15%|█████████████▉ | 102/700 [20:42<2:01:26, 12.18s/it] 15%|██████████████▏ | 103/700 [20:54<2:01:10, 12.18s/it] {'loss': 0.7593, 'learning_rate': 0.008528571428571429, 'epoch': 9.69} + 15%|██████████████▏ | 103/700 [20:54<2:01:10, 12.18s/it] 15%|██████████████▎ | 104/700 [21:07<2:00:59, 12.18s/it] {'loss': 1.0186, 'learning_rate': 0.008514285714285714, 'epoch': 9.79} + 15%|██████████████▎ | 104/700 [21:07<2:00:59, 12.18s/it] 15%|██████████████▍ | 105/700 [21:19<2:00:45, 12.18s/it] {'loss': 0.7898, 'learning_rate': 0.0085, 'epoch': 9.88} + 15%|██████████████▍ | 105/700 [21:19<2:00:45, 12.18s/it] 15%|██████████████▌ | 106/700 [21:31<2:00:33, 12.18s/it] {'loss': 0.7392, 'learning_rate': 0.008485714285714286, 'epoch': 9.98} + 15%|██████████████▌ | 106/700 [21:31<2:00:33, 12.18s/it] 15%|██████████████▋ | 107/700 [21:43<2:00:20, 12.18s/it] {'loss': 0.7295, 'learning_rate': 0.008471428571428572, 'epoch': 10.07} + 15%|██████████████▋ | 107/700 [21:43<2:00:20, 12.18s/it] 15%|██████████████▊ | 108/700 [21:55<2:00:08, 12.18s/it] {'loss': 0.7211, 'learning_rate': 0.008457142857142858, 'epoch': 10.16} + 15%|██████████████▊ | 108/700 [21:55<2:00:08, 12.18s/it] 16%|██████████████▉ | 109/700 [22:07<1:59:54, 12.17s/it] {'loss': 0.769, 'learning_rate': 0.008442857142857143, 'epoch': 10.26} + 16%|██████████████▉ | 109/700 [22:07<1:59:54, 12.17s/it] 16%|███████████████ | 110/700 [22:20<1:59:43, 12.18s/it] {'loss': 0.718, 'learning_rate': 0.00842857142857143, 'epoch': 10.35} + 16%|███████████████ | 110/700 [22:20<1:59:43, 12.18s/it] 16%|███████████████▏ | 111/700 [22:32<1:59:29, 12.17s/it] {'loss': 0.6411, 'learning_rate': 0.008414285714285714, 'epoch': 10.45} + 16%|███████████████▏ | 111/700 [22:32<1:59:29, 12.17s/it] 16%|███████████████▎ | 112/700 [22:44<1:59:16, 12.17s/it] {'loss': 0.8016, 'learning_rate': 0.0084, 'epoch': 10.54} + 16%|███████████████▎ | 112/700 [22:44<1:59:16, 12.17s/it] 16%|███████████████▍ | 113/700 [22:56<1:59:03, 12.17s/it] {'loss': 0.6633, 'learning_rate': 0.008385714285714286, 'epoch': 10.64} + 16%|███████████████▍ | 113/700 [22:56<1:59:03, 12.17s/it] 16%|███████████████▋ | 114/700 [23:08<1:58:50, 12.17s/it] {'loss': 0.7257, 'learning_rate': 0.008371428571428571, 'epoch': 10.73} + 16%|███████████████▋ | 114/700 [23:08<1:58:50, 12.17s/it] 16%|███████████████▊ | 115/700 [23:21<1:58:38, 12.17s/it] {'loss': 0.7785, 'learning_rate': 0.008357142857142858, 'epoch': 10.82} + 16%|███████████████▊ | 115/700 [23:21<1:58:38, 12.17s/it] 17%|███████████████▉ | 116/700 [23:33<1:58:27, 12.17s/it] {'loss': 0.8927, 'learning_rate': 0.008342857142857143, 'epoch': 10.92} + 17%|███████████████▉ | 116/700 [23:33<1:58:27, 12.17s/it] 17%|████████████████ | 117/700 [23:45<1:58:19, 12.18s/it] {'loss': 0.7242, 'learning_rate': 0.008328571428571428, 'epoch': 11.01} + 17%|████████████████ | 117/700 [23:45<1:58:19, 12.18s/it] 17%|████████████████▏ | 118/700 [23:57<1:58:07, 12.18s/it] {'loss': 0.8297, 'learning_rate': 
0.008314285714285715, 'epoch': 11.11} + 17%|████████████████▏ | 118/700 [23:57<1:58:07, 12.18s/it] 17%|████████████████▎ | 119/700 [24:09<1:57:56, 12.18s/it] {'loss': 0.6761, 'learning_rate': 0.0083, 'epoch': 11.2} + 17%|████████████████▎ | 119/700 [24:09<1:57:56, 12.18s/it] 17%|████████████████▍ | 120/700 [24:21<1:57:41, 12.18s/it] {'loss': 0.6699, 'learning_rate': 0.008285714285714287, 'epoch': 11.29} + 17%|████████████████▍ | 120/700 [24:21<1:57:41, 12.18s/it] 17%|████████████████▌ | 121/700 [24:34<1:57:28, 12.17s/it] {'loss': 0.5365, 'learning_rate': 0.008271428571428572, 'epoch': 11.39} + 17%|████████████████▌ | 121/700 [24:34<1:57:28, 12.17s/it] 17%|████████████████▋ | 122/700 [24:46<1:57:15, 12.17s/it] {'loss': 0.9045, 'learning_rate': 0.008257142857142857, 'epoch': 11.48} + 17%|████████████████▋ | 122/700 [24:46<1:57:15, 12.17s/it] 18%|████████████████▊ | 123/700 [24:58<1:57:04, 12.17s/it] {'loss': 0.5071, 'learning_rate': 0.008242857142857144, 'epoch': 11.58} + 18%|████████████████▊ | 123/700 [24:58<1:57:04, 12.17s/it] 18%|█████████████████ | 124/700 [25:10<1:56:51, 12.17s/it] {'loss': 0.6472, 'learning_rate': 0.008228571428571429, 'epoch': 11.67} + 18%|█████████████████ | 124/700 [25:10<1:56:51, 12.17s/it] 18%|█████████████████▏ | 125/700 [25:22<1:56:38, 12.17s/it] {'loss': 0.6232, 'learning_rate': 0.008214285714285714, 'epoch': 11.76} + 18%|█████████████████▏ | 125/700 [25:22<1:56:38, 12.17s/it] 18%|█████████████████▎ | 126/700 [25:34<1:56:25, 12.17s/it] {'loss': 0.4905, 'learning_rate': 0.008199999999999999, 'epoch': 11.86} + 18%|█████████████████▎ | 126/700 [25:34<1:56:25, 12.17s/it] 18%|█████████████████▍ | 127/700 [25:47<1:56:12, 12.17s/it] {'loss': 0.557, 'learning_rate': 0.008185714285714286, 'epoch': 11.95} + 18%|█████████████████▍ | 127/700 [25:47<1:56:12, 12.17s/it] 18%|█████████████████▌ | 128/700 [25:59<1:56:00, 12.17s/it] {'loss': 0.5517, 'learning_rate': 0.008171428571428573, 'epoch': 12.05} + 18%|█████████████████▌ | 128/700 [25:59<1:56:00, 12.17s/it] 18%|█████████████████▋ | 129/700 [26:11<1:55:50, 12.17s/it] {'loss': 0.6321, 'learning_rate': 0.008157142857142858, 'epoch': 12.14} + 18%|█████████████████▋ | 129/700 [26:11<1:55:50, 12.17s/it] 19%|█████████████████▊ | 130/700 [26:23<1:55:37, 12.17s/it] {'loss': 0.6619, 'learning_rate': 0.008142857142857143, 'epoch': 12.24} + 19%|█████████████████▊ | 130/700 [26:23<1:55:37, 12.17s/it] 19%|█████████████████▉ | 131/700 [26:35<1:55:27, 12.17s/it] {'loss': 0.5524, 'learning_rate': 0.008128571428571428, 'epoch': 12.33} + 19%|█████████████████▉ | 131/700 [26:35<1:55:27, 12.17s/it] 19%|██████████████████ | 132/700 [26:47<1:55:14, 12.17s/it] {'loss': 0.4688, 'learning_rate': 0.008114285714285715, 'epoch': 12.42} + 19%|██████████████████ | 132/700 [26:47<1:55:14, 12.17s/it] 19%|██████████████████▏ | 133/700 [27:00<1:55:01, 12.17s/it] {'loss': 0.3717, 'learning_rate': 0.008100000000000001, 'epoch': 12.52} + 19%|██████████████████▏ | 133/700 [27:00<1:55:01, 12.17s/it] 19%|██████████████████▍ | 134/700 [27:12<1:54:48, 12.17s/it] {'loss': 0.5118, 'learning_rate': 0.008085714285714286, 'epoch': 12.61} + 19%|██████████████████▍ | 134/700 [27:12<1:54:48, 12.17s/it] 19%|██████████████████▌ | 135/700 [27:24<1:54:38, 12.17s/it] {'loss': 0.4521, 'learning_rate': 0.008071428571428571, 'epoch': 12.71} + 19%|██████████████████▌ | 135/700 [27:24<1:54:38, 12.17s/it] 19%|██████████████████▋ | 136/700 [27:36<1:54:25, 12.17s/it] {'loss': 0.5865, 'learning_rate': 0.008057142857142856, 'epoch': 12.8} + 19%|██████████████████▋ | 136/700 
+[tqdm progress-bar repaints condensed below to one metrics row per optimizer step; iteration time held steady at 12.16-12.19 s/it throughout, elapsed 27:48 at step 137 -> 1:26:24 at step 426]
+ step/700  loss    learning_rate          epoch
+ 137       0.5977  0.008042857142857143   12.89
+ 138       0.6977  0.008028571428571428   12.99
+ 139       0.5625  0.008014285714285713   13.08
+ 140       0.3611  0.008                  13.18
+ 141       0.5168  0.007985714285714287   13.27
+ 142       0.4429  0.007971428571428572   13.36
+ 143       0.4998  0.007957142857142857   13.46
+ 144       0.4437  0.007942857142857142   13.55
+ 145       0.4958  0.007928571428571429   13.65
+ 146       0.4021  0.007914285714285716   13.74
+ 147       0.6163  0.0079                 13.84
+ 148       0.406   0.007885714285714286   13.93
+ 149       0.4905  0.007871428571428571   14.02
+ 150       0.3824  0.007857142857142858   14.12
+ 151       0.3591  0.007842857142857143   14.21
+ 152       0.342   0.007828571428571428   14.31
+ 153       0.4565  0.007814285714285715   14.4
+ 154       0.3287  0.0078000000000000005  14.49
+ 155       0.4179  0.007785714285714286   14.59
+ 156       0.3586  0.0077714285714285715  14.68
+ 157       0.4618  0.007757142857142857   14.78
+ 158       0.4133  0.0077428571428571425  14.87
+ 159       0.4326  0.007728571428571429   14.96
+ 160       0.3838  0.007714285714285715   15.06
+ 161       0.2978  0.0077                 15.15
+ 162       0.3993  0.007685714285714286   15.25
+ 163       0.3249  0.007671428571428571   15.34
+ 164       0.2796  0.007657142857142857   15.44
+ 165       0.3918  0.007642857142857142   15.53
+ 166       0.4122  0.007628571428571429   15.62
+ 167       0.3403  0.007614285714285715   15.72
+ 168       0.3759  0.0076                 15.81
+ 169       0.3621  0.007585714285714286   15.91
+ 170       0.2991  0.007571428571428571   16.0
+ 171       0.3039  0.007557142857142857   16.09
+ 172       0.4571  0.007542857142857144   16.19
+ 173       0.2759  0.007528571428571429   16.28
+ 174       0.2835  0.007514285714285715   16.38
+ 175       0.3221  0.0075                 16.47
+ 176       0.3072  0.007485714285714286   16.56
+ 177       0.2852  0.007471428571428572   16.66
+ 178       0.2559  0.007457142857142857   16.75
+ 179       0.2787  0.007442857142857143   16.85
+ 180       0.3331  0.007428571428571429   16.94
+ 181       0.1929  0.007414285714285714   17.04
+ 182       0.2065  0.0074                 17.13
+ 183       0.2868  0.007385714285714285   17.22
+ 184       0.2206  0.007371428571428571   17.32
+ 185       0.2355  0.007357142857142858   17.41
+ 186       0.3041  0.007342857142857143   17.51
+ 187       0.3028  0.007328571428571429   17.6
+ 188       0.2435  0.007314285714285714   17.69
+ 189       0.1869  0.0073                 17.79
+ 190       0.3036  0.007285714285714285   17.88
+ 191       0.246   0.007271428571428571   17.98
+ 192       0.2316  0.007257142857142858   18.07
+ 193       0.186   0.007242857142857143   18.16
+ 194       0.2616  0.007228571428571429   18.26
+ 195       0.2824  0.007214285714285715   18.35
+ 196       0.2     0.0072                 18.45
+ 197       0.1978  0.007185714285714286   18.54
+ 198       0.1897  0.007171428571428572   18.64
+ 199       0.1958  0.007157142857142858   18.73
+ 200       0.203   0.0071428571428571435  18.82
+Saving PrefixEncoder
+[INFO|configuration_utils.py:460] 2023-12-02 16:34:29,439 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/config.json
+[INFO|configuration_utils.py:544] 2023-12-02 16:34:29,439 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/generation_config.json
+[INFO|modeling_utils.py:2118] 2023-12-02 16:34:29,451 >> Model weights saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/pytorch_model.bin
+[INFO|tokenization_utils_base.py:2437] 2023-12-02 16:34:29,452 >> tokenizer config file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-12-02 16:34:29,452 >> Special tokens file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-200/special_tokens_map.json
+/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
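The UserWarning above comes from `torch.utils.checkpoint` being invoked without an explicit `use_reentrant` argument while gradient checkpointing is active. A minimal sketch of the fix, assuming a training script that calls the checkpoint API directly (the module and tensors below are illustrative, not taken from this repo):

```python
import torch
import torch.utils.checkpoint

# Illustrative module and input; any nn.Module checkpointed this way behaves the same.
layer = torch.nn.Linear(8, 8)
x = torch.randn(2, 8, requires_grad=True)

# Passing use_reentrant explicitly is what silences the warning seen in the log;
# use_reentrant=False is the variant PyTorch recommends going forward.
y = torch.utils.checkpoint.checkpoint(layer, x, use_reentrant=False)
y.sum().backward()
```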
+ step/700  loss    learning_rate          epoch
+ 201       0.2451  0.0071285714285714286  18.92
+ 202       0.2045  0.0071142857142857145  19.01
+ 203       0.1937  0.0070999999999999995  19.11
+ 204       0.1814  0.0070857142857142855  19.2
+ 205       0.1869  0.007071428571428572   19.29
+ 206       0.2089  0.007057142857142857   19.39
+ 207       0.1924  0.007042857142857143   19.48
+ 208       0.1512  0.007028571428571428   19.58
+ 209       0.1375  0.007014285714285714   19.67
+ 210       0.187   0.006999999999999999   19.76
+ 211       0.2488  0.006985714285714286   19.86
+ 212       0.1864  0.006971428571428572   19.95
+ 213       0.1984  0.006957142857142857   20.05
+ 214       0.156   0.006942857142857143   20.14
+ 215       0.2082  0.006928571428571429   20.24
+ 216       0.094   0.006914285714285714   20.33
+ 217       0.1784  0.0069                 20.42
+ 218       0.1293  0.006885714285714287   20.52
+ 219       0.1635  0.006871428571428572   20.61
+ 220       0.1668  0.006857142857142858   20.71
+ 221       0.1946  0.006842857142857143   20.8
+ 222       0.2347  0.006828571428571429   20.89
+ 223       0.1523  0.006814285714285714   20.99
+ 224       0.1337  0.0068000000000000005  21.08
+ 225       0.1511  0.006785714285714286   21.18
+ 226       0.1058  0.0067714285714285715  21.27
+ 227       0.172   0.006757142857142857   21.36
+ 228       0.1077  0.0067428571428571425  21.46
+ 229       0.1993  0.006728571428571428   21.55
+ 230       0.1414  0.006714285714285714   21.65
+ 231       0.126   0.0067                 21.74
+ 232       0.1528  0.006685714285714286   21.84
+ 233       0.1316  0.006671428571428571   21.93
+ 234       0.1565  0.006657142857142857   22.02
+ 235       0.1088  0.006642857142857143   22.12
+ 236       0.088   0.006628571428571428   22.21
+ 237       0.1348  0.006614285714285715   22.31
+ 238       0.1702  0.006600000000000001   22.4
+ 239       0.132   0.006585714285714286   22.49
+ 240       0.1115  0.006571428571428572   22.59
+ 241       0.1173  0.006557142857142857   22.68
+ 242       0.0967  0.006542857142857143   22.78
+ 243       0.1484  0.006528571428571428   22.87
+ 244       0.1566  0.006514285714285715   22.96
+ 245       0.162   0.006500000000000001   23.06
+ 246       0.1099  0.006485714285714286   23.15
+ 247       0.1087  0.0064714285714285716  23.25
+ 248       0.116   0.006457142857142857   23.34
+ 249       0.1096  0.0064428571428571425  23.44
+ 250       0.0972  0.006428571428571429   23.53
+ 251       0.0889  0.006414285714285714   23.62
+ 252       0.1199  0.0064                 23.72
+ 253       0.1337  0.006385714285714286   23.81
+ 254       0.0977  0.006371428571428571   23.91
+ 255       0.146   0.006357142857142857   24.0
+ 256       0.1102  0.006342857142857142   24.09
+ 257       0.1025  0.006328571428571429   24.19
+ 258       0.09    0.006314285714285715   24.28
+ 259       0.1302  0.0063                 24.38
+ 260       0.0739  0.006285714285714286   24.47
+ 261       0.1172  0.006271428571428571   24.56
+ 262       0.1048  0.006257142857142857   24.66
+ 263       0.0977  0.006242857142857144   24.75
+ 264       0.1056  0.006228571428571429   24.85
+ 265       0.1252  0.006214285714285715   24.94
+ 266       0.1107  0.0062                 25.04
+ 267       0.0887  0.006185714285714286   25.13
+ 268       0.0836  0.006171428571428571   25.22
+ 269       0.0957  0.0061571428571428576  25.32
+ 270       0.1165  0.0061428571428571435  25.41
+ 271       0.1135  0.0061285714285714285  25.51
+ 272       0.0901  0.0061142857142857145  25.6
+ 273       0.0751  0.0061                 25.69
+ 274       0.109   0.0060857142857142854  25.79
+ 275       0.102   0.006071428571428571   25.88
+ 276       0.0916  0.006057142857142858   25.98
+ 277       0.0821  0.006042857142857143   26.07
+ 278       0.0797  0.006028571428571429   26.16
+ 279       0.0804  0.006014285714285714   26.26
+ 280       0.0987  0.006                  26.35
+ 281       0.1192  0.005985714285714285   26.45
+ 282       0.0699  0.005971428571428572   26.54
+ 283       0.0902  0.005957142857142858   26.64
+ 284       0.0916  0.005942857142857143   26.73
+ 285       0.0753  0.005928571428571429   26.82
+ 286       0.0964  0.005914285714285714   26.92
+ 287       0.1108  0.0059                 27.01
+ 288       0.1062  0.005885714285714286   27.11
+ 289       0.0846  0.005871428571428572   27.2
+ 290       0.0986  0.005857142857142858   27.29
+ 291       0.0713  0.005842857142857143   27.39
+ 292       0.0829  0.005828571428571429   27.48
+ 293       0.1026  0.0058142857142857145  27.58
+ 294       0.0785  0.0058                 27.67
+ 295       0.0729  0.005785714285714286   27.76
+ 296       0.0738  0.005771428571428572   27.86
+ 297       0.079   0.005757142857142857   27.95
+ 298       0.0761  0.005742857142857143   28.05
+ 299       0.0792  0.005728571428571428   28.14
+ 300       0.0881  0.005714285714285714   28.24
+Saving PrefixEncoder
+[INFO|configuration_utils.py:460] 2023-12-02 16:54:45,783 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/config.json
+[INFO|configuration_utils.py:544] 2023-12-02 16:54:45,783 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/generation_config.json
+[INFO|modeling_utils.py:2118] 2023-12-02 16:54:45,791 >> Model weights saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/pytorch_model.bin
+[INFO|tokenization_utils_base.py:2437] 2023-12-02 16:54:45,792 >> tokenizer config file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-12-02 16:54:45,792 >> Special tokens file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300/special_tokens_map.json
+/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
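Each "Saving PrefixEncoder" block appears to write only the P-tuning prefix weights plus config/tokenizer files, which is why the checkpoints stay small even though the base model has 6B parameters. A sketch of how such a checkpoint is typically loaded back, following the loading pattern from the upstream ChatGLM P-tuning examples (pre_seq_len=128 matches this run's config; the base-model name and paths are taken from the log, the rest is an assumption):

```python
import os
import torch
from transformers import AutoConfig, AutoModel

CHECKPOINT_PATH = "output/linghua_pt-20231202-155337-128-1e-2/checkpoint-300"

# Recreate the model with the same prefix length used during training.
config = AutoConfig.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, pre_seq_len=128)
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", config=config, trust_remote_code=True)

# pytorch_model.bin in the checkpoint holds the PrefixEncoder weights;
# extract them and load them into the matching submodule.
prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"), map_location="cpu")
prefix_only = {
    k[len("transformer.prefix_encoder."):]: v
    for k, v in prefix_state_dict.items()
    if k.startswith("transformer.prefix_encoder.")
}
model.transformer.prefix_encoder.load_state_dict(prefix_only)
```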
+ step/700  loss    learning_rate          epoch
+ 301       0.1073  0.005699999999999999   28.33
+ 302       0.0686  0.005685714285714286   28.42
+ 303       0.0701  0.005671428571428572   28.52
+ 304       0.1114  0.005657142857142857   28.61
+ 305       0.0595  0.005642857142857143   28.71
+ 306       0.086   0.005628571428571428   28.8
+ 307       0.0877  0.005614285714285714   28.89
+ 308       0.0582  0.005600000000000001   28.99
+ 309       0.0645  0.005585714285714286   29.08
+ 310       0.1025  0.005571428571428572   29.18
+ 311       0.0612  0.005557142857142857   29.27
+ 312       0.0706  0.005542857142857143   29.36
+ 313       0.0636  0.005528571428571429   29.46
+ 314       0.0721  0.005514285714285714   29.55
+ 315       0.1062  0.0055000000000000005  29.65
+ 316       0.0739  0.0054857142857142865  29.74
+ 317       0.0688  0.0054714285714285715  29.84
+ 318       0.0715  0.0054571428571428575  29.93
+ 319       0.0628  0.0054428571428571425  30.02
+ 320       0.0831  0.0054285714285714284  30.12
+ 321       0.0833  0.005414285714285715   30.21
+ 322       0.09    0.0054                 30.31
+ 323       0.0469  0.005385714285714286   30.4
+ 324       0.0631  0.005371428571428571   30.49
+ 325       0.0685  0.005357142857142857   30.59
+ 326       0.0798  0.005342857142857142   30.68
+ 327       0.0653  0.005328571428571428   30.78
+ 328       0.0615  0.005314285714285715   30.87
+ 329       0.0548  0.0053                 30.96
+ 330       0.0592  0.005285714285714286   31.06
+ 331       0.0628  0.005271428571428572   31.15
+ 332       0.0604  0.005257142857142857   31.25
+ 333       0.0833  0.005242857142857143   31.34
+ 334       0.0748  0.005228571428571429   31.44
+ 335       0.0495  0.005214285714285715   31.53
+ 336       0.0589  0.005200000000000001   31.62
+ 337       0.0655  0.005185714285714286   31.72
+ 338       0.0695  0.005171428571428572   31.81
+ 339       0.0609  0.005157142857142857   31.91
+ 340       0.0636  0.005142857142857143   32.0
+ 341       0.0606  0.005128571428571429   32.09
+ 342       0.0739  0.0051142857142857144  32.19
+ 343       0.0535  0.0051                 32.28
+ 344       0.0598  0.005085714285714285   32.38
+ 345       0.06    0.005071428571428571   32.47
+ 346       0.0734  0.005057142857142856   32.56
+ 347       0.078   0.005042857142857143   32.66
+ 348       0.0618  0.005028571428571429   32.75
+ 349       0.0655  0.005014285714285714   32.85
+ 350       0.0615  0.005                  32.94
+ 351       0.0556  0.004985714285714286   33.04
+ 352       0.0637  0.004971428571428572   33.13
+ 353       0.0518  0.004957142857142857   33.22
+ 354       0.0466  0.004942857142857143   33.32
+ 355       0.0732  0.004928571428571429   33.41
+ 356       0.0584  0.004914285714285715   33.51
+ 357       0.0586  0.0049                 33.6
+ 358       0.0481  0.004885714285714286   33.69
+ 359       0.0552  0.004871428571428572   33.79
+ 360       0.0567  0.004857142857142858   33.88
+ 361       0.0664  0.004842857142857143   33.98
+ 362       0.0701  0.004828571428571429   34.07
+ 363       0.069   0.0048142857142857145  34.16
+ 364       0.066   0.0048                 34.26
+ 365       0.0546  0.004785714285714286   34.35
+ 366       0.0616  0.004771428571428571   34.45
+ 367       0.0374  0.004757142857142857   34.54
+ 368       0.046   0.004742857142857143   34.64
+ 369       0.0459  0.004728571428571428   34.73
+ 370       0.0648  0.004714285714285714   34.82
+ 371       0.0699  0.0047                 34.92
+ 372       0.0605  0.004685714285714286   35.01
+ 373       0.0704  0.004671428571428571   35.11
+ 374       0.0444  0.004657142857142857   35.2
+ 375       0.062   0.004642857142857143   35.29
+ 376       0.0464  0.004628571428571429   35.39
+ 377       0.0548  0.004614285714285714   35.48
+ 378       0.0555  0.0046                 35.58
+ 379       0.0654  0.004585714285714286   35.67
+ 380       0.0592  0.004571428571428572   35.76
+ 381       0.0521  0.004557142857142858   35.86
+ 382       0.0633  0.004542857142857143   35.95
+ 383       0.047   0.004528571428571429   36.05
+ 384       0.0476  0.004514285714285714   36.14
+ 385       0.051   0.0045000000000000005  36.24
+ 386       0.064   0.004485714285714286   36.33
+ 387       0.0309  0.0044714285714285715  36.42
+ 388       0.0632  0.0044571428571428574  36.52
+ 389       0.0583  0.004442857142857143   36.61
+ 390       0.0524  0.004428571428571428   36.71
+ 391       0.0574  0.004414285714285714   36.8
+ 392       0.043   0.0044                 36.89
+ 393       0.0482  0.004385714285714285   36.99
+ 394       0.0585  0.004371428571428572   37.08
+ 395       0.0467  0.004357142857142857   37.18
+ 396       0.0498  0.004342857142857143   37.27
+ 397       0.0578  0.004328571428571429   37.36
+ 398       0.0469  0.004314285714285714   37.46
+ 399       0.0447  0.0043                 37.55
+ 400       0.0669  0.004285714285714286   37.65
+Saving PrefixEncoder
+[INFO|configuration_utils.py:460] 2023-12-02 17:15:02,214 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/config.json
+[INFO|configuration_utils.py:544] 2023-12-02 17:15:02,215 >> Configuration saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/generation_config.json
+[INFO|modeling_utils.py:2118] 2023-12-02 17:15:02,222 >> Model weights saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/pytorch_model.bin
+[INFO|tokenization_utils_base.py:2437] 2023-12-02 17:15:02,223 >> tokenizer config file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2446] 2023-12-02 17:15:02,223 >> Special tokens file saved in output/linghua_pt-20231202-155337-128-1e-2/checkpoint-400/special_tokens_map.json
+/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
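The learning_rate column is exactly reproducible: it drops by 1/70000 per step, i.e. a linear decay from the run's base rate of 1e-2 (the "1e-2" suffix in the output directory name) to zero over the 700 planned steps, consistent with the Hugging Face Trainer's default linear schedule with no warmup. A quick check against values logged above:

```python
# Linear decay with no warmup: lr(step) = base_lr * (1 - step / max_steps).
base_lr, max_steps = 1e-2, 700

def lr(step: int) -> float:
    return base_lr * (1 - step / max_steps)

# Spot-check against rows in the log (float noise stays far below 1e-12).
assert abs(lr(200) - 0.0071428571428571435) < 1e-12
assert abs(lr(350) - 0.005) < 1e-12
assert abs(lr(400) - 0.004285714285714286) < 1e-12
```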
+ warnings.warn( + 57%|█████████████████████████████████████████████████████▊ | 401/700 [1:21:20<1:00:41, 12.18s/it] {'loss': 0.0556, 'learning_rate': 0.004271428571428572, 'epoch': 37.74} + 57%|█████████████████████████████████████████████████████▊ | 401/700 [1:21:20<1:00:41, 12.18s/it] 57%|█████████████████████████████████████████████████████▉ | 402/700 [1:21:32<1:00:27, 12.17s/it] {'loss': 0.0408, 'learning_rate': 0.004257142857142857, 'epoch': 37.84} + 57%|█████████████████████████████████████████████████████▉ | 402/700 [1:21:32<1:00:27, 12.17s/it] 58%|██████████████████████████████████████████████████████ | 403/700 [1:21:45<1:00:14, 12.17s/it] {'loss': 0.043, 'learning_rate': 0.004242857142857143, 'epoch': 37.93} + 58%|██████████████████████████████████████████████████████ | 403/700 [1:21:45<1:00:14, 12.17s/it] 58%|██████████████████████████████████████████████████████▎ | 404/700 [1:21:57<1:00:02, 12.17s/it] {'loss': 0.058, 'learning_rate': 0.004228571428571429, 'epoch': 38.02} + 58%|██████████████████████████████████████████████████████▎ | 404/700 [1:21:57<1:00:02, 12.17s/it] 58%|███████████████████████████████████████████████████████▌ | 405/700 [1:22:09<59:49, 12.17s/it] {'loss': 0.0605, 'learning_rate': 0.004214285714285715, 'epoch': 38.12} + 58%|███████████████████████████████████████████████████████▌ | 405/700 [1:22:09<59:49, 12.17s/it] 58%|███████████████████████████████████████████████████████▋ | 406/700 [1:22:21<59:37, 12.17s/it] {'loss': 0.0502, 'learning_rate': 0.0042, 'epoch': 38.21} + 58%|███████████████████████████████████████████████████████▋ | 406/700 [1:22:21<59:37, 12.17s/it] 58%|███████████████████████████████████████████████████████▊ | 407/700 [1:22:33<59:24, 12.17s/it] {'loss': 0.054, 'learning_rate': 0.004185714285714286, 'epoch': 38.31} + 58%|███████████████████████████████████████████████████████▊ | 407/700 [1:22:33<59:24, 12.17s/it] 58%|███████████████████████████████████████████████████████▉ | 408/700 [1:22:45<59:12, 12.17s/it] {'loss': 0.0421, 'learning_rate': 0.004171428571428572, 'epoch': 38.4} + 58%|███████████████████████████████████████████████████████▉ | 408/700 [1:22:45<59:12, 12.17s/it] 58%|████████████████████████████████████████████████████████ | 409/700 [1:22:58<59:00, 12.17s/it] {'loss': 0.0446, 'learning_rate': 0.0041571428571428575, 'epoch': 38.49} + 58%|████████████████████████████████████████████████████████ | 409/700 [1:22:58<59:00, 12.17s/it] 59%|████████████████████████████████████████████████████████▏ | 410/700 [1:23:10<58:47, 12.17s/it] {'loss': 0.0529, 'learning_rate': 0.0041428571428571434, 'epoch': 38.59} + 59%|████████████████████████████████████████████████████████▏ | 410/700 [1:23:10<58:47, 12.17s/it] 59%|████████████████████████████████████████████████████████▎ | 411/700 [1:23:22<58:36, 12.17s/it] {'loss': 0.0547, 'learning_rate': 0.0041285714285714285, 'epoch': 38.68} + 59%|████████████████████████████████████████████████████████▎ | 411/700 [1:23:22<58:36, 12.17s/it] 59%|████████████████████████████████████████████████████████▌ | 412/700 [1:23:34<58:23, 12.17s/it] {'loss': 0.0537, 'learning_rate': 0.004114285714285714, 'epoch': 38.78} + 59%|████████████████████████████████████████████████████████▌ | 412/700 [1:23:34<58:23, 12.17s/it] 59%|████████████████████████████████████████████████████████▋ | 413/700 [1:23:46<58:11, 12.17s/it] {'loss': 0.0609, 'learning_rate': 0.0040999999999999995, 'epoch': 38.87} + 59%|████████████████████████████████████████████████████████▋ | 413/700 [1:23:46<58:11, 12.17s/it] 
59%|████████████████████████████████████████████████████████▊ | 414/700 [1:23:58<57:58, 12.16s/it] {'loss': 0.0508, 'learning_rate': 0.004085714285714286, 'epoch': 38.96} + 59%|████████████████████████████████████████████████████████▊ | 414/700 [1:23:58<57:58, 12.16s/it] 59%|████████████████████████████████████████████████████████▉ | 415/700 [1:24:11<57:46, 12.16s/it] {'loss': 0.0446, 'learning_rate': 0.004071428571428571, 'epoch': 39.06} + 59%|████████████████████████████████████████████████████████▉ | 415/700 [1:24:11<57:46, 12.16s/it] 59%|█████████████████████████████████████████████████████████ | 416/700 [1:24:23<57:34, 12.16s/it] {'loss': 0.0503, 'learning_rate': 0.004057142857142857, 'epoch': 39.15} + 59%|█████████████████████████████████████████████████████████ | 416/700 [1:24:23<57:34, 12.16s/it] 60%|█████████████████████████████████████████████████████████▏ | 417/700 [1:24:35<57:22, 12.16s/it] {'loss': 0.0418, 'learning_rate': 0.004042857142857143, 'epoch': 39.25} + 60%|█████████████████████████████████████████████████████████▏ | 417/700 [1:24:35<57:22, 12.16s/it] 60%|█████████████████████████████████████████████████████████▎ | 418/700 [1:24:47<57:10, 12.16s/it] {'loss': 0.0503, 'learning_rate': 0.004028571428571428, 'epoch': 39.34} + 60%|█████████████████████████████████████████████████████████▎ | 418/700 [1:24:47<57:10, 12.16s/it] 60%|█████████████████████████████████████████████████████████▍ | 419/700 [1:24:59<56:58, 12.16s/it] {'loss': 0.0597, 'learning_rate': 0.004014285714285714, 'epoch': 39.44} + 60%|█████████████████████████████████████████████████████████▍ | 419/700 [1:24:59<56:58, 12.16s/it] 60%|█████████████████████████████████████████████████████████▌ | 420/700 [1:25:11<56:45, 12.16s/it] {'loss': 0.0443, 'learning_rate': 0.004, 'epoch': 39.53} + 60%|█████████████████████████████████████████████████████████▌ | 420/700 [1:25:11<56:45, 12.16s/it] 60%|█████████████████████████████████████████████████████████▋ | 421/700 [1:25:23<56:33, 12.16s/it] {'loss': 0.0495, 'learning_rate': 0.003985714285714286, 'epoch': 39.62} + 60%|█████████████████████████████████████████████████████████▋ | 421/700 [1:25:23<56:33, 12.16s/it] 60%|█████████████████████████████████████████████████████████▊ | 422/700 [1:25:36<56:21, 12.16s/it] {'loss': 0.0435, 'learning_rate': 0.003971428571428571, 'epoch': 39.72} + 60%|█████████████████████████████████████████████████████████▊ | 422/700 [1:25:36<56:21, 12.16s/it] 60%|██████████████████████████████████████████████████████████ | 423/700 [1:25:48<56:09, 12.16s/it] {'loss': 0.0528, 'learning_rate': 0.003957142857142858, 'epoch': 39.81} + 60%|██████████████████████████████████████████████████████████ | 423/700 [1:25:48<56:09, 12.16s/it] 61%|██████████████████████████████████████████████████████████▏ | 424/700 [1:26:00<55:57, 12.16s/it] {'loss': 0.0457, 'learning_rate': 0.003942857142857143, 'epoch': 39.91} + 61%|██████████████████████████████████████████████████████████▏ | 424/700 [1:26:00<55:57, 12.16s/it] 61%|██████████████████████████████████████████████████████████▎ | 425/700 [1:26:12<55:45, 12.16s/it] {'loss': 0.0491, 'learning_rate': 0.003928571428571429, 'epoch': 40.0} + 61%|██████████████████████████████████████████████████████████▎ | 425/700 [1:26:12<55:45, 12.16s/it] 61%|██████████████████████████████████████████████████████████▍ | 426/700 [1:26:24<55:32, 12.16s/it] {'loss': 0.0514, 'learning_rate': 0.003914285714285714, 'epoch': 40.09} + 61%|██████████████████████████████████████████████████████████▍ | 426/700 [1:26:24<55:32, 12.16s/it] 
61%|██████████████████████████████████████████████████████████▌ | 427/700 [1:26:36<55:20, 12.16s/it] {'loss': 0.0389, 'learning_rate': 0.0039000000000000003, 'epoch': 40.19} + 61%|██████████████████████████████████████████████████████████▌ | 427/700 [1:26:36<55:20, 12.16s/it] 61%|██████████████████████████████████████████████████████████▋ | 428/700 [1:26:49<55:08, 12.16s/it] {'loss': 0.0415, 'learning_rate': 0.0038857142857142857, 'epoch': 40.28} + 61%|██████████████████████████████████████████████████████████▋ | 428/700 [1:26:49<55:08, 12.16s/it] 61%|██████████████████████████████████████████████████████████▊ | 429/700 [1:27:01<54:56, 12.16s/it] {'loss': 0.0508, 'learning_rate': 0.0038714285714285712, 'epoch': 40.38} + 61%|██████████████████████████████████████████████████████████▊ | 429/700 [1:27:01<54:56, 12.16s/it] 61%|██████████████████████████████████████████████████████████▉ | 430/700 [1:27:13<54:44, 12.16s/it] {'loss': 0.0467, 'learning_rate': 0.0038571428571428576, 'epoch': 40.47} + 61%|██████████████████████████████████████████████████████████▉ | 430/700 [1:27:13<54:44, 12.16s/it] \ No newline at end of file