ptrdvn committed on
Commit
7a3ab06
·
verified ·
1 Parent(s): 3d2d08b

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: Qwen/Qwen2.5-7B-Instruct
5
+ tags:
6
+ - llama-factory
7
+ - full
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: kto_trained_1
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # kto_trained_1
18
+
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the lightblue_kto_data dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.3031
22
+ - Rewards/chosen: 1.5421
23
+ - Logps/chosen: -343.9051
24
+ - Logits/chosen: -69679219.2
25
+ - Rewards/rejected: -7.3046
26
+ - Logps/rejected: -233.7684
27
+ - Logits/rejected: -34451756.1379
28
+ - Rewards/margins: 8.8467
29
+ - Kl: 1080.3173
30
+
31
+ ## Model description
32
+
33
+ More information needed
34
+
35
+ ## Intended uses & limitations
36
+
37
+ More information needed
38
+
39
+ ## Training and evaluation data
40
+
41
+ More information needed
42
+
43
+ ## Training procedure
44
+
45
+ ### Training hyperparameters
46
+
47
+ The following hyperparameters were used during training:
48
+ - learning_rate: 5e-06
49
+ - train_batch_size: 1
50
+ - eval_batch_size: 1
51
+ - seed: 42
52
+ - distributed_type: multi-GPU
53
+ - num_devices: 8
54
+ - gradient_accumulation_steps: 16
55
+ - total_train_batch_size: 128
56
+ - total_eval_batch_size: 8
57
+ - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
58
+ - lr_scheduler_type: cosine
59
+ - lr_scheduler_warmup_ratio: 0.01
60
+ - num_epochs: 1.0
61
+
62
+ ### Training results
63
+
64
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Logps/chosen | Logits/chosen | Rewards/rejected | Logps/rejected | Logits/rejected | Rewards/margins | Kl |
65
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:------------:|:--------------:|:----------------:|:--------------:|:---------------:|:---------------:|:---------:|
66
+ | 0.2623 | 0.0997 | 36 | 0.3340 | 1.3847 | -345.4796 | -55713169.0667 | -3.6384 | -197.1070 | -40055004.6897 | 5.0231 | 890.2159 |
67
+ | 0.3222 | 0.1995 | 72 | 0.3273 | 1.5219 | -344.1068 | -61469499.7333 | -4.9277 | -209.9999 | -32503238.6207 | 6.4496 | 1189.5447 |
68
+ | 0.3798 | 0.2992 | 108 | 0.3185 | 1.5573 | -343.7531 | -63003302.4 | -5.7081 | -217.8038 | -31597484.1379 | 7.2654 | 955.4995 |
69
+ | 0.3755 | 0.3990 | 144 | 0.3016 | 0.8908 | -350.4181 | -63924428.8 | -6.8986 | -229.7092 | -27711788.1379 | 7.7895 | 705.8951 |
70
+ | 0.3454 | 0.4987 | 180 | 0.3053 | 1.4481 | -344.8449 | -67193476.2667 | -6.5311 | -226.0336 | -37107747.3103 | 7.9792 | 836.6326 |
71
+ | 0.2633 | 0.5984 | 216 | 0.3085 | 1.5864 | -343.4627 | -68801646.9333 | -6.4654 | -225.3766 | -37986458.4828 | 8.0517 | 974.3778 |
72
+ | 0.2519 | 0.6982 | 252 | 0.3109 | 1.5635 | -343.6908 | -69407142.4 | -6.4303 | -225.0262 | -34758311.7241 | 7.9939 | 1106.7635 |
73
+ | 0.2959 | 0.7979 | 288 | 0.3033 | 1.6631 | -342.6956 | -69444923.7333 | -7.0061 | -230.7837 | -36029797.5172 | 8.6691 | 1082.5067 |
74
+ | 0.2921 | 0.8977 | 324 | 0.3022 | 1.4322 | -345.0042 | -69711099.7333 | -7.5841 | -236.5635 | -35742644.9655 | 9.0163 | 1047.6223 |
75
+ | 0.3122 | 0.9974 | 360 | 0.3031 | 1.5421 | -343.9051 | -69679219.2 | -7.3046 | -233.7684 | -34451756.1379 | 8.8467 | 1080.3173 |
76
+
77
+
78
+ ### Framework versions
79
+
80
+ - Transformers 4.46.1
81
+ - Pytorch 2.4.0+cu121
82
+ - Datasets 3.1.0
83
+ - Tokenizers 0.20.3
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
all_results.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9974025974025974,
3
+ "eval_logits/chosen": -69679219.2,
4
+ "eval_logits/rejected": -34451756.137931034,
5
+ "eval_logps/chosen": -343.90514322916664,
6
+ "eval_logps/rejected": -233.7683694773707,
7
+ "eval_loss": 0.30314239859580994,
8
+ "eval_rewards/chosen": 1.5421129862467449,
9
+ "eval_rewards/margins": 8.846664735640603,
10
+ "eval_rewards/rejected": -7.304551749393858,
11
+ "eval_runtime": 374.42,
12
+ "eval_samples_per_second": 1.247,
13
+ "eval_steps_per_second": 0.158,
14
+ "kl": 1080.3172607421875,
15
+ "total_flos": 8.196772297546138e+16,
16
+ "train_loss": 0.31702631492581634,
17
+ "train_runtime": 54644.2785,
18
+ "train_samples_per_second": 0.845,
19
+ "train_steps_per_second": 0.007
20
+ }
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 3584,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 18944,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 28,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 28,
17
+ "num_hidden_layers": 28,
18
+ "num_key_value_heads": 4,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.46.1",
26
+ "use_cache": false,
27
+ "use_sliding_window": false,
28
+ "vocab_size": 152064
29
+ }
eval_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9974025974025974,
3
+ "eval_logits/chosen": -69679219.2,
4
+ "eval_logits/rejected": -34451756.137931034,
5
+ "eval_logps/chosen": -343.90514322916664,
6
+ "eval_logps/rejected": -233.7683694773707,
7
+ "eval_loss": 0.30314239859580994,
8
+ "eval_rewards/chosen": 1.5421129862467449,
9
+ "eval_rewards/margins": 8.846664735640603,
10
+ "eval_rewards/rejected": -7.304551749393858,
11
+ "eval_runtime": 374.42,
12
+ "eval_samples_per_second": 1.247,
13
+ "eval_steps_per_second": 0.158,
14
+ "kl": 1080.3172607421875
15
+ }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.46.1"
14
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c360266f480e6ade2b3a9e42afac95c936857d6e0b633ca86b55a6a8ae087e2
3
+ size 4877660776
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:109422a6f220b777ad52b5d448a8a2dae65a88dcd662a53311b347e7d5d4a58a
3
+ size 4932751008
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8915660397663343d81c1be84fa311e4fcab4115b01a62368c74b3f0551c596d
3
+ size 4330865200
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e56ac190db46ee14c1fb027b671537b43c5d1aab8a6660164e36132d8ab4a66a
3
+ size 1089994880
model.safetensors.index.json ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 15231233024
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
296
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
298
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
299
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
300
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
301
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
303
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
304
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
306
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
308
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
309
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
315
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
320
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
327
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
332
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
344
+ "model.norm.weight": "model-00003-of-00004.safetensors"
345
+ }
346
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "right",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9974025974025974,
3
+ "total_flos": 8.196772297546138e+16,
4
+ "train_loss": 0.31702631492581634,
5
+ "train_runtime": 54644.2785,
6
+ "train_samples_per_second": 0.845,
7
+ "train_steps_per_second": 0.007
8
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 1, "total_steps": 360, "loss": 0.5, "lr": 1.25e-06, "epoch": 0.0027705627705627706, "percentage": 0.28, "elapsed_time": "0:02:22", "remaining_time": "14:10:54"}
2
+ {"current_steps": 2, "total_steps": 360, "loss": 0.5, "lr": 2.5e-06, "epoch": 0.005541125541125541, "percentage": 0.56, "elapsed_time": "0:04:41", "remaining_time": "14:00:18"}
3
+ {"current_steps": 3, "total_steps": 360, "loss": 0.4986, "lr": 3.7500000000000005e-06, "epoch": 0.008311688311688312, "percentage": 0.83, "elapsed_time": "0:06:59", "remaining_time": "13:52:50"}
4
+ {"current_steps": 4, "total_steps": 360, "loss": 0.4789, "lr": 5e-06, "epoch": 0.011082251082251082, "percentage": 1.11, "elapsed_time": "0:09:19", "remaining_time": "13:49:29"}
5
+ {"current_steps": 5, "total_steps": 360, "loss": 0.4732, "lr": 4.999902656502973e-06, "epoch": 0.013852813852813853, "percentage": 1.39, "elapsed_time": "0:11:38", "remaining_time": "13:47:02"}
6
+ {"current_steps": 6, "total_steps": 360, "loss": 0.3975, "lr": 4.9996106335924965e-06, "epoch": 0.016623376623376623, "percentage": 1.67, "elapsed_time": "0:14:01", "remaining_time": "13:47:01"}
7
+ {"current_steps": 7, "total_steps": 360, "loss": 0.352, "lr": 4.999123954009797e-06, "epoch": 0.019393939393939394, "percentage": 1.94, "elapsed_time": "0:16:17", "remaining_time": "13:41:23"}
8
+ {"current_steps": 8, "total_steps": 360, "loss": 0.3637, "lr": 4.998442655654946e-06, "epoch": 0.022164502164502164, "percentage": 2.22, "elapsed_time": "0:18:38", "remaining_time": "13:40:17"}
9
+ {"current_steps": 9, "total_steps": 360, "loss": 0.4793, "lr": 4.997566791583916e-06, "epoch": 0.024935064935064935, "percentage": 2.5, "elapsed_time": "0:20:56", "remaining_time": "13:36:53"}
10
+ {"current_steps": 10, "total_steps": 360, "loss": 0.3755, "lr": 4.996496430004446e-06, "epoch": 0.027705627705627706, "percentage": 2.78, "elapsed_time": "0:23:15", "remaining_time": "13:33:45"}
11
+ {"current_steps": 11, "total_steps": 360, "loss": 0.4853, "lr": 4.995231654270726e-06, "epoch": 0.030476190476190476, "percentage": 3.06, "elapsed_time": "0:25:33", "remaining_time": "13:30:49"}
12
+ {"current_steps": 12, "total_steps": 360, "loss": 0.3923, "lr": 4.993772562876909e-06, "epoch": 0.03324675324675325, "percentage": 3.33, "elapsed_time": "0:27:55", "remaining_time": "13:29:54"}
13
+ {"current_steps": 13, "total_steps": 360, "loss": 0.3514, "lr": 4.992119269449445e-06, "epoch": 0.03601731601731602, "percentage": 3.61, "elapsed_time": "0:30:18", "remaining_time": "13:29:04"}
14
+ {"current_steps": 14, "total_steps": 360, "loss": 0.4055, "lr": 4.990271902738223e-06, "epoch": 0.03878787878787879, "percentage": 3.89, "elapsed_time": "0:32:41", "remaining_time": "13:27:50"}
15
+ {"current_steps": 15, "total_steps": 360, "loss": 0.3175, "lr": 4.988230606606552e-06, "epoch": 0.04155844155844156, "percentage": 4.17, "elapsed_time": "0:35:05", "remaining_time": "13:27:02"}
16
+ {"current_steps": 16, "total_steps": 360, "loss": 0.3861, "lr": 4.985995540019956e-06, "epoch": 0.04432900432900433, "percentage": 4.44, "elapsed_time": "0:37:33", "remaining_time": "13:27:36"}
17
+ {"current_steps": 17, "total_steps": 360, "loss": 0.3605, "lr": 4.983566877033791e-06, "epoch": 0.0470995670995671, "percentage": 4.72, "elapsed_time": "0:39:56", "remaining_time": "13:26:01"}
18
+ {"current_steps": 18, "total_steps": 360, "loss": 0.388, "lr": 4.980944806779698e-06, "epoch": 0.04987012987012987, "percentage": 5.0, "elapsed_time": "0:42:18", "remaining_time": "13:23:52"}
19
+ {"current_steps": 19, "total_steps": 360, "loss": 0.3068, "lr": 4.9781295334508664e-06, "epoch": 0.05264069264069264, "percentage": 5.28, "elapsed_time": "0:44:40", "remaining_time": "13:21:56"}
20
+ {"current_steps": 20, "total_steps": 360, "loss": 0.3188, "lr": 4.975121276286136e-06, "epoch": 0.05541125541125541, "percentage": 5.56, "elapsed_time": "0:46:57", "remaining_time": "13:18:10"}
21
+ {"current_steps": 21, "total_steps": 360, "loss": 0.3428, "lr": 4.9719202695529265e-06, "epoch": 0.05818181818181818, "percentage": 5.83, "elapsed_time": "0:49:16", "remaining_time": "13:15:31"}
22
+ {"current_steps": 22, "total_steps": 360, "loss": 0.3241, "lr": 4.968526762528988e-06, "epoch": 0.06095238095238095, "percentage": 6.11, "elapsed_time": "0:51:38", "remaining_time": "13:13:24"}
23
+ {"current_steps": 23, "total_steps": 360, "loss": 0.3545, "lr": 4.964941019482995e-06, "epoch": 0.06372294372294372, "percentage": 6.39, "elapsed_time": "0:53:59", "remaining_time": "13:11:02"}
24
+ {"current_steps": 24, "total_steps": 360, "loss": 0.3224, "lr": 4.961163319653959e-06, "epoch": 0.0664935064935065, "percentage": 6.67, "elapsed_time": "0:56:22", "remaining_time": "13:09:10"}
25
+ {"current_steps": 25, "total_steps": 360, "loss": 0.318, "lr": 4.9571939572294914e-06, "epoch": 0.06926406926406926, "percentage": 6.94, "elapsed_time": "0:58:43", "remaining_time": "13:07:00"}
26
+ {"current_steps": 26, "total_steps": 360, "loss": 0.3116, "lr": 4.953033241322887e-06, "epoch": 0.07203463203463203, "percentage": 7.22, "elapsed_time": "1:01:06", "remaining_time": "13:04:55"}
27
+ {"current_steps": 27, "total_steps": 360, "loss": 0.3684, "lr": 4.948681495949055e-06, "epoch": 0.0748051948051948, "percentage": 7.5, "elapsed_time": "1:03:30", "remaining_time": "13:03:14"}
28
+ {"current_steps": 28, "total_steps": 360, "loss": 0.2844, "lr": 4.944139059999286e-06, "epoch": 0.07757575757575758, "percentage": 7.78, "elapsed_time": "1:05:51", "remaining_time": "13:00:50"}
29
+ {"current_steps": 29, "total_steps": 360, "loss": 0.3235, "lr": 4.939406287214861e-06, "epoch": 0.08034632034632035, "percentage": 8.06, "elapsed_time": "1:08:11", "remaining_time": "12:58:15"}
30
+ {"current_steps": 30, "total_steps": 360, "loss": 0.3997, "lr": 4.9344835461595016e-06, "epoch": 0.08311688311688312, "percentage": 8.33, "elapsed_time": "1:10:32", "remaining_time": "12:55:59"}
31
+ {"current_steps": 31, "total_steps": 360, "loss": 0.3246, "lr": 4.929371220190671e-06, "epoch": 0.08588744588744589, "percentage": 8.61, "elapsed_time": "1:12:58", "remaining_time": "12:54:24"}
32
+ {"current_steps": 32, "total_steps": 360, "loss": 0.2904, "lr": 4.9240697074297205e-06, "epoch": 0.08865800865800866, "percentage": 8.89, "elapsed_time": "1:15:20", "remaining_time": "12:52:11"}
33
+ {"current_steps": 33, "total_steps": 360, "loss": 0.3206, "lr": 4.918579420730884e-06, "epoch": 0.09142857142857143, "percentage": 9.17, "elapsed_time": "1:17:45", "remaining_time": "12:50:30"}
34
+ {"current_steps": 34, "total_steps": 360, "loss": 0.3035, "lr": 4.912900787649124e-06, "epoch": 0.0941991341991342, "percentage": 9.44, "elapsed_time": "1:20:10", "remaining_time": "12:48:43"}
35
+ {"current_steps": 35, "total_steps": 360, "loss": 0.3779, "lr": 4.907034250406846e-06, "epoch": 0.09696969696969697, "percentage": 9.72, "elapsed_time": "1:22:25", "remaining_time": "12:45:23"}
36
+ {"current_steps": 36, "total_steps": 360, "loss": 0.2623, "lr": 4.900980265859449e-06, "epoch": 0.09974025974025974, "percentage": 10.0, "elapsed_time": "1:24:42", "remaining_time": "12:42:20"}
37
+ {"current_steps": 36, "total_steps": 360, "eval_loss": 0.33400189876556396, "epoch": 0.09974025974025974, "percentage": 10.0, "elapsed_time": "1:30:58", "remaining_time": "13:38:46"}
38
+ {"current_steps": 37, "total_steps": 360, "loss": 0.2681, "lr": 4.894739305459754e-06, "epoch": 0.10251082251082251, "percentage": 10.28, "elapsed_time": "1:33:17", "remaining_time": "13:34:20"}
39
+ {"current_steps": 38, "total_steps": 360, "loss": 0.3007, "lr": 4.88831185522129e-06, "epoch": 0.10528138528138528, "percentage": 10.56, "elapsed_time": "1:35:44", "remaining_time": "13:31:13"}
40
+ {"current_steps": 39, "total_steps": 360, "loss": 0.3303, "lr": 4.881698415680442e-06, "epoch": 0.10805194805194805, "percentage": 10.83, "elapsed_time": "1:38:02", "remaining_time": "13:26:54"}
41
+ {"current_steps": 40, "total_steps": 360, "loss": 0.2789, "lr": 4.874899501857477e-06, "epoch": 0.11082251082251082, "percentage": 11.11, "elapsed_time": "1:40:29", "remaining_time": "13:23:56"}
42
+ {"current_steps": 41, "total_steps": 360, "loss": 0.3505, "lr": 4.867915643216434e-06, "epoch": 0.11359307359307359, "percentage": 11.39, "elapsed_time": "1:42:51", "remaining_time": "13:20:17"}
43
+ {"current_steps": 42, "total_steps": 360, "loss": 0.3736, "lr": 4.860747383623889e-06, "epoch": 0.11636363636363636, "percentage": 11.67, "elapsed_time": "1:45:17", "remaining_time": "13:17:09"}
44
+ {"current_steps": 43, "total_steps": 360, "loss": 0.3449, "lr": 4.85339528130661e-06, "epoch": 0.11913419913419913, "percentage": 11.94, "elapsed_time": "1:47:45", "remaining_time": "13:14:27"}
45
+ {"current_steps": 44, "total_steps": 360, "loss": 0.3315, "lr": 4.845859908808074e-06, "epoch": 0.1219047619047619, "percentage": 12.22, "elapsed_time": "1:50:07", "remaining_time": "13:10:52"}
46
+ {"current_steps": 45, "total_steps": 360, "loss": 0.2698, "lr": 4.838141852943891e-06, "epoch": 0.12467532467532468, "percentage": 12.5, "elapsed_time": "1:52:32", "remaining_time": "13:07:50"}
47
+ {"current_steps": 46, "total_steps": 360, "loss": 0.3081, "lr": 4.830241714756099e-06, "epoch": 0.12744588744588745, "percentage": 12.78, "elapsed_time": "1:54:50", "remaining_time": "13:03:53"}
48
+ {"current_steps": 47, "total_steps": 360, "loss": 0.2922, "lr": 4.822160109466361e-06, "epoch": 0.13021645021645023, "percentage": 13.06, "elapsed_time": "1:57:09", "remaining_time": "13:00:14"}
49
+ {"current_steps": 48, "total_steps": 360, "loss": 0.3107, "lr": 4.813897666428054e-06, "epoch": 0.132987012987013, "percentage": 13.33, "elapsed_time": "1:59:29", "remaining_time": "12:56:44"}
50
+ {"current_steps": 49, "total_steps": 360, "loss": 0.3119, "lr": 4.805455029077255e-06, "epoch": 0.13575757575757577, "percentage": 13.61, "elapsed_time": "2:01:52", "remaining_time": "12:53:30"}
51
+ {"current_steps": 50, "total_steps": 360, "loss": 0.3508, "lr": 4.79683285488264e-06, "epoch": 0.13852813852813853, "percentage": 13.89, "elapsed_time": "2:04:10", "remaining_time": "12:49:53"}
52
+ {"current_steps": 51, "total_steps": 360, "loss": 0.3853, "lr": 4.788031815294282e-06, "epoch": 0.1412987012987013, "percentage": 14.17, "elapsed_time": "2:06:27", "remaining_time": "12:46:14"}
53
+ {"current_steps": 52, "total_steps": 360, "loss": 0.3074, "lr": 4.779052595691355e-06, "epoch": 0.14406926406926407, "percentage": 14.44, "elapsed_time": "2:08:49", "remaining_time": "12:43:03"}
54
+ {"current_steps": 53, "total_steps": 360, "loss": 0.3462, "lr": 4.76989589532877e-06, "epoch": 0.14683982683982685, "percentage": 14.72, "elapsed_time": "2:11:06", "remaining_time": "12:39:27"}
55
+ {"current_steps": 54, "total_steps": 360, "loss": 0.3016, "lr": 4.7605624272827125e-06, "epoch": 0.1496103896103896, "percentage": 15.0, "elapsed_time": "2:13:38", "remaining_time": "12:37:18"}
56
+ {"current_steps": 55, "total_steps": 360, "loss": 0.335, "lr": 4.75105291839512e-06, "epoch": 0.1523809523809524, "percentage": 15.28, "elapsed_time": "2:16:01", "remaining_time": "12:34:17"}
57
+ {"current_steps": 56, "total_steps": 360, "loss": 0.2906, "lr": 4.741368109217072e-06, "epoch": 0.15515151515151515, "percentage": 15.56, "elapsed_time": "2:18:27", "remaining_time": "12:31:36"}
58
+ {"current_steps": 57, "total_steps": 360, "loss": 0.3628, "lr": 4.7315087539511225e-06, "epoch": 0.15792207792207794, "percentage": 15.83, "elapsed_time": "2:20:42", "remaining_time": "12:28:00"}
59
+ {"current_steps": 58, "total_steps": 360, "loss": 0.2694, "lr": 4.721475620392567e-06, "epoch": 0.1606926406926407, "percentage": 16.11, "elapsed_time": "2:23:02", "remaining_time": "12:24:46"}
60
+ {"current_steps": 59, "total_steps": 360, "loss": 0.3414, "lr": 4.711269489869654e-06, "epoch": 0.16346320346320348, "percentage": 16.39, "elapsed_time": "2:25:27", "remaining_time": "12:22:03"}
61
+ {"current_steps": 60, "total_steps": 360, "loss": 0.4238, "lr": 4.700891157182729e-06, "epoch": 0.16623376623376623, "percentage": 16.67, "elapsed_time": "2:27:53", "remaining_time": "12:19:29"}
62
+ {"current_steps": 61, "total_steps": 360, "loss": 0.3324, "lr": 4.690341430542351e-06, "epoch": 0.16900432900432902, "percentage": 16.94, "elapsed_time": "2:30:12", "remaining_time": "12:16:17"}
63
+ {"current_steps": 62, "total_steps": 360, "loss": 0.3589, "lr": 4.679621131506347e-06, "epoch": 0.17177489177489177, "percentage": 17.22, "elapsed_time": "2:32:31", "remaining_time": "12:13:05"}
64
+ {"current_steps": 63, "total_steps": 360, "loss": 0.3341, "lr": 4.668731094915835e-06, "epoch": 0.17454545454545456, "percentage": 17.5, "elapsed_time": "2:34:49", "remaining_time": "12:09:52"}
65
+ {"current_steps": 64, "total_steps": 360, "loss": 0.3758, "lr": 4.657672168830211e-06, "epoch": 0.17731601731601732, "percentage": 17.78, "elapsed_time": "2:37:12", "remaining_time": "12:07:05"}
66
+ {"current_steps": 65, "total_steps": 360, "loss": 0.2864, "lr": 4.646445214461105e-06, "epoch": 0.1800865800865801, "percentage": 18.06, "elapsed_time": "2:39:35", "remaining_time": "12:04:16"}
67
+ {"current_steps": 66, "total_steps": 360, "loss": 0.3533, "lr": 4.635051106105316e-06, "epoch": 0.18285714285714286, "percentage": 18.33, "elapsed_time": "2:41:54", "remaining_time": "12:01:13"}
68
+ {"current_steps": 67, "total_steps": 360, "loss": 0.2895, "lr": 4.623490731076728e-06, "epoch": 0.18562770562770564, "percentage": 18.61, "elapsed_time": "2:44:19", "remaining_time": "11:58:35"}
69
+ {"current_steps": 68, "total_steps": 360, "loss": 0.3098, "lr": 4.6117649896372055e-06, "epoch": 0.1883982683982684, "percentage": 18.89, "elapsed_time": "2:46:40", "remaining_time": "11:55:45"}
70
+ {"current_steps": 69, "total_steps": 360, "loss": 0.3605, "lr": 4.59987479492649e-06, "epoch": 0.19116883116883118, "percentage": 19.17, "elapsed_time": "2:48:59", "remaining_time": "11:52:44"}
71
+ {"current_steps": 70, "total_steps": 360, "loss": 0.2793, "lr": 4.587821072891089e-06, "epoch": 0.19393939393939394, "percentage": 19.44, "elapsed_time": "2:51:20", "remaining_time": "11:49:50"}
72
+ {"current_steps": 71, "total_steps": 360, "loss": 0.3164, "lr": 4.5756047622121665e-06, "epoch": 0.19670995670995672, "percentage": 19.72, "elapsed_time": "2:53:37", "remaining_time": "11:46:44"}
73
+ {"current_steps": 72, "total_steps": 360, "loss": 0.3222, "lr": 4.563226814232444e-06, "epoch": 0.19948051948051948, "percentage": 20.0, "elapsed_time": "2:55:56", "remaining_time": "11:43:46"}
74
+ {"current_steps": 72, "total_steps": 360, "eval_loss": 0.32733702659606934, "epoch": 0.19948051948051948, "percentage": 20.0, "elapsed_time": "3:02:12", "remaining_time": "12:08:51"}
75
+ {"current_steps": 73, "total_steps": 360, "loss": 0.308, "lr": 4.550688192882115e-06, "epoch": 0.20225108225108226, "percentage": 20.28, "elapsed_time": "3:04:34", "remaining_time": "12:05:40"}
76
+ {"current_steps": 74, "total_steps": 360, "loss": 0.3314, "lr": 4.53798987460378e-06, "epoch": 0.20502164502164502, "percentage": 20.56, "elapsed_time": "3:06:58", "remaining_time": "12:02:36"}
77
+ {"current_steps": 75, "total_steps": 360, "loss": 0.2617, "lr": 4.525132848276405e-06, "epoch": 0.2077922077922078, "percentage": 20.83, "elapsed_time": "3:09:17", "remaining_time": "11:59:18"}
78
+ {"current_steps": 76, "total_steps": 360, "loss": 0.3111, "lr": 4.512118115138315e-06, "epoch": 0.21056277056277056, "percentage": 21.11, "elapsed_time": "3:11:38", "remaining_time": "11:56:09"}
79
+ {"current_steps": 77, "total_steps": 360, "loss": 0.312, "lr": 4.498946688709216e-06, "epoch": 0.21333333333333335, "percentage": 21.39, "elapsed_time": "3:13:59", "remaining_time": "11:52:59"}
80
+ {"current_steps": 78, "total_steps": 360, "loss": 0.3199, "lr": 4.485619594711278e-06, "epoch": 0.2161038961038961, "percentage": 21.67, "elapsed_time": "3:16:19", "remaining_time": "11:49:49"}
81
+ {"current_steps": 79, "total_steps": 360, "loss": 0.3595, "lr": 4.4721378709892475e-06, "epoch": 0.2188744588744589, "percentage": 21.94, "elapsed_time": "3:18:39", "remaining_time": "11:46:37"}
82
+ {"current_steps": 80, "total_steps": 360, "loss": 0.2647, "lr": 4.4585025674296315e-06, "epoch": 0.22164502164502164, "percentage": 22.22, "elapsed_time": "3:21:04", "remaining_time": "11:43:45"}
83
+ {"current_steps": 81, "total_steps": 360, "loss": 0.2927, "lr": 4.444714745878936e-06, "epoch": 0.22441558441558443, "percentage": 22.5, "elapsed_time": "3:23:25", "remaining_time": "11:40:41"}
84
+ {"current_steps": 82, "total_steps": 360, "loss": 0.3077, "lr": 4.430775480060973e-06, "epoch": 0.22718614718614719, "percentage": 22.78, "elapsed_time": "3:25:46", "remaining_time": "11:37:38"}
85
+ {"current_steps": 83, "total_steps": 360, "loss": 0.3408, "lr": 4.416685855493246e-06, "epoch": 0.22995670995670997, "percentage": 23.06, "elapsed_time": "3:28:05", "remaining_time": "11:34:29"}
86
+ {"current_steps": 84, "total_steps": 360, "loss": 0.2934, "lr": 4.4024469694024194e-06, "epoch": 0.23272727272727273, "percentage": 23.33, "elapsed_time": "3:30:31", "remaining_time": "11:31:41"}
87
+ {"current_steps": 85, "total_steps": 360, "loss": 0.3093, "lr": 4.388059930638865e-06, "epoch": 0.2354978354978355, "percentage": 23.61, "elapsed_time": "3:32:53", "remaining_time": "11:28:47"}
88
+ {"current_steps": 86, "total_steps": 360, "loss": 0.2263, "lr": 4.373525859590313e-06, "epoch": 0.23826839826839827, "percentage": 23.89, "elapsed_time": "3:35:11", "remaining_time": "11:25:37"}
89
+ {"current_steps": 87, "total_steps": 360, "loss": 0.2666, "lr": 4.358845888094607e-06, "epoch": 0.24103896103896105, "percentage": 24.17, "elapsed_time": "3:37:34", "remaining_time": "11:22:42"}
90
+ {"current_steps": 88, "total_steps": 360, "loss": 0.3756, "lr": 4.3440211593515556e-06, "epoch": 0.2438095238095238, "percentage": 24.44, "elapsed_time": "3:39:59", "remaining_time": "11:19:58"}
91
+ {"current_steps": 89, "total_steps": 360, "loss": 0.3375, "lr": 4.32905282783391e-06, "epoch": 0.2465800865800866, "percentage": 24.72, "elapsed_time": "3:42:23", "remaining_time": "11:17:09"}
92
+ {"current_steps": 90, "total_steps": 360, "loss": 0.3189, "lr": 4.313942059197457e-06, "epoch": 0.24935064935064935, "percentage": 25.0, "elapsed_time": "3:44:42", "remaining_time": "11:14:08"}
93
+ {"current_steps": 91, "total_steps": 360, "loss": 0.2776, "lr": 4.298690030190247e-06, "epoch": 0.25212121212121213, "percentage": 25.28, "elapsed_time": "3:47:02", "remaining_time": "11:11:08"}
94
+ {"current_steps": 92, "total_steps": 360, "loss": 0.3621, "lr": 4.283297928560951e-06, "epoch": 0.2548917748917749, "percentage": 25.56, "elapsed_time": "3:49:22", "remaining_time": "11:08:11"}
95
+ {"current_steps": 93, "total_steps": 360, "loss": 0.3372, "lr": 4.267766952966369e-06, "epoch": 0.25766233766233765, "percentage": 25.83, "elapsed_time": "3:51:47", "remaining_time": "11:05:28"}
96
+ {"current_steps": 94, "total_steps": 360, "loss": 0.3621, "lr": 4.252098312878083e-06, "epoch": 0.26043290043290046, "percentage": 26.11, "elapsed_time": "3:54:09", "remaining_time": "11:02:37"}
97
+ {"current_steps": 95, "total_steps": 360, "loss": 0.3272, "lr": 4.236293228488267e-06, "epoch": 0.2632034632034632, "percentage": 26.39, "elapsed_time": "3:56:29", "remaining_time": "10:59:41"}
98
+ {"current_steps": 96, "total_steps": 360, "loss": 0.337, "lr": 4.220352930614672e-06, "epoch": 0.265974025974026, "percentage": 26.67, "elapsed_time": "3:58:57", "remaining_time": "10:57:09"}
99
+ {"current_steps": 97, "total_steps": 360, "loss": 0.3529, "lr": 4.204278660604767e-06, "epoch": 0.26874458874458873, "percentage": 26.94, "elapsed_time": "4:01:20", "remaining_time": "10:54:21"}
100
+ {"current_steps": 98, "total_steps": 360, "loss": 0.2662, "lr": 4.1880716702390764e-06, "epoch": 0.27151515151515154, "percentage": 27.22, "elapsed_time": "4:03:43", "remaining_time": "10:51:35"}
101
+ {"current_steps": 99, "total_steps": 360, "loss": 0.3054, "lr": 4.171733221633695e-06, "epoch": 0.2742857142857143, "percentage": 27.5, "elapsed_time": "4:06:04", "remaining_time": "10:48:43"}
102
+ {"current_steps": 100, "total_steps": 360, "loss": 0.3833, "lr": 4.155264587142002e-06, "epoch": 0.27705627705627706, "percentage": 27.78, "elapsed_time": "4:08:22", "remaining_time": "10:45:46"}
103
+ {"current_steps": 101, "total_steps": 360, "loss": 0.3334, "lr": 4.138667049255574e-06, "epoch": 0.2798268398268398, "percentage": 28.06, "elapsed_time": "4:10:42", "remaining_time": "10:42:53"}
104
+ {"current_steps": 102, "total_steps": 360, "loss": 0.323, "lr": 4.121941900504316e-06, "epoch": 0.2825974025974026, "percentage": 28.33, "elapsed_time": "4:13:05", "remaining_time": "10:40:09"}
105
+ {"current_steps": 103, "total_steps": 360, "loss": 0.3506, "lr": 4.105090443355801e-06, "epoch": 0.2853679653679654, "percentage": 28.61, "elapsed_time": "4:15:20", "remaining_time": "10:37:06"}
106
+ {"current_steps": 104, "total_steps": 360, "loss": 0.3179, "lr": 4.088113990113846e-06, "epoch": 0.28813852813852814, "percentage": 28.89, "elapsed_time": "4:17:34", "remaining_time": "10:34:01"}
107
+ {"current_steps": 105, "total_steps": 360, "loss": 0.2801, "lr": 4.071013862816311e-06, "epoch": 0.2909090909090909, "percentage": 29.17, "elapsed_time": "4:19:48", "remaining_time": "10:30:58"}
108
+ {"current_steps": 106, "total_steps": 360, "loss": 0.3477, "lr": 4.0537913931321495e-06, "epoch": 0.2936796536796537, "percentage": 29.44, "elapsed_time": "4:22:03", "remaining_time": "10:27:57"}
109
+ {"current_steps": 107, "total_steps": 360, "loss": 0.291, "lr": 4.036447922257699e-06, "epoch": 0.29645021645021646, "percentage": 29.72, "elapsed_time": "4:24:33", "remaining_time": "10:25:32"}
110
+ {"current_steps": 108, "total_steps": 360, "loss": 0.3798, "lr": 4.018984800812248e-06, "epoch": 0.2992207792207792, "percentage": 30.0, "elapsed_time": "4:26:58", "remaining_time": "10:22:55"}
111
+ {"current_steps": 108, "total_steps": 360, "eval_loss": 0.3185268044471741, "epoch": 0.2992207792207792, "percentage": 30.0, "elapsed_time": "4:33:13", "remaining_time": "10:37:32"}
112
+ {"current_steps": 109, "total_steps": 360, "loss": 0.2488, "lr": 4.001403388732842e-06, "epoch": 0.301991341991342, "percentage": 30.28, "elapsed_time": "4:35:32", "remaining_time": "10:34:31"}
113
+ {"current_steps": 110, "total_steps": 360, "loss": 0.329, "lr": 3.983705055168391e-06, "epoch": 0.3047619047619048, "percentage": 30.56, "elapsed_time": "4:37:54", "remaining_time": "10:31:35"}
114
+ {"current_steps": 111, "total_steps": 360, "loss": 0.3257, "lr": 3.965891178373038e-06, "epoch": 0.30753246753246755, "percentage": 30.83, "elapsed_time": "4:40:23", "remaining_time": "10:28:58"}
115
+ {"current_steps": 112, "total_steps": 360, "loss": 0.3487, "lr": 3.947963145598833e-06, "epoch": 0.3103030303030303, "percentage": 31.11, "elapsed_time": "4:42:47", "remaining_time": "10:26:10"}
116
+ {"current_steps": 113, "total_steps": 360, "loss": 0.3574, "lr": 3.929922352987702e-06, "epoch": 0.31307359307359306, "percentage": 31.39, "elapsed_time": "4:45:07", "remaining_time": "10:23:14"}
117
+ {"current_steps": 114, "total_steps": 360, "loss": 0.3359, "lr": 3.911770205462717e-06, "epoch": 0.31584415584415587, "percentage": 31.67, "elapsed_time": "4:47:34", "remaining_time": "10:20:34"}
118
+ {"current_steps": 115, "total_steps": 360, "loss": 0.3044, "lr": 3.8935081166186935e-06, "epoch": 0.31861471861471863, "percentage": 31.94, "elapsed_time": "4:49:55", "remaining_time": "10:17:39"}
119
+ {"current_steps": 116, "total_steps": 360, "loss": 0.3275, "lr": 3.875137508612104e-06, "epoch": 0.3213852813852814, "percentage": 32.22, "elapsed_time": "4:52:14", "remaining_time": "10:14:42"}
120
+ {"current_steps": 117, "total_steps": 360, "loss": 0.3665, "lr": 3.856659812050328e-06, "epoch": 0.32415584415584414, "percentage": 32.5, "elapsed_time": "4:54:29", "remaining_time": "10:11:39"}
121
+ {"current_steps": 118, "total_steps": 360, "loss": 0.3269, "lr": 3.838076465880248e-06, "epoch": 0.32692640692640695, "percentage": 32.78, "elapsed_time": "4:56:56", "remaining_time": "10:08:58"}
122
+ {"current_steps": 119, "total_steps": 360, "loss": 0.3944, "lr": 3.819388917276186e-06, "epoch": 0.3296969696969697, "percentage": 33.06, "elapsed_time": "4:59:20", "remaining_time": "10:06:13"}
123
+ {"current_steps": 120, "total_steps": 360, "loss": 0.3068, "lr": 3.8005986215272056e-06, "epoch": 0.33246753246753247, "percentage": 33.33, "elapsed_time": "5:01:38", "remaining_time": "10:03:17"}
124
+ {"current_steps": 121, "total_steps": 360, "loss": 0.3559, "lr": 3.7817070419237866e-06, "epoch": 0.3352380952380952, "percentage": 33.61, "elapsed_time": "5:03:58", "remaining_time": "10:00:24"}
125
+ {"current_steps": 122, "total_steps": 360, "loss": 0.2996, "lr": 3.7627156496438686e-06, "epoch": 0.33800865800865804, "percentage": 33.89, "elapsed_time": "5:06:21", "remaining_time": "9:57:38"}
126
+ {"current_steps": 123, "total_steps": 360, "loss": 0.3117, "lr": 3.7436259236382797e-06, "epoch": 0.3407792207792208, "percentage": 34.17, "elapsed_time": "5:08:44", "remaining_time": "9:54:54"}
127
+ {"current_steps": 124, "total_steps": 360, "loss": 0.3261, "lr": 3.7244393505155713e-06, "epoch": 0.34354978354978355, "percentage": 34.44, "elapsed_time": "5:11:09", "remaining_time": "9:52:13"}
128
+ {"current_steps": 125, "total_steps": 360, "loss": 0.2878, "lr": 3.7051574244262412e-06, "epoch": 0.3463203463203463, "percentage": 34.72, "elapsed_time": "5:13:30", "remaining_time": "9:49:23"}
129
+ {"current_steps": 126, "total_steps": 360, "loss": 0.3024, "lr": 3.6857816469463806e-06, "epoch": 0.3490909090909091, "percentage": 35.0, "elapsed_time": "5:15:47", "remaining_time": "9:46:28"}
130
+ {"current_steps": 127, "total_steps": 360, "loss": 0.2751, "lr": 3.6663135269607413e-06, "epoch": 0.3518614718614719, "percentage": 35.28, "elapsed_time": "5:18:03", "remaining_time": "9:43:31"}
131
+ {"current_steps": 128, "total_steps": 360, "loss": 0.3473, "lr": 3.6467545805452266e-06, "epoch": 0.35463203463203463, "percentage": 35.56, "elapsed_time": "5:20:21", "remaining_time": "9:40:38"}
132
+ {"current_steps": 129, "total_steps": 360, "loss": 0.2654, "lr": 3.6271063308488298e-06, "epoch": 0.3574025974025974, "percentage": 35.83, "elapsed_time": "5:22:46", "remaining_time": "9:37:59"}
133
+ {"current_steps": 130, "total_steps": 360, "loss": 0.2825, "lr": 3.6073703079750204e-06, "epoch": 0.3601731601731602, "percentage": 36.11, "elapsed_time": "5:25:09", "remaining_time": "9:35:16"}
134
+ {"current_steps": 131, "total_steps": 360, "loss": 0.3034, "lr": 3.5875480488625847e-06, "epoch": 0.36294372294372296, "percentage": 36.39, "elapsed_time": "5:27:26", "remaining_time": "9:32:23"}
135
+ {"current_steps": 132, "total_steps": 360, "loss": 0.3241, "lr": 3.5676410971659404e-06, "epoch": 0.3657142857142857, "percentage": 36.67, "elapsed_time": "5:29:53", "remaining_time": "9:29:48"}
136
+ {"current_steps": 133, "total_steps": 360, "loss": 0.2921, "lr": 3.547651003134921e-06, "epoch": 0.36848484848484847, "percentage": 36.94, "elapsed_time": "5:32:16", "remaining_time": "9:27:06"}
137
+ {"current_steps": 134, "total_steps": 360, "loss": 0.2934, "lr": 3.527579323494055e-06, "epoch": 0.3712554112554113, "percentage": 37.22, "elapsed_time": "5:34:35", "remaining_time": "9:24:18"}
138
+ {"current_steps": 135, "total_steps": 360, "loss": 0.2774, "lr": 3.507427621321331e-06, "epoch": 0.37402597402597404, "percentage": 37.5, "elapsed_time": "5:36:53", "remaining_time": "9:21:28"}
139
+ {"current_steps": 136, "total_steps": 360, "loss": 0.3326, "lr": 3.4871974659264786e-06, "epoch": 0.3767965367965368, "percentage": 37.78, "elapsed_time": "5:39:12", "remaining_time": "9:18:42"}
140
+ {"current_steps": 137, "total_steps": 360, "loss": 0.3081, "lr": 3.466890432728754e-06, "epoch": 0.37956709956709955, "percentage": 38.06, "elapsed_time": "5:41:32", "remaining_time": "9:15:56"}
141
+ {"current_steps": 138, "total_steps": 360, "loss": 0.3302, "lr": 3.446508103134259e-06, "epoch": 0.38233766233766237, "percentage": 38.33, "elapsed_time": "5:43:51", "remaining_time": "9:13:10"}
142
+ {"current_steps": 139, "total_steps": 360, "loss": 0.349, "lr": 3.426052064412785e-06, "epoch": 0.3851082251082251, "percentage": 38.61, "elapsed_time": "5:46:15", "remaining_time": "9:10:31"}
143
+ {"current_steps": 140, "total_steps": 360, "loss": 0.2861, "lr": 3.4055239095742067e-06, "epoch": 0.3878787878787879, "percentage": 38.89, "elapsed_time": "5:48:35", "remaining_time": "9:07:47"}
144
+ {"current_steps": 141, "total_steps": 360, "loss": 0.3302, "lr": 3.3849252372444295e-06, "epoch": 0.39064935064935064, "percentage": 39.17, "elapsed_time": "5:51:00", "remaining_time": "9:05:10"}
145
+ {"current_steps": 142, "total_steps": 360, "loss": 0.3731, "lr": 3.364257651540891e-06, "epoch": 0.39341991341991345, "percentage": 39.44, "elapsed_time": "5:53:23", "remaining_time": "9:02:32"}
146
+ {"current_steps": 143, "total_steps": 360, "loss": 0.2592, "lr": 3.343522761947646e-06, "epoch": 0.3961904761904762, "percentage": 39.72, "elapsed_time": "5:55:44", "remaining_time": "8:59:50"}
147
+ {"current_steps": 144, "total_steps": 360, "loss": 0.3755, "lr": 3.322722183190025e-06, "epoch": 0.39896103896103896, "percentage": 40.0, "elapsed_time": "5:58:04", "remaining_time": "8:57:06"}
148
+ {"current_steps": 144, "total_steps": 360, "eval_loss": 0.30155444145202637, "epoch": 0.39896103896103896, "percentage": 40.0, "elapsed_time": "6:04:20", "remaining_time": "9:06:30"}
149
+ {"current_steps": 145, "total_steps": 360, "loss": 0.3813, "lr": 3.3018575351088894e-06, "epoch": 0.4017316017316017, "percentage": 40.28, "elapsed_time": "6:06:42", "remaining_time": "9:03:44"}
150
+ {"current_steps": 146, "total_steps": 360, "loss": 0.3756, "lr": 3.280930442534486e-06, "epoch": 0.40450216450216453, "percentage": 40.56, "elapsed_time": "6:09:05", "remaining_time": "9:00:59"}
151
+ {"current_steps": 147, "total_steps": 360, "loss": 0.3121, "lr": 3.2599425351599136e-06, "epoch": 0.4072727272727273, "percentage": 40.83, "elapsed_time": "6:11:27", "remaining_time": "8:58:13"}
152
+ {"current_steps": 148, "total_steps": 360, "loss": 0.3778, "lr": 3.238895447414211e-06, "epoch": 0.41004329004329004, "percentage": 41.11, "elapsed_time": "6:13:47", "remaining_time": "8:55:25"}
153
+ {"current_steps": 149, "total_steps": 360, "loss": 0.3093, "lr": 3.217790818335077e-06, "epoch": 0.4128138528138528, "percentage": 41.39, "elapsed_time": "6:16:07", "remaining_time": "8:52:38"}
154
+ {"current_steps": 150, "total_steps": 360, "loss": 0.3118, "lr": 3.196630291441231e-06, "epoch": 0.4155844155844156, "percentage": 41.67, "elapsed_time": "6:18:28", "remaining_time": "8:49:52"}
155
+ {"current_steps": 151, "total_steps": 360, "loss": 0.3376, "lr": 3.175415514604422e-06, "epoch": 0.41835497835497837, "percentage": 41.94, "elapsed_time": "6:20:51", "remaining_time": "8:47:08"}
156
+ {"current_steps": 152, "total_steps": 360, "loss": 0.259, "lr": 3.154148139921102e-06, "epoch": 0.4211255411255411, "percentage": 42.22, "elapsed_time": "6:23:13", "remaining_time": "8:44:24"}
157
+ {"current_steps": 153, "total_steps": 360, "loss": 0.3274, "lr": 3.132829823583771e-06, "epoch": 0.4238961038961039, "percentage": 42.5, "elapsed_time": "6:25:38", "remaining_time": "8:41:45"}
158
+ {"current_steps": 154, "total_steps": 360, "loss": 0.3034, "lr": 3.1114622257520004e-06, "epoch": 0.4266666666666667, "percentage": 42.78, "elapsed_time": "6:28:02", "remaining_time": "8:39:04"}
159
+ {"current_steps": 155, "total_steps": 360, "loss": 0.3404, "lr": 3.0900470104231456e-06, "epoch": 0.42943722943722945, "percentage": 43.06, "elapsed_time": "6:30:17", "remaining_time": "8:36:11"}
160
+ {"current_steps": 156, "total_steps": 360, "loss": 0.2813, "lr": 3.0685858453027668e-06, "epoch": 0.4322077922077922, "percentage": 43.33, "elapsed_time": "6:32:37", "remaining_time": "8:33:26"}
161
+ {"current_steps": 157, "total_steps": 360, "loss": 0.3498, "lr": 3.047080401674754e-06, "epoch": 0.43497835497835496, "percentage": 43.61, "elapsed_time": "6:34:58", "remaining_time": "8:30:41"}
162
+ {"current_steps": 158, "total_steps": 360, "loss": 0.2849, "lr": 3.0255323542711784e-06, "epoch": 0.4377489177489178, "percentage": 43.89, "elapsed_time": "6:37:20", "remaining_time": "8:27:59"}
163
+ {"current_steps": 159, "total_steps": 360, "loss": 0.333, "lr": 3.00394338114187e-06, "epoch": 0.44051948051948053, "percentage": 44.17, "elapsed_time": "6:39:42", "remaining_time": "8:25:16"}
164
+ {"current_steps": 160, "total_steps": 360, "loss": 0.3901, "lr": 2.9823151635237424e-06, "epoch": 0.4432900432900433, "percentage": 44.44, "elapsed_time": "6:42:02", "remaining_time": "8:22:33"}
165
+ {"current_steps": 161, "total_steps": 360, "loss": 0.3214, "lr": 2.9606493857098657e-06, "epoch": 0.44606060606060605, "percentage": 44.72, "elapsed_time": "6:44:25", "remaining_time": "8:19:53"}
166
+ {"current_steps": 162, "total_steps": 360, "loss": 0.2709, "lr": 2.938947734918302e-06, "epoch": 0.44883116883116886, "percentage": 45.0, "elapsed_time": "6:46:49", "remaining_time": "8:17:14"}
167
+ {"current_steps": 163, "total_steps": 360, "loss": 0.2987, "lr": 2.9172119011607153e-06, "epoch": 0.4516017316017316, "percentage": 45.28, "elapsed_time": "6:49:07", "remaining_time": "8:14:27"}
168
+ {"current_steps": 164, "total_steps": 360, "loss": 0.2914, "lr": 2.8954435771107604e-06, "epoch": 0.45437229437229437, "percentage": 45.56, "elapsed_time": "6:51:25", "remaining_time": "8:11:41"}
169
+ {"current_steps": 165, "total_steps": 360, "loss": 0.2741, "lr": 2.8736444579722665e-06, "epoch": 0.45714285714285713, "percentage": 45.83, "elapsed_time": "6:53:45", "remaining_time": "8:08:58"}
170
+ {"current_steps": 166, "total_steps": 360, "loss": 0.3081, "lr": 2.8518162413472266e-06, "epoch": 0.45991341991341994, "percentage": 46.11, "elapsed_time": "6:56:04", "remaining_time": "8:06:15"}
171
+ {"current_steps": 167, "total_steps": 360, "loss": 0.3063, "lr": 2.8299606271035913e-06, "epoch": 0.4626839826839827, "percentage": 46.39, "elapsed_time": "6:58:23", "remaining_time": "8:03:31"}
172
+ {"current_steps": 168, "total_steps": 360, "loss": 0.3263, "lr": 2.8080793172428965e-06, "epoch": 0.46545454545454545, "percentage": 46.67, "elapsed_time": "7:00:48", "remaining_time": "8:00:55"}
173
+ {"current_steps": 169, "total_steps": 360, "loss": 0.255, "lr": 2.786174015767721e-06, "epoch": 0.4682251082251082, "percentage": 46.94, "elapsed_time": "7:03:15", "remaining_time": "7:58:21"}
174
+ {"current_steps": 170, "total_steps": 360, "loss": 0.2828, "lr": 2.764246428548983e-06, "epoch": 0.470995670995671, "percentage": 47.22, "elapsed_time": "7:05:38", "remaining_time": "7:55:42"}
175
+ {"current_steps": 171, "total_steps": 360, "loss": 0.3186, "lr": 2.742298263193099e-06, "epoch": 0.4737662337662338, "percentage": 47.5, "elapsed_time": "7:07:57", "remaining_time": "7:53:00"}
176
+ {"current_steps": 172, "total_steps": 360, "loss": 0.3305, "lr": 2.720331228909005e-06, "epoch": 0.47653679653679654, "percentage": 47.78, "elapsed_time": "7:10:16", "remaining_time": "7:50:18"}
177
+ {"current_steps": 173, "total_steps": 360, "loss": 0.2907, "lr": 2.6983470363750497e-06, "epoch": 0.4793073593073593, "percentage": 48.06, "elapsed_time": "7:12:47", "remaining_time": "7:47:48"}
178
+ {"current_steps": 174, "total_steps": 360, "loss": 0.3302, "lr": 2.6763473976057776e-06, "epoch": 0.4820779220779221, "percentage": 48.33, "elapsed_time": "7:15:04", "remaining_time": "7:45:04"}
179
+ {"current_steps": 175, "total_steps": 360, "loss": 0.2448, "lr": 2.6543340258186063e-06, "epoch": 0.48484848484848486, "percentage": 48.61, "elapsed_time": "7:17:23", "remaining_time": "7:42:23"}
180
+ {"current_steps": 176, "total_steps": 360, "loss": 0.2549, "lr": 2.6323086353004077e-06, "epoch": 0.4876190476190476, "percentage": 48.89, "elapsed_time": "7:19:46", "remaining_time": "7:39:46"}
181
+ {"current_steps": 177, "total_steps": 360, "loss": 0.2989, "lr": 2.610272941274012e-06, "epoch": 0.4903896103896104, "percentage": 49.17, "elapsed_time": "7:22:08", "remaining_time": "7:37:07"}
182
+ {"current_steps": 178, "total_steps": 360, "loss": 0.3189, "lr": 2.588228659764632e-06, "epoch": 0.4931601731601732, "percentage": 49.44, "elapsed_time": "7:24:30", "remaining_time": "7:34:29"}
183
+ {"current_steps": 179, "total_steps": 360, "loss": 0.3417, "lr": 2.5661775074662276e-06, "epoch": 0.49593073593073594, "percentage": 49.72, "elapsed_time": "7:26:56", "remaining_time": "7:31:56"}
184
+ {"current_steps": 180, "total_steps": 360, "loss": 0.3454, "lr": 2.544121201607822e-06, "epoch": 0.4987012987012987, "percentage": 50.0, "elapsed_time": "7:29:20", "remaining_time": "7:29:20"}
185
+ {"current_steps": 180, "total_steps": 360, "eval_loss": 0.30528655648231506, "epoch": 0.4987012987012987, "percentage": 50.0, "elapsed_time": "7:35:35", "remaining_time": "7:35:35"}
186
+ {"current_steps": 181, "total_steps": 360, "loss": 0.3333, "lr": 2.5220614598197708e-06, "epoch": 0.5014718614718615, "percentage": 50.28, "elapsed_time": "7:38:05", "remaining_time": "7:33:02"}
187
+ {"current_steps": 182, "total_steps": 360, "loss": 0.3226, "lr": 2.5e-06, "epoch": 0.5042424242424243, "percentage": 50.56, "elapsed_time": "7:40:33", "remaining_time": "7:30:25"}
188
+ {"current_steps": 183, "total_steps": 360, "loss": 0.3903, "lr": 2.477938540180231e-06, "epoch": 0.507012987012987, "percentage": 50.83, "elapsed_time": "7:42:53", "remaining_time": "7:27:42"}
189
+ {"current_steps": 184, "total_steps": 360, "loss": 0.3303, "lr": 2.455878798392179e-06, "epoch": 0.5097835497835498, "percentage": 51.11, "elapsed_time": "7:45:10", "remaining_time": "7:24:57"}
190
+ {"current_steps": 185, "total_steps": 360, "loss": 0.3537, "lr": 2.433822492533774e-06, "epoch": 0.5125541125541125, "percentage": 51.39, "elapsed_time": "7:47:29", "remaining_time": "7:22:13"}
191
+ {"current_steps": 186, "total_steps": 360, "loss": 0.3312, "lr": 2.411771340235369e-06, "epoch": 0.5153246753246753, "percentage": 51.67, "elapsed_time": "7:49:51", "remaining_time": "7:19:32"}
192
+ {"current_steps": 187, "total_steps": 360, "loss": 0.342, "lr": 2.389727058725989e-06, "epoch": 0.518095238095238, "percentage": 51.94, "elapsed_time": "7:52:12", "remaining_time": "7:16:51"}
193
+ {"current_steps": 188, "total_steps": 360, "loss": 0.2445, "lr": 2.3676913646995923e-06, "epoch": 0.5208658008658009, "percentage": 52.22, "elapsed_time": "7:54:36", "remaining_time": "7:14:12"}
194
+ {"current_steps": 189, "total_steps": 360, "loss": 0.3456, "lr": 2.3456659741813945e-06, "epoch": 0.5236363636363637, "percentage": 52.5, "elapsed_time": "7:56:56", "remaining_time": "7:11:31"}
195
+ {"current_steps": 190, "total_steps": 360, "loss": 0.3614, "lr": 2.3236526023942224e-06, "epoch": 0.5264069264069264, "percentage": 52.78, "elapsed_time": "7:59:12", "remaining_time": "7:08:45"}
196
+ {"current_steps": 191, "total_steps": 360, "loss": 0.3241, "lr": 2.301652963624951e-06, "epoch": 0.5291774891774892, "percentage": 53.06, "elapsed_time": "8:01:33", "remaining_time": "7:06:05"}
197
+ {"current_steps": 192, "total_steps": 360, "loss": 0.3448, "lr": 2.2796687710909966e-06, "epoch": 0.531948051948052, "percentage": 53.33, "elapsed_time": "8:03:54", "remaining_time": "7:03:24"}
198
+ {"current_steps": 193, "total_steps": 360, "loss": 0.2616, "lr": 2.2577017368069017e-06, "epoch": 0.5347186147186147, "percentage": 53.61, "elapsed_time": "8:06:14", "remaining_time": "7:00:44"}
199
+ {"current_steps": 194, "total_steps": 360, "loss": 0.3198, "lr": 2.235753571451018e-06, "epoch": 0.5374891774891775, "percentage": 53.89, "elapsed_time": "8:08:34", "remaining_time": "6:58:03"}
200
+ {"current_steps": 195, "total_steps": 360, "loss": 0.2509, "lr": 2.2138259842322794e-06, "epoch": 0.5402597402597402, "percentage": 54.17, "elapsed_time": "8:10:57", "remaining_time": "6:55:25"}
201
+ {"current_steps": 196, "total_steps": 360, "loss": 0.3048, "lr": 2.191920682757104e-06, "epoch": 0.5430303030303031, "percentage": 54.44, "elapsed_time": "8:13:20", "remaining_time": "6:52:47"}
202
+ {"current_steps": 197, "total_steps": 360, "loss": 0.2765, "lr": 2.170039372896409e-06, "epoch": 0.5458008658008658, "percentage": 54.72, "elapsed_time": "8:15:47", "remaining_time": "6:50:13"}
203
+ {"current_steps": 198, "total_steps": 360, "loss": 0.2769, "lr": 2.148183758652774e-06, "epoch": 0.5485714285714286, "percentage": 55.0, "elapsed_time": "8:18:05", "remaining_time": "6:47:31"}
204
+ {"current_steps": 199, "total_steps": 360, "loss": 0.3478, "lr": 2.126355542027734e-06, "epoch": 0.5513419913419914, "percentage": 55.28, "elapsed_time": "8:20:28", "remaining_time": "6:44:54"}
205
+ {"current_steps": 200, "total_steps": 360, "loss": 0.2268, "lr": 2.1045564228892404e-06, "epoch": 0.5541125541125541, "percentage": 55.56, "elapsed_time": "8:22:49", "remaining_time": "6:42:15"}
206
+ {"current_steps": 201, "total_steps": 360, "loss": 0.274, "lr": 2.0827880988392856e-06, "epoch": 0.5568831168831169, "percentage": 55.83, "elapsed_time": "8:25:06", "remaining_time": "6:39:33"}
207
+ {"current_steps": 202, "total_steps": 360, "loss": 0.3558, "lr": 2.0610522650816985e-06, "epoch": 0.5596536796536796, "percentage": 56.11, "elapsed_time": "8:27:26", "remaining_time": "6:36:54"}
208
+ {"current_steps": 203, "total_steps": 360, "loss": 0.274, "lr": 2.0393506142901347e-06, "epoch": 0.5624242424242424, "percentage": 56.39, "elapsed_time": "8:29:45", "remaining_time": "6:34:14"}
209
+ {"current_steps": 204, "total_steps": 360, "loss": 0.3008, "lr": 2.017684836476258e-06, "epoch": 0.5651948051948052, "percentage": 56.67, "elapsed_time": "8:32:04", "remaining_time": "6:31:34"}
210
+ {"current_steps": 205, "total_steps": 360, "loss": 0.343, "lr": 1.9960566188581306e-06, "epoch": 0.567965367965368, "percentage": 56.94, "elapsed_time": "8:34:20", "remaining_time": "6:28:53"}
211
+ {"current_steps": 206, "total_steps": 360, "loss": 0.3601, "lr": 1.9744676457288225e-06, "epoch": 0.5707359307359308, "percentage": 57.22, "elapsed_time": "8:36:42", "remaining_time": "6:26:16"}
212
+ {"current_steps": 207, "total_steps": 360, "loss": 0.3738, "lr": 1.952919598325247e-06, "epoch": 0.5735064935064935, "percentage": 57.5, "elapsed_time": "8:38:59", "remaining_time": "6:23:35"}
213
+ {"current_steps": 208, "total_steps": 360, "loss": 0.2513, "lr": 1.9314141546972345e-06, "epoch": 0.5762770562770563, "percentage": 57.78, "elapsed_time": "8:41:20", "remaining_time": "6:20:59"}
214
+ {"current_steps": 209, "total_steps": 360, "loss": 0.2727, "lr": 1.9099529895768552e-06, "epoch": 0.579047619047619, "percentage": 58.06, "elapsed_time": "8:43:42", "remaining_time": "6:18:22"}
215
+ {"current_steps": 210, "total_steps": 360, "loss": 0.3118, "lr": 1.8885377742480005e-06, "epoch": 0.5818181818181818, "percentage": 58.33, "elapsed_time": "8:46:03", "remaining_time": "6:15:45"}
216
+ {"current_steps": 211, "total_steps": 360, "loss": 0.2568, "lr": 1.8671701764162287e-06, "epoch": 0.5845887445887445, "percentage": 58.61, "elapsed_time": "8:48:26", "remaining_time": "6:13:09"}
217
+ {"current_steps": 212, "total_steps": 360, "loss": 0.277, "lr": 1.8458518600788988e-06, "epoch": 0.5873593073593074, "percentage": 58.89, "elapsed_time": "8:50:49", "remaining_time": "6:10:34"}
218
+ {"current_steps": 213, "total_steps": 360, "loss": 0.2229, "lr": 1.8245844853955786e-06, "epoch": 0.5901298701298702, "percentage": 59.17, "elapsed_time": "8:53:10", "remaining_time": "6:07:58"}
219
+ {"current_steps": 214, "total_steps": 360, "loss": 0.3359, "lr": 1.8033697085587698e-06, "epoch": 0.5929004329004329, "percentage": 59.44, "elapsed_time": "8:55:31", "remaining_time": "6:05:21"}
220
+ {"current_steps": 215, "total_steps": 360, "loss": 0.2641, "lr": 1.782209181664924e-06, "epoch": 0.5956709956709957, "percentage": 59.72, "elapsed_time": "8:57:48", "remaining_time": "6:02:42"}
221
+ {"current_steps": 216, "total_steps": 360, "loss": 0.2633, "lr": 1.7611045525857902e-06, "epoch": 0.5984415584415584, "percentage": 60.0, "elapsed_time": "9:00:09", "remaining_time": "6:00:06"}
222
+ {"current_steps": 216, "total_steps": 360, "eval_loss": 0.30848661065101624, "epoch": 0.5984415584415584, "percentage": 60.0, "elapsed_time": "9:06:24", "remaining_time": "6:04:16"}
223
+ {"current_steps": 217, "total_steps": 360, "loss": 0.2693, "lr": 1.740057464840088e-06, "epoch": 0.6012121212121212, "percentage": 60.28, "elapsed_time": "9:08:47", "remaining_time": "6:01:39"}
224
+ {"current_steps": 218, "total_steps": 360, "loss": 0.3916, "lr": 1.7190695574655147e-06, "epoch": 0.603982683982684, "percentage": 60.56, "elapsed_time": "9:11:03", "remaining_time": "5:58:57"}
225
+ {"current_steps": 219, "total_steps": 360, "loss": 0.308, "lr": 1.6981424648911112e-06, "epoch": 0.6067532467532467, "percentage": 60.83, "elapsed_time": "9:13:19", "remaining_time": "5:56:15"}
226
+ {"current_steps": 220, "total_steps": 360, "loss": 0.336, "lr": 1.677277816809975e-06, "epoch": 0.6095238095238096, "percentage": 61.11, "elapsed_time": "9:15:36", "remaining_time": "5:53:34"}
227
+ {"current_steps": 221, "total_steps": 360, "loss": 0.368, "lr": 1.6564772380523546e-06, "epoch": 0.6122943722943723, "percentage": 61.39, "elapsed_time": "9:17:54", "remaining_time": "5:50:54"}
228
+ {"current_steps": 222, "total_steps": 360, "loss": 0.2651, "lr": 1.635742348459109e-06, "epoch": 0.6150649350649351, "percentage": 61.67, "elapsed_time": "9:20:15", "remaining_time": "5:48:16"}
229
+ {"current_steps": 223, "total_steps": 360, "loss": 0.3203, "lr": 1.6150747627555713e-06, "epoch": 0.6178354978354978, "percentage": 61.94, "elapsed_time": "9:22:40", "remaining_time": "5:45:40"}
230
+ {"current_steps": 224, "total_steps": 360, "loss": 0.3245, "lr": 1.5944760904257944e-06, "epoch": 0.6206060606060606, "percentage": 62.22, "elapsed_time": "9:25:05", "remaining_time": "5:43:05"}
231
+ {"current_steps": 225, "total_steps": 360, "loss": 0.3358, "lr": 1.5739479355872162e-06, "epoch": 0.6233766233766234, "percentage": 62.5, "elapsed_time": "9:27:27", "remaining_time": "5:40:28"}
232
+ {"current_steps": 226, "total_steps": 360, "loss": 0.2938, "lr": 1.5534918968657423e-06, "epoch": 0.6261471861471861, "percentage": 62.78, "elapsed_time": "9:29:42", "remaining_time": "5:37:47"}
233
+ {"current_steps": 227, "total_steps": 360, "loss": 0.4037, "lr": 1.5331095672712463e-06, "epoch": 0.6289177489177489, "percentage": 63.06, "elapsed_time": "9:31:56", "remaining_time": "5:35:06"}
234
+ {"current_steps": 228, "total_steps": 360, "loss": 0.3153, "lr": 1.5128025340735223e-06, "epoch": 0.6316883116883117, "percentage": 63.33, "elapsed_time": "9:34:15", "remaining_time": "5:32:27"}
235
+ {"current_steps": 229, "total_steps": 360, "loss": 0.3023, "lr": 1.4925723786786691e-06, "epoch": 0.6344588744588745, "percentage": 63.61, "elapsed_time": "9:36:35", "remaining_time": "5:29:50"}
236
+ {"current_steps": 230, "total_steps": 360, "loss": 0.2649, "lr": 1.4724206765059456e-06, "epoch": 0.6372294372294373, "percentage": 63.89, "elapsed_time": "9:39:03", "remaining_time": "5:27:17"}
237
+ {"current_steps": 231, "total_steps": 360, "loss": 0.3014, "lr": 1.4523489968650795e-06, "epoch": 0.64, "percentage": 64.17, "elapsed_time": "9:41:24", "remaining_time": "5:24:40"}
238
+ {"current_steps": 232, "total_steps": 360, "loss": 0.2185, "lr": 1.4323589028340598e-06, "epoch": 0.6427705627705628, "percentage": 64.44, "elapsed_time": "9:43:49", "remaining_time": "5:22:06"}
239
+ {"current_steps": 233, "total_steps": 360, "loss": 0.3564, "lr": 1.4124519511374158e-06, "epoch": 0.6455411255411255, "percentage": 64.72, "elapsed_time": "9:46:14", "remaining_time": "5:19:32"}
240
+ {"current_steps": 234, "total_steps": 360, "loss": 0.2441, "lr": 1.3926296920249796e-06, "epoch": 0.6483116883116883, "percentage": 65.0, "elapsed_time": "9:48:34", "remaining_time": "5:16:55"}
241
+ {"current_steps": 235, "total_steps": 360, "loss": 0.343, "lr": 1.3728936691511704e-06, "epoch": 0.651082251082251, "percentage": 65.28, "elapsed_time": "9:50:52", "remaining_time": "5:14:17"}
242
+ {"current_steps": 236, "total_steps": 360, "loss": 0.2775, "lr": 1.3532454194547734e-06, "epoch": 0.6538528138528139, "percentage": 65.56, "elapsed_time": "9:53:14", "remaining_time": "5:11:42"}
243
+ {"current_steps": 237, "total_steps": 360, "loss": 0.3229, "lr": 1.3336864730392587e-06, "epoch": 0.6566233766233767, "percentage": 65.83, "elapsed_time": "9:55:33", "remaining_time": "5:09:05"}
244
+ {"current_steps": 238, "total_steps": 360, "loss": 0.374, "lr": 1.314218353053619e-06, "epoch": 0.6593939393939394, "percentage": 66.11, "elapsed_time": "9:57:53", "remaining_time": "5:06:29"}
245
+ {"current_steps": 239, "total_steps": 360, "loss": 0.3171, "lr": 1.2948425755737592e-06, "epoch": 0.6621645021645022, "percentage": 66.39, "elapsed_time": "10:00:16", "remaining_time": "5:03:54"}
246
+ {"current_steps": 240, "total_steps": 360, "loss": 0.2457, "lr": 1.2755606494844294e-06, "epoch": 0.6649350649350649, "percentage": 66.67, "elapsed_time": "10:02:36", "remaining_time": "5:01:18"}
247
+ {"current_steps": 241, "total_steps": 360, "loss": 0.2843, "lr": 1.2563740763617198e-06, "epoch": 0.6677056277056277, "percentage": 66.94, "elapsed_time": "10:04:56", "remaining_time": "4:58:42"}
248
+ {"current_steps": 242, "total_steps": 360, "loss": 0.3154, "lr": 1.2372843503561318e-06, "epoch": 0.6704761904761904, "percentage": 67.22, "elapsed_time": "10:07:22", "remaining_time": "4:56:09"}
249
+ {"current_steps": 243, "total_steps": 360, "loss": 0.342, "lr": 1.218292958076213e-06, "epoch": 0.6732467532467532, "percentage": 67.5, "elapsed_time": "10:09:43", "remaining_time": "4:53:34"}
250
+ {"current_steps": 244, "total_steps": 360, "loss": 0.352, "lr": 1.1994013784727948e-06, "epoch": 0.6760173160173161, "percentage": 67.78, "elapsed_time": "10:11:59", "remaining_time": "4:50:56"}
251
+ {"current_steps": 245, "total_steps": 360, "loss": 0.3024, "lr": 1.180611082723814e-06, "epoch": 0.6787878787878788, "percentage": 68.06, "elapsed_time": "10:14:20", "remaining_time": "4:48:21"}
252
+ {"current_steps": 246, "total_steps": 360, "loss": 0.3034, "lr": 1.161923534119752e-06, "epoch": 0.6815584415584416, "percentage": 68.33, "elapsed_time": "10:16:43", "remaining_time": "4:45:47"}
253
+ {"current_steps": 247, "total_steps": 360, "loss": 0.2797, "lr": 1.1433401879496723e-06, "epoch": 0.6843290043290043, "percentage": 68.61, "elapsed_time": "10:19:01", "remaining_time": "4:43:11"}
254
+ {"current_steps": 248, "total_steps": 360, "loss": 0.3145, "lr": 1.1248624913878966e-06, "epoch": 0.6870995670995671, "percentage": 68.89, "elapsed_time": "10:21:29", "remaining_time": "4:40:40"}
255
+ {"current_steps": 249, "total_steps": 360, "loss": 0.3581, "lr": 1.1064918833813073e-06, "epoch": 0.6898701298701299, "percentage": 69.17, "elapsed_time": "10:23:52", "remaining_time": "4:38:06"}
256
+ {"current_steps": 250, "total_steps": 360, "loss": 0.316, "lr": 1.088229794537283e-06, "epoch": 0.6926406926406926, "percentage": 69.44, "elapsed_time": "10:26:13", "remaining_time": "4:35:32"}
257
+ {"current_steps": 251, "total_steps": 360, "loss": 0.2896, "lr": 1.0700776470122981e-06, "epoch": 0.6954112554112554, "percentage": 69.72, "elapsed_time": "10:28:33", "remaining_time": "4:32:57"}
258
+ {"current_steps": 252, "total_steps": 360, "loss": 0.2519, "lr": 1.0520368544011661e-06, "epoch": 0.6981818181818182, "percentage": 70.0, "elapsed_time": "10:30:57", "remaining_time": "4:30:24"}
259
+ {"current_steps": 252, "total_steps": 360, "eval_loss": 0.3109191656112671, "epoch": 0.6981818181818182, "percentage": 70.0, "elapsed_time": "10:37:12", "remaining_time": "4:33:05"}
260
+ {"current_steps": 253, "total_steps": 360, "loss": 0.2696, "lr": 1.0341088216269625e-06, "epoch": 0.700952380952381, "percentage": 70.28, "elapsed_time": "10:39:29", "remaining_time": "4:30:27"}
261
+ {"current_steps": 254, "total_steps": 360, "loss": 0.3508, "lr": 1.0162949448316089e-06, "epoch": 0.7037229437229438, "percentage": 70.56, "elapsed_time": "10:41:48", "remaining_time": "4:27:50"}
262
+ {"current_steps": 255, "total_steps": 360, "loss": 0.3384, "lr": 9.98596611267158e-07, "epoch": 0.7064935064935065, "percentage": 70.83, "elapsed_time": "10:44:18", "remaining_time": "4:25:18"}
263
+ {"current_steps": 256, "total_steps": 360, "loss": 0.2374, "lr": 9.81015199187753e-07, "epoch": 0.7092640692640693, "percentage": 71.11, "elapsed_time": "10:46:37", "remaining_time": "4:22:41"}
264
+ {"current_steps": 257, "total_steps": 360, "loss": 0.3161, "lr": 9.63552077742301e-07, "epoch": 0.712034632034632, "percentage": 71.39, "elapsed_time": "10:48:52", "remaining_time": "4:20:03"}
265
+ {"current_steps": 258, "total_steps": 360, "loss": 0.2752, "lr": 9.462086068678519e-07, "epoch": 0.7148051948051948, "percentage": 71.67, "elapsed_time": "10:51:15", "remaining_time": "4:17:28"}
266
+ {"current_steps": 259, "total_steps": 360, "loss": 0.3954, "lr": 9.289861371836886e-07, "epoch": 0.7175757575757575, "percentage": 71.94, "elapsed_time": "10:53:33", "remaining_time": "4:14:51"}
267
+ {"current_steps": 260, "total_steps": 360, "loss": 0.2925, "lr": 9.118860098861538e-07, "epoch": 0.7203463203463204, "percentage": 72.22, "elapsed_time": "10:55:55", "remaining_time": "4:12:16"}
268
+ {"current_steps": 261, "total_steps": 360, "loss": 0.2628, "lr": 8.949095566441985e-07, "epoch": 0.7231168831168832, "percentage": 72.5, "elapsed_time": "10:58:18", "remaining_time": "4:09:42"}
269
+ {"current_steps": 262, "total_steps": 360, "loss": 0.3599, "lr": 8.78058099495685e-07, "epoch": 0.7258874458874459, "percentage": 72.78, "elapsed_time": "11:00:36", "remaining_time": "4:07:05"}
270
+ {"current_steps": 263, "total_steps": 360, "loss": 0.3067, "lr": 8.613329507444274e-07, "epoch": 0.7286580086580087, "percentage": 73.06, "elapsed_time": "11:02:57", "remaining_time": "4:04:30"}
271
+ {"current_steps": 264, "total_steps": 360, "loss": 0.3157, "lr": 8.44735412857999e-07, "epoch": 0.7314285714285714, "percentage": 73.33, "elapsed_time": "11:05:19", "remaining_time": "4:01:56"}
272
+ {"current_steps": 265, "total_steps": 360, "loss": 0.2581, "lr": 8.282667783663056e-07, "epoch": 0.7341991341991342, "percentage": 73.61, "elapsed_time": "11:07:44", "remaining_time": "3:59:22"}
273
+ {"current_steps": 266, "total_steps": 360, "loss": 0.2609, "lr": 8.119283297609238e-07, "epoch": 0.7369696969696969, "percentage": 73.89, "elapsed_time": "11:10:02", "remaining_time": "3:56:46"}
274
+ {"current_steps": 267, "total_steps": 360, "loss": 0.2854, "lr": 7.957213393952335e-07, "epoch": 0.7397402597402597, "percentage": 74.17, "elapsed_time": "11:12:22", "remaining_time": "3:54:11"}
275
+ {"current_steps": 268, "total_steps": 360, "loss": 0.2848, "lr": 7.796470693853281e-07, "epoch": 0.7425108225108226, "percentage": 74.44, "elapsed_time": "11:14:50", "remaining_time": "3:51:39"}
276
+ {"current_steps": 269, "total_steps": 360, "loss": 0.291, "lr": 7.637067715117327e-07, "epoch": 0.7452813852813853, "percentage": 74.72, "elapsed_time": "11:17:09", "remaining_time": "3:49:04"}
277
+ {"current_steps": 270, "total_steps": 360, "loss": 0.3713, "lr": 7.479016871219174e-07, "epoch": 0.7480519480519481, "percentage": 75.0, "elapsed_time": "11:19:27", "remaining_time": "3:46:29"}
278
+ {"current_steps": 271, "total_steps": 360, "loss": 0.3042, "lr": 7.322330470336314e-07, "epoch": 0.7508225108225108, "percentage": 75.28, "elapsed_time": "11:21:47", "remaining_time": "3:43:54"}
279
+ {"current_steps": 272, "total_steps": 360, "loss": 0.2824, "lr": 7.167020714390502e-07, "epoch": 0.7535930735930736, "percentage": 75.56, "elapsed_time": "11:24:05", "remaining_time": "3:41:19"}
280
+ {"current_steps": 273, "total_steps": 360, "loss": 0.2744, "lr": 7.013099698097539e-07, "epoch": 0.7563636363636363, "percentage": 75.83, "elapsed_time": "11:26:29", "remaining_time": "3:38:46"}
281
+ {"current_steps": 274, "total_steps": 360, "loss": 0.351, "lr": 6.860579408025436e-07, "epoch": 0.7591341991341991, "percentage": 76.11, "elapsed_time": "11:29:00", "remaining_time": "3:36:15"}
282
+ {"current_steps": 275, "total_steps": 360, "loss": 0.3114, "lr": 6.709471721660904e-07, "epoch": 0.7619047619047619, "percentage": 76.39, "elapsed_time": "11:31:18", "remaining_time": "3:33:40"}
283
+ {"current_steps": 276, "total_steps": 360, "loss": 0.2748, "lr": 6.559788406484446e-07, "epoch": 0.7646753246753247, "percentage": 76.67, "elapsed_time": "11:33:35", "remaining_time": "3:31:05"}
284
+ {"current_steps": 277, "total_steps": 360, "loss": 0.2997, "lr": 6.41154111905393e-07, "epoch": 0.7674458874458875, "percentage": 76.94, "elapsed_time": "11:35:54", "remaining_time": "3:28:31"}
285
+ {"current_steps": 278, "total_steps": 360, "loss": 0.3447, "lr": 6.264741404096875e-07, "epoch": 0.7702164502164502, "percentage": 77.22, "elapsed_time": "11:38:23", "remaining_time": "3:26:00"}
286
+ {"current_steps": 279, "total_steps": 360, "loss": 0.2985, "lr": 6.119400693611358e-07, "epoch": 0.772987012987013, "percentage": 77.5, "elapsed_time": "11:40:44", "remaining_time": "3:23:26"}
287
+ {"current_steps": 280, "total_steps": 360, "loss": 0.282, "lr": 5.975530305975808e-07, "epoch": 0.7757575757575758, "percentage": 77.78, "elapsed_time": "11:43:06", "remaining_time": "3:20:53"}
288
+ {"current_steps": 281, "total_steps": 360, "loss": 0.2869, "lr": 5.833141445067541e-07, "epoch": 0.7785281385281385, "percentage": 78.06, "elapsed_time": "11:45:25", "remaining_time": "3:18:19"}
289
+ {"current_steps": 282, "total_steps": 360, "loss": 0.3249, "lr": 5.692245199390281e-07, "epoch": 0.7812987012987013, "percentage": 78.33, "elapsed_time": "11:47:43", "remaining_time": "3:15:45"}
290
+ {"current_steps": 283, "total_steps": 360, "loss": 0.3215, "lr": 5.552852541210651e-07, "epoch": 0.784069264069264, "percentage": 78.61, "elapsed_time": "11:50:03", "remaining_time": "3:13:11"}
291
+ {"current_steps": 284, "total_steps": 360, "loss": 0.2668, "lr": 5.414974325703687e-07, "epoch": 0.7868398268398269, "percentage": 78.89, "elapsed_time": "11:52:24", "remaining_time": "3:10:38"}
292
+ {"current_steps": 285, "total_steps": 360, "loss": 0.3009, "lr": 5.278621290107533e-07, "epoch": 0.7896103896103897, "percentage": 79.17, "elapsed_time": "11:54:40", "remaining_time": "3:08:04"}
293
+ {"current_steps": 286, "total_steps": 360, "loss": 0.3118, "lr": 5.143804052887228e-07, "epoch": 0.7923809523809524, "percentage": 79.44, "elapsed_time": "11:56:57", "remaining_time": "3:05:30"}
294
+ {"current_steps": 287, "total_steps": 360, "loss": 0.2868, "lr": 5.010533112907845e-07, "epoch": 0.7951515151515152, "percentage": 79.72, "elapsed_time": "11:59:19", "remaining_time": "3:02:57"}
295
+ {"current_steps": 288, "total_steps": 360, "loss": 0.2959, "lr": 4.878818848616861e-07, "epoch": 0.7979220779220779, "percentage": 80.0, "elapsed_time": "12:01:39", "remaining_time": "3:00:24"}
296
+ {"current_steps": 288, "total_steps": 360, "eval_loss": 0.3033340275287628, "epoch": 0.7979220779220779, "percentage": 80.0, "elapsed_time": "12:07:55", "remaining_time": "3:01:58"}
297
+ {"current_steps": 289, "total_steps": 360, "loss": 0.3472, "lr": 4.748671517235948e-07, "epoch": 0.8006926406926407, "percentage": 80.28, "elapsed_time": "12:10:20", "remaining_time": "2:59:25"}
298
+ {"current_steps": 290, "total_steps": 360, "loss": 0.2236, "lr": 4.620101253962206e-07, "epoch": 0.8034632034632034, "percentage": 80.56, "elapsed_time": "12:12:38", "remaining_time": "2:56:50"}
299
+ {"current_steps": 291, "total_steps": 360, "loss": 0.2312, "lr": 4.4931180711788537e-07, "epoch": 0.8062337662337662, "percentage": 80.83, "elapsed_time": "12:15:01", "remaining_time": "2:54:17"}
300
+ {"current_steps": 292, "total_steps": 360, "loss": 0.3219, "lr": 4.3677318576755693e-07, "epoch": 0.8090043290043291, "percentage": 81.11, "elapsed_time": "12:17:24", "remaining_time": "2:51:43"}
301
+ {"current_steps": 293, "total_steps": 360, "loss": 0.2972, "lr": 4.243952377878338e-07, "epoch": 0.8117748917748918, "percentage": 81.39, "elapsed_time": "12:19:50", "remaining_time": "2:49:10"}
302
+ {"current_steps": 294, "total_steps": 360, "loss": 0.2971, "lr": 4.1217892710891134e-07, "epoch": 0.8145454545454546, "percentage": 81.67, "elapsed_time": "12:22:09", "remaining_time": "2:46:36"}
303
+ {"current_steps": 295, "total_steps": 360, "loss": 0.3443, "lr": 4.001252050735102e-07, "epoch": 0.8173160173160173, "percentage": 81.94, "elapsed_time": "12:24:27", "remaining_time": "2:44:02"}
304
+ {"current_steps": 296, "total_steps": 360, "loss": 0.3395, "lr": 3.882350103627952e-07, "epoch": 0.8200865800865801, "percentage": 82.22, "elapsed_time": "12:26:51", "remaining_time": "2:41:28"}
305
+ {"current_steps": 297, "total_steps": 360, "loss": 0.2963, "lr": 3.7650926892327297e-07, "epoch": 0.8228571428571428, "percentage": 82.5, "elapsed_time": "12:29:08", "remaining_time": "2:38:54"}
306
+ {"current_steps": 298, "total_steps": 360, "loss": 0.2647, "lr": 3.649488938946844e-07, "epoch": 0.8256277056277056, "percentage": 82.78, "elapsed_time": "12:31:21", "remaining_time": "2:36:19"}
307
+ {"current_steps": 299, "total_steps": 360, "loss": 0.2934, "lr": 3.5355478553889626e-07, "epoch": 0.8283982683982684, "percentage": 83.06, "elapsed_time": "12:33:46", "remaining_time": "2:33:46"}
308
+ {"current_steps": 300, "total_steps": 360, "loss": 0.3485, "lr": 3.4232783116978976e-07, "epoch": 0.8311688311688312, "percentage": 83.33, "elapsed_time": "12:36:06", "remaining_time": "2:31:13"}
309
+ {"current_steps": 301, "total_steps": 360, "loss": 0.2369, "lr": 3.312689050841658e-07, "epoch": 0.833939393939394, "percentage": 83.61, "elapsed_time": "12:38:28", "remaining_time": "2:28:40"}
310
+ {"current_steps": 302, "total_steps": 360, "loss": 0.3186, "lr": 3.203788684936535e-07, "epoch": 0.8367099567099567, "percentage": 83.89, "elapsed_time": "12:40:53", "remaining_time": "2:26:07"}
311
+ {"current_steps": 303, "total_steps": 360, "loss": 0.3247, "lr": 3.096585694576498e-07, "epoch": 0.8394805194805195, "percentage": 84.17, "elapsed_time": "12:43:15", "remaining_time": "2:23:35"}
312
+ {"current_steps": 304, "total_steps": 360, "loss": 0.3226, "lr": 2.9910884281727225e-07, "epoch": 0.8422510822510823, "percentage": 84.44, "elapsed_time": "12:45:39", "remaining_time": "2:21:02"}
313
+ {"current_steps": 305, "total_steps": 360, "loss": 0.2805, "lr": 2.8873051013034695e-07, "epoch": 0.845021645021645, "percentage": 84.72, "elapsed_time": "12:47:58", "remaining_time": "2:18:29"}
314
+ {"current_steps": 306, "total_steps": 360, "loss": 0.2939, "lr": 2.785243796074333e-07, "epoch": 0.8477922077922078, "percentage": 85.0, "elapsed_time": "12:50:22", "remaining_time": "2:15:56"}
315
+ {"current_steps": 307, "total_steps": 360, "loss": 0.2921, "lr": 2.6849124604887836e-07, "epoch": 0.8505627705627705, "percentage": 85.28, "elapsed_time": "12:52:43", "remaining_time": "2:13:24"}
316
+ {"current_steps": 308, "total_steps": 360, "loss": 0.2882, "lr": 2.5863189078292913e-07, "epoch": 0.8533333333333334, "percentage": 85.56, "elapsed_time": "12:55:10", "remaining_time": "2:10:52"}
317
+ {"current_steps": 309, "total_steps": 360, "loss": 0.2987, "lr": 2.489470816048806e-07, "epoch": 0.8561038961038961, "percentage": 85.83, "elapsed_time": "12:57:35", "remaining_time": "2:08:20"}
318
+ {"current_steps": 310, "total_steps": 360, "loss": 0.3377, "lr": 2.3943757271728816e-07, "epoch": 0.8588744588744589, "percentage": 86.11, "elapsed_time": "12:59:56", "remaining_time": "2:05:47"}
319
+ {"current_steps": 311, "total_steps": 360, "loss": 0.3031, "lr": 2.30104104671231e-07, "epoch": 0.8616450216450217, "percentage": 86.39, "elapsed_time": "13:02:17", "remaining_time": "2:03:15"}
320
+ {"current_steps": 312, "total_steps": 360, "loss": 0.3594, "lr": 2.2094740430864569e-07, "epoch": 0.8644155844155844, "percentage": 86.67, "elapsed_time": "13:04:33", "remaining_time": "2:00:42"}
321
+ {"current_steps": 313, "total_steps": 360, "loss": 0.2976, "lr": 2.119681847057184e-07, "epoch": 0.8671861471861472, "percentage": 86.94, "elapsed_time": "13:06:55", "remaining_time": "1:58:09"}
322
+ {"current_steps": 314, "total_steps": 360, "loss": 0.2737, "lr": 2.0316714511736002e-07, "epoch": 0.8699567099567099, "percentage": 87.22, "elapsed_time": "13:09:21", "remaining_time": "1:55:38"}
323
+ {"current_steps": 315, "total_steps": 360, "loss": 0.3133, "lr": 1.9454497092274565e-07, "epoch": 0.8727272727272727, "percentage": 87.5, "elapsed_time": "13:11:38", "remaining_time": "1:53:05"}
324
+ {"current_steps": 316, "total_steps": 360, "loss": 0.3657, "lr": 1.861023335719475e-07, "epoch": 0.8754978354978356, "percentage": 87.78, "elapsed_time": "13:13:56", "remaining_time": "1:50:32"}
325
+ {"current_steps": 317, "total_steps": 360, "loss": 0.256, "lr": 1.7783989053363926e-07, "epoch": 0.8782683982683983, "percentage": 88.06, "elapsed_time": "13:16:29", "remaining_time": "1:48:02"}
326
+ {"current_steps": 318, "total_steps": 360, "loss": 0.3, "lr": 1.6975828524390116e-07, "epoch": 0.8810389610389611, "percentage": 88.33, "elapsed_time": "13:18:49", "remaining_time": "1:45:30"}
327
+ {"current_steps": 319, "total_steps": 360, "loss": 0.3225, "lr": 1.6185814705610926e-07, "epoch": 0.8838095238095238, "percentage": 88.61, "elapsed_time": "13:21:10", "remaining_time": "1:42:58"}
328
+ {"current_steps": 320, "total_steps": 360, "loss": 0.2767, "lr": 1.5414009119192635e-07, "epoch": 0.8865800865800866, "percentage": 88.89, "elapsed_time": "13:23:31", "remaining_time": "1:40:26"}
329
+ {"current_steps": 321, "total_steps": 360, "loss": 0.3119, "lr": 1.4660471869339056e-07, "epoch": 0.8893506493506493, "percentage": 89.17, "elapsed_time": "13:25:53", "remaining_time": "1:37:54"}
330
+ {"current_steps": 322, "total_steps": 360, "loss": 0.3591, "lr": 1.392526163761107e-07, "epoch": 0.8921212121212121, "percentage": 89.44, "elapsed_time": "13:28:13", "remaining_time": "1:35:22"}
331
+ {"current_steps": 323, "total_steps": 360, "loss": 0.3352, "lr": 1.3208435678356612e-07, "epoch": 0.8948917748917748, "percentage": 89.72, "elapsed_time": "13:30:31", "remaining_time": "1:32:50"}
332
+ {"current_steps": 324, "total_steps": 360, "loss": 0.2921, "lr": 1.2510049814252302e-07, "epoch": 0.8976623376623377, "percentage": 90.0, "elapsed_time": "13:32:49", "remaining_time": "1:30:18"}
333
+ {"current_steps": 324, "total_steps": 360, "eval_loss": 0.30222654342651367, "epoch": 0.8976623376623377, "percentage": 90.0, "elapsed_time": "13:39:06", "remaining_time": "1:31:00"}
334
+ {"current_steps": 325, "total_steps": 360, "loss": 0.2455, "lr": 1.1830158431955841e-07, "epoch": 0.9004329004329005, "percentage": 90.28, "elapsed_time": "13:41:33", "remaining_time": "1:28:28"}
335
+ {"current_steps": 326, "total_steps": 360, "loss": 0.3071, "lr": 1.1168814477871132e-07, "epoch": 0.9032034632034632, "percentage": 90.56, "elapsed_time": "13:43:53", "remaining_time": "1:25:55"}
336
+ {"current_steps": 327, "total_steps": 360, "loss": 0.4173, "lr": 1.0526069454024651e-07, "epoch": 0.905974025974026, "percentage": 90.83, "elapsed_time": "13:46:13", "remaining_time": "1:23:22"}
337
+ {"current_steps": 328, "total_steps": 360, "loss": 0.2767, "lr": 9.901973414055188e-08, "epoch": 0.9087445887445887, "percentage": 91.11, "elapsed_time": "13:48:31", "remaining_time": "1:20:49"}
338
+ {"current_steps": 329, "total_steps": 360, "loss": 0.2375, "lr": 9.296574959315464e-08, "epoch": 0.9115151515151515, "percentage": 91.39, "elapsed_time": "13:51:01", "remaining_time": "1:18:18"}
339
+ {"current_steps": 330, "total_steps": 360, "loss": 0.2772, "lr": 8.709921235087598e-08, "epoch": 0.9142857142857143, "percentage": 91.67, "elapsed_time": "13:53:21", "remaining_time": "1:15:45"}
340
+ {"current_steps": 331, "total_steps": 360, "loss": 0.247, "lr": 8.142057926911722e-08, "epoch": 0.917056277056277, "percentage": 91.94, "elapsed_time": "13:55:44", "remaining_time": "1:13:13"}
341
+ {"current_steps": 332, "total_steps": 360, "loss": 0.2784, "lr": 7.593029257027956e-08, "epoch": 0.9198268398268399, "percentage": 92.22, "elapsed_time": "13:58:01", "remaining_time": "1:10:40"}
342
+ {"current_steps": 333, "total_steps": 360, "loss": 0.2929, "lr": 7.062877980932914e-08, "epoch": 0.9225974025974026, "percentage": 92.5, "elapsed_time": "14:00:22", "remaining_time": "1:08:08"}
343
+ {"current_steps": 334, "total_steps": 360, "loss": 0.3399, "lr": 6.551645384049898e-08, "epoch": 0.9253679653679654, "percentage": 92.78, "elapsed_time": "14:02:45", "remaining_time": "1:05:36"}
344
+ {"current_steps": 335, "total_steps": 360, "loss": 0.2597, "lr": 6.059371278513942e-08, "epoch": 0.9281385281385282, "percentage": 93.06, "elapsed_time": "14:05:03", "remaining_time": "1:03:03"}
345
+ {"current_steps": 336, "total_steps": 360, "loss": 0.334, "lr": 5.5860940000714016e-08, "epoch": 0.9309090909090909, "percentage": 93.33, "elapsed_time": "14:07:27", "remaining_time": "1:00:31"}
346
+ {"current_steps": 337, "total_steps": 360, "loss": 0.2889, "lr": 5.131850405094535e-08, "epoch": 0.9336796536796537, "percentage": 93.61, "elapsed_time": "14:09:49", "remaining_time": "0:57:59"}
347
+ {"current_steps": 338, "total_steps": 360, "loss": 0.3371, "lr": 4.6966758677113865e-08, "epoch": 0.9364502164502164, "percentage": 93.89, "elapsed_time": "14:12:06", "remaining_time": "0:55:27"}
348
+ {"current_steps": 339, "total_steps": 360, "loss": 0.3823, "lr": 4.280604277050932e-08, "epoch": 0.9392207792207792, "percentage": 94.17, "elapsed_time": "14:14:30", "remaining_time": "0:52:56"}
349
+ {"current_steps": 340, "total_steps": 360, "loss": 0.3137, "lr": 3.88366803460416e-08, "epoch": 0.941991341991342, "percentage": 94.44, "elapsed_time": "14:16:53", "remaining_time": "0:50:24"}
350
+ {"current_steps": 341, "total_steps": 360, "loss": 0.2495, "lr": 3.505898051700596e-08, "epoch": 0.9447619047619048, "percentage": 94.72, "elapsed_time": "14:19:17", "remaining_time": "0:47:52"}
351
+ {"current_steps": 342, "total_steps": 360, "loss": 0.318, "lr": 3.147323747101222e-08, "epoch": 0.9475324675324676, "percentage": 95.0, "elapsed_time": "14:21:33", "remaining_time": "0:45:20"}
352
+ {"current_steps": 343, "total_steps": 360, "loss": 0.27, "lr": 2.8079730447073685e-08, "epoch": 0.9503030303030303, "percentage": 95.28, "elapsed_time": "14:23:51", "remaining_time": "0:42:48"}
353
+ {"current_steps": 344, "total_steps": 360, "loss": 0.2689, "lr": 2.487872371386424e-08, "epoch": 0.9530735930735931, "percentage": 95.56, "elapsed_time": "14:26:15", "remaining_time": "0:40:17"}
354
+ {"current_steps": 345, "total_steps": 360, "loss": 0.3501, "lr": 2.187046654913455e-08, "epoch": 0.9558441558441558, "percentage": 95.83, "elapsed_time": "14:28:42", "remaining_time": "0:37:46"}
355
+ {"current_steps": 346, "total_steps": 360, "loss": 0.2536, "lr": 1.9055193220302582e-08, "epoch": 0.9586147186147186, "percentage": 96.11, "elapsed_time": "14:31:02", "remaining_time": "0:35:14"}
356
+ {"current_steps": 347, "total_steps": 360, "loss": 0.3007, "lr": 1.6433122966209303e-08, "epoch": 0.9613852813852813, "percentage": 96.39, "elapsed_time": "14:33:28", "remaining_time": "0:32:43"}
357
+ {"current_steps": 348, "total_steps": 360, "loss": 0.2698, "lr": 1.4004459980045127e-08, "epoch": 0.9641558441558442, "percentage": 96.67, "elapsed_time": "14:35:48", "remaining_time": "0:30:12"}
358
+ {"current_steps": 349, "total_steps": 360, "loss": 0.2803, "lr": 1.1769393393448459e-08, "epoch": 0.966926406926407, "percentage": 96.94, "elapsed_time": "14:38:05", "remaining_time": "0:27:40"}
359
+ {"current_steps": 350, "total_steps": 360, "loss": 0.3073, "lr": 9.728097261777202e-09, "epoch": 0.9696969696969697, "percentage": 97.22, "elapsed_time": "14:40:24", "remaining_time": "0:25:09"}
360
+ {"current_steps": 351, "total_steps": 360, "loss": 0.3206, "lr": 7.88073055055516e-09, "epoch": 0.9724675324675325, "percentage": 97.5, "elapsed_time": "14:42:43", "remaining_time": "0:22:38"}
361
+ {"current_steps": 352, "total_steps": 360, "loss": 0.2676, "lr": 6.2274371230905405e-09, "epoch": 0.9752380952380952, "percentage": 97.78, "elapsed_time": "14:45:04", "remaining_time": "0:20:06"}
362
+ {"current_steps": 353, "total_steps": 360, "loss": 0.2688, "lr": 4.7683457292743705e-09, "epoch": 0.978008658008658, "percentage": 98.06, "elapsed_time": "14:47:25", "remaining_time": "0:17:35"}
363
+ {"current_steps": 354, "total_steps": 360, "loss": 0.3393, "lr": 3.503569995554068e-09, "epoch": 0.9807792207792208, "percentage": 98.33, "elapsed_time": "14:49:42", "remaining_time": "0:15:04"}
364
+ {"current_steps": 355, "total_steps": 360, "loss": 0.3643, "lr": 2.4332084160835766e-09, "epoch": 0.9835497835497835, "percentage": 98.61, "elapsed_time": "14:51:59", "remaining_time": "0:12:33"}
365
+ {"current_steps": 356, "total_steps": 360, "loss": 0.3501, "lr": 1.5573443450545012e-09, "epoch": 0.9863203463203464, "percentage": 98.89, "elapsed_time": "14:54:27", "remaining_time": "0:10:03"}
366
+ {"current_steps": 357, "total_steps": 360, "loss": 0.2598, "lr": 8.760459902037998e-10, "epoch": 0.9890909090909091, "percentage": 99.17, "elapsed_time": "14:56:44", "remaining_time": "0:07:32"}
367
+ {"current_steps": 358, "total_steps": 360, "loss": 0.2426, "lr": 3.8936640750358856e-10, "epoch": 0.9918614718614719, "percentage": 99.44, "elapsed_time": "14:59:04", "remaining_time": "0:05:01"}
368
+ {"current_steps": 359, "total_steps": 360, "loss": 0.3314, "lr": 9.734349702722468e-11, "epoch": 0.9946320346320346, "percentage": 99.72, "elapsed_time": "15:01:26", "remaining_time": "0:02:30"}
369
+ {"current_steps": 360, "total_steps": 360, "loss": 0.3122, "lr": 0.0, "epoch": 0.9974025974025974, "percentage": 100.0, "elapsed_time": "15:03:49", "remaining_time": "0:00:00"}
370
+ {"current_steps": 360, "total_steps": 360, "eval_loss": 0.30314239859580994, "epoch": 0.9974025974025974, "percentage": 100.0, "elapsed_time": "15:10:04", "remaining_time": "0:00:00"}
371
+ {"current_steps": 360, "total_steps": 360, "epoch": 0.9974025974025974, "percentage": 100.0, "elapsed_time": "15:10:42", "remaining_time": "0:00:00"}
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e7bcc35e6e40553a57ca0490c5e2e92fe31577703934c0895f05ed04f0d738a
3
+ size 7224
training_eval_loss.png ADDED
training_loss.png ADDED
training_rewards_chosen.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff