yalhessi committed on
Commit
a7758bd
·
verified ·
1 Parent(s): 1a1a9d4

End of training

Browse files
README.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: deepseek-ai/deepseek-coder-1.3b-base
5
+ tags:
6
+ - generated_from_trainer
7
+ model-index:
8
+ - name: lemexp-processed-task1_min_symbols_lemma_command_small-deepseek-coder-1.3b-base
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # lemexp-processed-task1_min_symbols_lemma_command_small-deepseek-coder-1.3b-base
16
+
17
+ This model is a fine-tuned version of [deepseek-ai/deepseek-coder-1.3b-base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base) on an unknown dataset.
18
+ It achieves the following results on the evaluation set:
19
+ - Loss: 0.4329
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.0002
39
+ - train_batch_size: 2
40
+ - eval_batch_size: 2
41
+ - seed: 42
42
+ - optimizer: OptimizerNames.ADAMW_TORCH (AdamW) with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
43
+ - lr_scheduler_type: linear
44
+ - lr_scheduler_warmup_steps: 100
45
+ - num_epochs: 6
46
+ - mixed_precision_training: Native AMP
47
+
48
+ ### Training results
49
+
50
+ | Training Loss | Epoch | Step | Validation Loss |
51
+ |:-------------:|:------:|:------:|:---------------:|
52
+ | 0.6364 | 0.2000 | 3683 | 0.6357 |
53
+ | 0.5857 | 0.4001 | 7366 | 0.5827 |
54
+ | 0.5682 | 0.6001 | 11049 | 0.5516 |
55
+ | 0.5421 | 0.8001 | 14732 | 0.5293 |
56
+ | 0.5142 | 1.0002 | 18415 | 0.5177 |
57
+ | 0.4674 | 1.2002 | 22098 | 0.5015 |
58
+ | 0.4615 | 1.4002 | 25781 | 0.5000 |
59
+ | 0.4530 | 1.6003 | 29464 | 0.4770 |
60
+ | 0.4506 | 1.8003 | 33147 | 0.4701 |
61
+ | 0.4309 | 2.0003 | 36830 | 0.4646 |
62
+ | 0.3829 | 2.2004 | 40513 | 0.4667 |
63
+ | 0.3925 | 2.4004 | 44196 | 0.4595 |
64
+ | 0.3858 | 2.6004 | 47879 | 0.4566 |
65
+ | 0.3879 | 2.8005 | 51562 | 0.4439 |
66
+ | 0.3764 | 3.0005 | 55245 | 0.4379 |
67
+ | 0.3267 | 3.2005 | 58928 | 0.4502 |
68
+ | 0.3346 | 3.4006 | 62611 | 0.4443 |
69
+ | 0.3363 | 3.6006 | 66294 | 0.4339 |
70
+ | 0.3321 | 3.8006 | 69977 | 0.4350 |
71
+ | 0.3423 | 4.0007 | 73660 | 0.4288 |
72
+ | 0.2789 | 4.2007 | 77343 | 0.4458 |
73
+ | 0.2928 | 4.4007 | 81026 | 0.4379 |
74
+ | 0.2963 | 4.6007 | 84709 | 0.4325 |
75
+ | 0.2887 | 4.8008 | 88392 | 0.4275 |
76
+ | 0.2949 | 5.0008 | 92075 | 0.4292 |
77
+ | 0.2437 | 5.2008 | 95758 | 0.4366 |
78
+ | 0.2424 | 5.4009 | 99441 | 0.4358 |
79
+ | 0.2528 | 5.6009 | 103124 | 0.4331 |
80
+ | 0.2477 | 5.8009 | 106807 | 0.4329 |
81
+
82
+
83
+ ### Framework versions
84
+
85
+ - PEFT 0.14.0
86
+ - Transformers 4.47.0
87
+ - PyTorch 2.5.1+cu124
88
+ - Datasets 3.2.0
89
+ - Tokenizers 0.21.0
adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-1.3b-base",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": false,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 32,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.05,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 8,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "q_proj",
27
+ "v_proj"
28
+ ],
29
+ "task_type": "CAUSAL_LM",
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b11dcd5831eabe187fc7cdbc3ca1732249fadda083fa0a80db66db747cc50f5a
3
+ size 531035104
trainer_state.json ADDED
@@ -0,0 +1,1814 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 6.0,
5
+ "eval_steps": 3683,
6
+ "global_step": 110472,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.027156202476645665,
13
+ "grad_norm": 1.423619031906128,
14
+ "learning_rate": 0.00019927699054107928,
15
+ "loss": 0.804,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.05431240495329133,
20
+ "grad_norm": 2.0592901706695557,
21
+ "learning_rate": 0.00019837096365020115,
22
+ "loss": 0.7258,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.081468607429937,
27
+ "grad_norm": 1.7999378442764282,
28
+ "learning_rate": 0.00019746493675932303,
29
+ "loss": 0.698,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.10862480990658266,
34
+ "grad_norm": 1.6404426097869873,
35
+ "learning_rate": 0.0001965589098684449,
36
+ "loss": 0.6863,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.13578101238322832,
41
+ "grad_norm": 2.424567222595215,
42
+ "learning_rate": 0.00019565469503134855,
43
+ "loss": 0.6638,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.162937214859874,
48
+ "grad_norm": 2.102727174758911,
49
+ "learning_rate": 0.00019475048019425216,
50
+ "loss": 0.6554,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.19009341733651966,
55
+ "grad_norm": 1.732860803604126,
56
+ "learning_rate": 0.00019384445330337407,
57
+ "loss": 0.6364,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.20003258744297198,
62
+ "eval_loss": 0.6357121467590332,
63
+ "eval_runtime": 25.1143,
64
+ "eval_samples_per_second": 14.693,
65
+ "eval_steps_per_second": 7.366,
66
+ "step": 3683
67
+ },
68
+ {
69
+ "epoch": 0.21724961981316532,
70
+ "grad_norm": 1.5829988718032837,
71
+ "learning_rate": 0.00019293842641249595,
72
+ "loss": 0.6576,
73
+ "step": 4000
74
+ },
75
+ {
76
+ "epoch": 0.244405822289811,
77
+ "grad_norm": 0.9155117869377136,
78
+ "learning_rate": 0.0001920323995216178,
79
+ "loss": 0.6241,
80
+ "step": 4500
81
+ },
82
+ {
83
+ "epoch": 0.27156202476645663,
84
+ "grad_norm": 1.6804828643798828,
85
+ "learning_rate": 0.0001911263726307397,
86
+ "loss": 0.6369,
87
+ "step": 5000
88
+ },
89
+ {
90
+ "epoch": 0.2987182272431023,
91
+ "grad_norm": 1.337156891822815,
92
+ "learning_rate": 0.00019022034573986155,
93
+ "loss": 0.6196,
94
+ "step": 5500
95
+ },
96
+ {
97
+ "epoch": 0.325874429719748,
98
+ "grad_norm": 1.680059790611267,
99
+ "learning_rate": 0.00018931431884898345,
100
+ "loss": 0.6121,
101
+ "step": 6000
102
+ },
103
+ {
104
+ "epoch": 0.35303063219639363,
105
+ "grad_norm": 2.251309871673584,
106
+ "learning_rate": 0.00018840829195810533,
107
+ "loss": 0.5984,
108
+ "step": 6500
109
+ },
110
+ {
111
+ "epoch": 0.3801868346730393,
112
+ "grad_norm": 1.7084137201309204,
113
+ "learning_rate": 0.0001875022650672272,
114
+ "loss": 0.5857,
115
+ "step": 7000
116
+ },
117
+ {
118
+ "epoch": 0.40006517488594395,
119
+ "eval_loss": 0.5826721787452698,
120
+ "eval_runtime": 24.2431,
121
+ "eval_samples_per_second": 15.221,
122
+ "eval_steps_per_second": 7.631,
123
+ "step": 7366
124
+ },
125
+ {
126
+ "epoch": 0.407343037149685,
127
+ "grad_norm": 1.7896497249603271,
128
+ "learning_rate": 0.00018659623817634909,
129
+ "loss": 0.5777,
130
+ "step": 7500
131
+ },
132
+ {
133
+ "epoch": 0.43449923962633064,
134
+ "grad_norm": 1.6899892091751099,
135
+ "learning_rate": 0.00018569021128547096,
136
+ "loss": 0.584,
137
+ "step": 8000
138
+ },
139
+ {
140
+ "epoch": 0.4616554421029763,
141
+ "grad_norm": 2.2623794078826904,
142
+ "learning_rate": 0.00018478418439459284,
143
+ "loss": 0.5731,
144
+ "step": 8500
145
+ },
146
+ {
147
+ "epoch": 0.488811644579622,
148
+ "grad_norm": 2.9157674312591553,
149
+ "learning_rate": 0.00018387996955749646,
150
+ "loss": 0.5719,
151
+ "step": 9000
152
+ },
153
+ {
154
+ "epoch": 0.5159678470562676,
155
+ "grad_norm": 1.841354489326477,
156
+ "learning_rate": 0.00018297394266661836,
157
+ "loss": 0.5765,
158
+ "step": 9500
159
+ },
160
+ {
161
+ "epoch": 0.5431240495329133,
162
+ "grad_norm": 2.550917625427246,
163
+ "learning_rate": 0.00018206791577574024,
164
+ "loss": 0.552,
165
+ "step": 10000
166
+ },
167
+ {
168
+ "epoch": 0.570280252009559,
169
+ "grad_norm": 1.5018529891967773,
170
+ "learning_rate": 0.00018116188888486211,
171
+ "loss": 0.5489,
172
+ "step": 10500
173
+ },
174
+ {
175
+ "epoch": 0.5974364544862046,
176
+ "grad_norm": 3.648230791091919,
177
+ "learning_rate": 0.00018025767404776576,
178
+ "loss": 0.5682,
179
+ "step": 11000
180
+ },
181
+ {
182
+ "epoch": 0.6000977623289159,
183
+ "eval_loss": 0.5515537858009338,
184
+ "eval_runtime": 25.2059,
185
+ "eval_samples_per_second": 14.639,
186
+ "eval_steps_per_second": 7.34,
187
+ "step": 11049
188
+ },
189
+ {
190
+ "epoch": 0.6245926569628503,
191
+ "grad_norm": 3.894047498703003,
192
+ "learning_rate": 0.0001793516471568876,
193
+ "loss": 0.5662,
194
+ "step": 11500
195
+ },
196
+ {
197
+ "epoch": 0.651748859439496,
198
+ "grad_norm": 2.1958436965942383,
199
+ "learning_rate": 0.0001784456202660095,
200
+ "loss": 0.557,
201
+ "step": 12000
202
+ },
203
+ {
204
+ "epoch": 0.6789050619161416,
205
+ "grad_norm": 1.6268092393875122,
206
+ "learning_rate": 0.00017754140542891312,
207
+ "loss": 0.5472,
208
+ "step": 12500
209
+ },
210
+ {
211
+ "epoch": 0.7060612643927873,
212
+ "grad_norm": 1.6947818994522095,
213
+ "learning_rate": 0.00017663537853803503,
214
+ "loss": 0.5487,
215
+ "step": 13000
216
+ },
217
+ {
218
+ "epoch": 0.733217466869433,
219
+ "grad_norm": 1.740544080734253,
220
+ "learning_rate": 0.00017572935164715688,
221
+ "loss": 0.5473,
222
+ "step": 13500
223
+ },
224
+ {
225
+ "epoch": 0.7603736693460786,
226
+ "grad_norm": 2.6229496002197266,
227
+ "learning_rate": 0.00017482332475627878,
228
+ "loss": 0.5306,
229
+ "step": 14000
230
+ },
231
+ {
232
+ "epoch": 0.7875298718227243,
233
+ "grad_norm": 1.760733723640442,
234
+ "learning_rate": 0.00017391729786540066,
235
+ "loss": 0.5421,
236
+ "step": 14500
237
+ },
238
+ {
239
+ "epoch": 0.8001303497718879,
240
+ "eval_loss": 0.5292674899101257,
241
+ "eval_runtime": 23.3828,
242
+ "eval_samples_per_second": 15.781,
243
+ "eval_steps_per_second": 7.912,
244
+ "step": 14732
245
+ },
246
+ {
247
+ "epoch": 0.81468607429937,
248
+ "grad_norm": 2.7095932960510254,
249
+ "learning_rate": 0.00017301308302830428,
250
+ "loss": 0.5225,
251
+ "step": 15000
252
+ },
253
+ {
254
+ "epoch": 0.8418422767760156,
255
+ "grad_norm": 3.7730860710144043,
256
+ "learning_rate": 0.00017210705613742618,
257
+ "loss": 0.536,
258
+ "step": 15500
259
+ },
260
+ {
261
+ "epoch": 0.8689984792526613,
262
+ "grad_norm": 1.8944693803787231,
263
+ "learning_rate": 0.00017120102924654803,
264
+ "loss": 0.5123,
265
+ "step": 16000
266
+ },
267
+ {
268
+ "epoch": 0.896154681729307,
269
+ "grad_norm": 2.137572765350342,
270
+ "learning_rate": 0.00017029500235566993,
271
+ "loss": 0.5411,
272
+ "step": 16500
273
+ },
274
+ {
275
+ "epoch": 0.9233108842059526,
276
+ "grad_norm": 4.163636207580566,
277
+ "learning_rate": 0.0001693889754647918,
278
+ "loss": 0.5218,
279
+ "step": 17000
280
+ },
281
+ {
282
+ "epoch": 0.9504670866825983,
283
+ "grad_norm": 2.492893934249878,
284
+ "learning_rate": 0.0001684829485739137,
285
+ "loss": 0.5171,
286
+ "step": 17500
287
+ },
288
+ {
289
+ "epoch": 0.977623289159244,
290
+ "grad_norm": 1.2668529748916626,
291
+ "learning_rate": 0.00016757692168303557,
292
+ "loss": 0.5142,
293
+ "step": 18000
294
+ },
295
+ {
296
+ "epoch": 1.00016293721486,
297
+ "eval_loss": 0.517742395401001,
298
+ "eval_runtime": 24.0099,
299
+ "eval_samples_per_second": 15.369,
300
+ "eval_steps_per_second": 7.705,
301
+ "step": 18415
302
+ },
303
+ {
304
+ "epoch": 1.0047794916358896,
305
+ "grad_norm": 4.145332336425781,
306
+ "learning_rate": 0.00016667089479215742,
307
+ "loss": 0.5248,
308
+ "step": 18500
309
+ },
310
+ {
311
+ "epoch": 1.0319356941125353,
312
+ "grad_norm": 3.0422215461730957,
313
+ "learning_rate": 0.00016576667995506109,
314
+ "loss": 0.4571,
315
+ "step": 19000
316
+ },
317
+ {
318
+ "epoch": 1.059091896589181,
319
+ "grad_norm": 2.034750461578369,
320
+ "learning_rate": 0.00016486065306418294,
321
+ "loss": 0.4635,
322
+ "step": 19500
323
+ },
324
+ {
325
+ "epoch": 1.0862480990658265,
326
+ "grad_norm": 2.047473907470703,
327
+ "learning_rate": 0.0001639564382270866,
328
+ "loss": 0.47,
329
+ "step": 20000
330
+ },
331
+ {
332
+ "epoch": 1.1134043015424724,
333
+ "grad_norm": 2.424201011657715,
334
+ "learning_rate": 0.00016305041133620845,
335
+ "loss": 0.4742,
336
+ "step": 20500
337
+ },
338
+ {
339
+ "epoch": 1.140560504019118,
340
+ "grad_norm": 2.1113667488098145,
341
+ "learning_rate": 0.00016214438444533036,
342
+ "loss": 0.4628,
343
+ "step": 21000
344
+ },
345
+ {
346
+ "epoch": 1.1677167064957636,
347
+ "grad_norm": 2.0212793350219727,
348
+ "learning_rate": 0.00016123835755445224,
349
+ "loss": 0.4636,
350
+ "step": 21500
351
+ },
352
+ {
353
+ "epoch": 1.1948729089724093,
354
+ "grad_norm": 4.672229290008545,
355
+ "learning_rate": 0.00016033233066357409,
356
+ "loss": 0.4674,
357
+ "step": 22000
358
+ },
359
+ {
360
+ "epoch": 1.2001955246578317,
361
+ "eval_loss": 0.5015310645103455,
362
+ "eval_runtime": 25.0548,
363
+ "eval_samples_per_second": 14.728,
364
+ "eval_steps_per_second": 7.384,
365
+ "step": 22098
366
+ },
367
+ {
368
+ "epoch": 1.222029111449055,
369
+ "grad_norm": 2.172687292098999,
370
+ "learning_rate": 0.000159426303772696,
371
+ "loss": 0.4583,
372
+ "step": 22500
373
+ },
374
+ {
375
+ "epoch": 1.2491853139257005,
376
+ "grad_norm": 2.1510438919067383,
377
+ "learning_rate": 0.00015852027688181784,
378
+ "loss": 0.4558,
379
+ "step": 23000
380
+ },
381
+ {
382
+ "epoch": 1.2763415164023462,
383
+ "grad_norm": 1.1689780950546265,
384
+ "learning_rate": 0.00015761424999093975,
385
+ "loss": 0.4502,
386
+ "step": 23500
387
+ },
388
+ {
389
+ "epoch": 1.303497718878992,
390
+ "grad_norm": 2.7791380882263184,
391
+ "learning_rate": 0.00015670822310006162,
392
+ "loss": 0.4451,
393
+ "step": 24000
394
+ },
395
+ {
396
+ "epoch": 1.3306539213556376,
397
+ "grad_norm": 2.7756049633026123,
398
+ "learning_rate": 0.0001558021962091835,
399
+ "loss": 0.4509,
400
+ "step": 24500
401
+ },
402
+ {
403
+ "epoch": 1.3578101238322833,
404
+ "grad_norm": 2.263340950012207,
405
+ "learning_rate": 0.00015489616931830538,
406
+ "loss": 0.4472,
407
+ "step": 25000
408
+ },
409
+ {
410
+ "epoch": 1.384966326308929,
411
+ "grad_norm": 3.0343711376190186,
412
+ "learning_rate": 0.000153991954481209,
413
+ "loss": 0.4615,
414
+ "step": 25500
415
+ },
416
+ {
417
+ "epoch": 1.4002281121008038,
418
+ "eval_loss": 0.500033438205719,
419
+ "eval_runtime": 24.7233,
420
+ "eval_samples_per_second": 14.925,
421
+ "eval_steps_per_second": 7.483,
422
+ "step": 25781
423
+ },
424
+ {
425
+ "epoch": 1.4121225287855745,
426
+ "grad_norm": 3.176940441131592,
427
+ "learning_rate": 0.0001530859275903309,
428
+ "loss": 0.4561,
429
+ "step": 26000
430
+ },
431
+ {
432
+ "epoch": 1.4392787312622204,
433
+ "grad_norm": 4.111068248748779,
434
+ "learning_rate": 0.00015217990069945275,
435
+ "loss": 0.491,
436
+ "step": 26500
437
+ },
438
+ {
439
+ "epoch": 1.466434933738866,
440
+ "grad_norm": 5.199289321899414,
441
+ "learning_rate": 0.00015127387380857465,
442
+ "loss": 0.4721,
443
+ "step": 27000
444
+ },
445
+ {
446
+ "epoch": 1.4935911362155116,
447
+ "grad_norm": 1.901997447013855,
448
+ "learning_rate": 0.00015036965897147827,
449
+ "loss": 0.4458,
450
+ "step": 27500
451
+ },
452
+ {
453
+ "epoch": 1.5207473386921573,
454
+ "grad_norm": 3.2669544219970703,
455
+ "learning_rate": 0.00014946363208060017,
456
+ "loss": 0.4313,
457
+ "step": 28000
458
+ },
459
+ {
460
+ "epoch": 1.547903541168803,
461
+ "grad_norm": 1.151863694190979,
462
+ "learning_rate": 0.00014855760518972205,
463
+ "loss": 0.4698,
464
+ "step": 28500
465
+ },
466
+ {
467
+ "epoch": 1.5750597436454488,
468
+ "grad_norm": 2.112612724304199,
469
+ "learning_rate": 0.0001476515782988439,
470
+ "loss": 0.453,
471
+ "step": 29000
472
+ },
473
+ {
474
+ "epoch": 1.6002606995437758,
475
+ "eval_loss": 0.47696688771247864,
476
+ "eval_runtime": 22.9354,
477
+ "eval_samples_per_second": 16.089,
478
+ "eval_steps_per_second": 8.066,
479
+ "step": 29464
480
+ },
481
+ {
482
+ "epoch": 1.6022159461220942,
483
+ "grad_norm": 2.558180570602417,
484
+ "learning_rate": 0.0001467455514079658,
485
+ "loss": 0.4575,
486
+ "step": 29500
487
+ },
488
+ {
489
+ "epoch": 1.62937214859874,
490
+ "grad_norm": 2.6507065296173096,
491
+ "learning_rate": 0.00014583952451708768,
492
+ "loss": 0.459,
493
+ "step": 30000
494
+ },
495
+ {
496
+ "epoch": 1.6565283510753857,
497
+ "grad_norm": 1.5638259649276733,
498
+ "learning_rate": 0.00014493349762620956,
499
+ "loss": 0.4288,
500
+ "step": 30500
501
+ },
502
+ {
503
+ "epoch": 1.6836845535520313,
504
+ "grad_norm": 3.8055946826934814,
505
+ "learning_rate": 0.00014402747073533143,
506
+ "loss": 0.4514,
507
+ "step": 31000
508
+ },
509
+ {
510
+ "epoch": 1.710840756028677,
511
+ "grad_norm": 3.0687201023101807,
512
+ "learning_rate": 0.00014312325589823508,
513
+ "loss": 0.4471,
514
+ "step": 31500
515
+ },
516
+ {
517
+ "epoch": 1.7379969585053225,
518
+ "grad_norm": 2.26448655128479,
519
+ "learning_rate": 0.0001422190410611387,
520
+ "loss": 0.4447,
521
+ "step": 32000
522
+ },
523
+ {
524
+ "epoch": 1.7651531609819684,
525
+ "grad_norm": 1.4060781002044678,
526
+ "learning_rate": 0.00014131301417026057,
527
+ "loss": 0.4361,
528
+ "step": 32500
529
+ },
530
+ {
531
+ "epoch": 1.7923093634586138,
532
+ "grad_norm": 2.3706018924713135,
533
+ "learning_rate": 0.00014040698727938247,
534
+ "loss": 0.4506,
535
+ "step": 33000
536
+ },
537
+ {
538
+ "epoch": 1.8002932869867476,
539
+ "eval_loss": 0.470061331987381,
540
+ "eval_runtime": 27.1848,
541
+ "eval_samples_per_second": 13.574,
542
+ "eval_steps_per_second": 6.805,
543
+ "step": 33147
544
+ },
545
+ {
546
+ "epoch": 1.8194655659352597,
547
+ "grad_norm": 2.880718946456909,
548
+ "learning_rate": 0.00013950096038850432,
549
+ "loss": 0.4439,
550
+ "step": 33500
551
+ },
552
+ {
553
+ "epoch": 1.8466217684119053,
554
+ "grad_norm": 1.4225813150405884,
555
+ "learning_rate": 0.000138596745551408,
556
+ "loss": 0.4434,
557
+ "step": 34000
558
+ },
559
+ {
560
+ "epoch": 1.873777970888551,
561
+ "grad_norm": 0.7051529884338379,
562
+ "learning_rate": 0.00013769071866052984,
563
+ "loss": 0.449,
564
+ "step": 34500
565
+ },
566
+ {
567
+ "epoch": 1.9009341733651968,
568
+ "grad_norm": 4.5070648193359375,
569
+ "learning_rate": 0.00013678469176965174,
570
+ "loss": 0.4356,
571
+ "step": 35000
572
+ },
573
+ {
574
+ "epoch": 1.9280903758418422,
575
+ "grad_norm": 1.354962944984436,
576
+ "learning_rate": 0.00013587866487877362,
577
+ "loss": 0.4509,
578
+ "step": 35500
579
+ },
580
+ {
581
+ "epoch": 1.955246578318488,
582
+ "grad_norm": 2.919261932373047,
583
+ "learning_rate": 0.00013497263798789547,
584
+ "loss": 0.4399,
585
+ "step": 36000
586
+ },
587
+ {
588
+ "epoch": 1.9824027807951337,
589
+ "grad_norm": 1.7376036643981934,
590
+ "learning_rate": 0.00013406661109701738,
591
+ "loss": 0.4309,
592
+ "step": 36500
593
+ },
594
+ {
595
+ "epoch": 2.00032587442972,
596
+ "eval_loss": 0.4645754098892212,
597
+ "eval_runtime": 26.5674,
598
+ "eval_samples_per_second": 13.889,
599
+ "eval_steps_per_second": 6.963,
600
+ "step": 36830
601
+ },
602
+ {
603
+ "epoch": 2.0095589832717793,
604
+ "grad_norm": 2.448807954788208,
605
+ "learning_rate": 0.000133162396259921,
606
+ "loss": 0.4074,
607
+ "step": 37000
608
+ },
609
+ {
610
+ "epoch": 2.036715185748425,
611
+ "grad_norm": 4.4545369148254395,
612
+ "learning_rate": 0.0001322563693690429,
613
+ "loss": 0.3784,
614
+ "step": 37500
615
+ },
616
+ {
617
+ "epoch": 2.0638713882250705,
618
+ "grad_norm": 1.4585567712783813,
619
+ "learning_rate": 0.0001313521545319465,
620
+ "loss": 0.3802,
621
+ "step": 38000
622
+ },
623
+ {
624
+ "epoch": 2.0910275907017164,
625
+ "grad_norm": 4.981091499328613,
626
+ "learning_rate": 0.0001304461276410684,
627
+ "loss": 0.3887,
628
+ "step": 38500
629
+ },
630
+ {
631
+ "epoch": 2.118183793178362,
632
+ "grad_norm": 1.345459222793579,
633
+ "learning_rate": 0.00012954010075019026,
634
+ "loss": 0.3791,
635
+ "step": 39000
636
+ },
637
+ {
638
+ "epoch": 2.1453399956550077,
639
+ "grad_norm": 2.339366912841797,
640
+ "learning_rate": 0.00012863407385931214,
641
+ "loss": 0.3895,
642
+ "step": 39500
643
+ },
644
+ {
645
+ "epoch": 2.172496198131653,
646
+ "grad_norm": 2.1575145721435547,
647
+ "learning_rate": 0.00012772804696843405,
648
+ "loss": 0.4046,
649
+ "step": 40000
650
+ },
651
+ {
652
+ "epoch": 2.199652400608299,
653
+ "grad_norm": 3.028726100921631,
654
+ "learning_rate": 0.0001268220200775559,
655
+ "loss": 0.3829,
656
+ "step": 40500
657
+ },
658
+ {
659
+ "epoch": 2.2003584618726917,
660
+ "eval_loss": 0.46665239334106445,
661
+ "eval_runtime": 22.994,
662
+ "eval_samples_per_second": 16.048,
663
+ "eval_steps_per_second": 8.046,
664
+ "step": 40513
665
+ },
666
+ {
667
+ "epoch": 2.2268086030849448,
668
+ "grad_norm": 2.198944330215454,
669
+ "learning_rate": 0.0001259159931866778,
670
+ "loss": 0.404,
671
+ "step": 41000
672
+ },
673
+ {
674
+ "epoch": 2.25396480556159,
675
+ "grad_norm": 1.4934983253479004,
676
+ "learning_rate": 0.00012500996629579965,
677
+ "loss": 0.3758,
678
+ "step": 41500
679
+ },
680
+ {
681
+ "epoch": 2.281121008038236,
682
+ "grad_norm": 3.0350615978240967,
683
+ "learning_rate": 0.0001241057514587033,
684
+ "loss": 0.3851,
685
+ "step": 42000
686
+ },
687
+ {
688
+ "epoch": 2.3082772105148814,
689
+ "grad_norm": 2.0013248920440674,
690
+ "learning_rate": 0.00012319972456782517,
691
+ "loss": 0.3874,
692
+ "step": 42500
693
+ },
694
+ {
695
+ "epoch": 2.3354334129915273,
696
+ "grad_norm": 3.0805039405822754,
697
+ "learning_rate": 0.00012229369767694705,
698
+ "loss": 0.383,
699
+ "step": 43000
700
+ },
701
+ {
702
+ "epoch": 2.362589615468173,
703
+ "grad_norm": 2.340902328491211,
704
+ "learning_rate": 0.00012138767078606894,
705
+ "loss": 0.3795,
706
+ "step": 43500
707
+ },
708
+ {
709
+ "epoch": 2.3897458179448186,
710
+ "grad_norm": 1.4872759580612183,
711
+ "learning_rate": 0.00012048164389519082,
712
+ "loss": 0.3925,
713
+ "step": 44000
714
+ },
715
+ {
716
+ "epoch": 2.4003910493156635,
717
+ "eval_loss": 0.45948517322540283,
718
+ "eval_runtime": 23.0689,
719
+ "eval_samples_per_second": 15.996,
720
+ "eval_steps_per_second": 8.019,
721
+ "step": 44196
722
+ },
723
+ {
724
+ "epoch": 2.4169020204214644,
725
+ "grad_norm": 2.0199170112609863,
726
+ "learning_rate": 0.00011957742905809446,
727
+ "loss": 0.387,
728
+ "step": 44500
729
+ },
730
+ {
731
+ "epoch": 2.44405822289811,
732
+ "grad_norm": 2.0993518829345703,
733
+ "learning_rate": 0.00011867140216721633,
734
+ "loss": 0.3858,
735
+ "step": 45000
736
+ },
737
+ {
738
+ "epoch": 2.4712144253747557,
739
+ "grad_norm": 2.5431594848632812,
740
+ "learning_rate": 0.0001177653752763382,
741
+ "loss": 0.3731,
742
+ "step": 45500
743
+ },
744
+ {
745
+ "epoch": 2.498370627851401,
746
+ "grad_norm": 2.0377984046936035,
747
+ "learning_rate": 0.00011685934838546009,
748
+ "loss": 0.383,
749
+ "step": 46000
750
+ },
751
+ {
752
+ "epoch": 2.525526830328047,
753
+ "grad_norm": 2.3051955699920654,
754
+ "learning_rate": 0.00011595332149458195,
755
+ "loss": 0.3865,
756
+ "step": 46500
757
+ },
758
+ {
759
+ "epoch": 2.5526830328046923,
760
+ "grad_norm": 3.8095552921295166,
761
+ "learning_rate": 0.00011504729460370384,
762
+ "loss": 0.3726,
763
+ "step": 47000
764
+ },
765
+ {
766
+ "epoch": 2.579839235281338,
767
+ "grad_norm": 2.2560086250305176,
768
+ "learning_rate": 0.00011414307976660747,
769
+ "loss": 0.3858,
770
+ "step": 47500
771
+ },
772
+ {
773
+ "epoch": 2.6004236367586357,
774
+ "eval_loss": 0.45664411783218384,
775
+ "eval_runtime": 22.9971,
776
+ "eval_samples_per_second": 16.045,
777
+ "eval_steps_per_second": 8.044,
778
+ "step": 47879
779
+ },
780
+ {
781
+ "epoch": 2.606995437757984,
782
+ "grad_norm": 2.8991200923919678,
783
+ "learning_rate": 0.00011323705287572936,
784
+ "loss": 0.383,
785
+ "step": 48000
786
+ },
787
+ {
788
+ "epoch": 2.6341516402346294,
789
+ "grad_norm": 4.307155132293701,
790
+ "learning_rate": 0.00011233102598485124,
791
+ "loss": 0.3941,
792
+ "step": 48500
793
+ },
794
+ {
795
+ "epoch": 2.6613078427112753,
796
+ "grad_norm": 3.7580649852752686,
797
+ "learning_rate": 0.0001114249990939731,
798
+ "loss": 0.385,
799
+ "step": 49000
800
+ },
801
+ {
802
+ "epoch": 2.6884640451879207,
803
+ "grad_norm": 2.604210615158081,
804
+ "learning_rate": 0.000110518972203095,
805
+ "loss": 0.3847,
806
+ "step": 49500
807
+ },
808
+ {
809
+ "epoch": 2.7156202476645666,
810
+ "grad_norm": 1.8067151308059692,
811
+ "learning_rate": 0.00010961475736599862,
812
+ "loss": 0.3775,
813
+ "step": 50000
814
+ },
815
+ {
816
+ "epoch": 2.7427764501412124,
817
+ "grad_norm": 2.4924516677856445,
818
+ "learning_rate": 0.00010870873047512051,
819
+ "loss": 0.392,
820
+ "step": 50500
821
+ },
822
+ {
823
+ "epoch": 2.769932652617858,
824
+ "grad_norm": 2.7145466804504395,
825
+ "learning_rate": 0.00010780270358424238,
826
+ "loss": 0.3817,
827
+ "step": 51000
828
+ },
829
+ {
830
+ "epoch": 2.7970888550945037,
831
+ "grad_norm": 3.6621336936950684,
832
+ "learning_rate": 0.00010689667669336427,
833
+ "loss": 0.3879,
834
+ "step": 51500
835
+ },
836
+ {
837
+ "epoch": 2.8004562242016076,
838
+ "eval_loss": 0.4439272880554199,
839
+ "eval_runtime": 22.9231,
840
+ "eval_samples_per_second": 16.097,
841
+ "eval_steps_per_second": 8.07,
842
+ "step": 51562
843
+ },
844
+ {
845
+ "epoch": 2.824245057571149,
846
+ "grad_norm": 3.2784557342529297,
847
+ "learning_rate": 0.00010599064980248614,
848
+ "loss": 0.3775,
849
+ "step": 52000
850
+ },
851
+ {
852
+ "epoch": 2.851401260047795,
853
+ "grad_norm": 2.4789769649505615,
854
+ "learning_rate": 0.00010508643496538977,
855
+ "loss": 0.3828,
856
+ "step": 52500
857
+ },
858
+ {
859
+ "epoch": 2.878557462524441,
860
+ "grad_norm": 4.17576789855957,
861
+ "learning_rate": 0.00010418040807451166,
862
+ "loss": 0.3922,
863
+ "step": 53000
864
+ },
865
+ {
866
+ "epoch": 2.905713665001086,
867
+ "grad_norm": 2.2692151069641113,
868
+ "learning_rate": 0.00010327438118363353,
869
+ "loss": 0.3684,
870
+ "step": 53500
871
+ },
872
+ {
873
+ "epoch": 2.932869867477732,
874
+ "grad_norm": 3.434340238571167,
875
+ "learning_rate": 0.00010236835429275542,
876
+ "loss": 0.3703,
877
+ "step": 54000
878
+ },
879
+ {
880
+ "epoch": 2.9600260699543774,
881
+ "grad_norm": 2.867629289627075,
882
+ "learning_rate": 0.00010146232740187728,
883
+ "loss": 0.3769,
884
+ "step": 54500
885
+ },
886
+ {
887
+ "epoch": 2.9871822724310233,
888
+ "grad_norm": 2.588996171951294,
889
+ "learning_rate": 0.00010055630051099917,
890
+ "loss": 0.3764,
891
+ "step": 55000
892
+ },
893
+ {
894
+ "epoch": 3.00048881164458,
895
+ "eval_loss": 0.43786150217056274,
896
+ "eval_runtime": 23.1256,
897
+ "eval_samples_per_second": 15.956,
898
+ "eval_steps_per_second": 8.0,
899
+ "step": 55245
900
+ },
901
+ {
902
+ "epoch": 3.0143384749076687,
903
+ "grad_norm": 2.558405876159668,
904
+ "learning_rate": 9.96520856739028e-05,
905
+ "loss": 0.3535,
906
+ "step": 55500
907
+ },
908
+ {
909
+ "epoch": 3.0414946773843146,
910
+ "grad_norm": 2.3702216148376465,
911
+ "learning_rate": 9.874605878302469e-05,
912
+ "loss": 0.3299,
913
+ "step": 56000
914
+ },
915
+ {
916
+ "epoch": 3.0686508798609604,
917
+ "grad_norm": 2.283313274383545,
918
+ "learning_rate": 9.784003189214657e-05,
919
+ "loss": 0.3366,
920
+ "step": 56500
921
+ },
922
+ {
923
+ "epoch": 3.095807082337606,
924
+ "grad_norm": 2.421048641204834,
925
+ "learning_rate": 9.693400500126845e-05,
926
+ "loss": 0.3261,
927
+ "step": 57000
928
+ },
929
+ {
930
+ "epoch": 3.1229632848142517,
931
+ "grad_norm": 2.0642685890197754,
932
+ "learning_rate": 9.602979016417207e-05,
933
+ "loss": 0.3335,
934
+ "step": 57500
935
+ },
936
+ {
937
+ "epoch": 3.150119487290897,
938
+ "grad_norm": 3.4360289573669434,
939
+ "learning_rate": 9.512376327329395e-05,
940
+ "loss": 0.3287,
941
+ "step": 58000
942
+ },
943
+ {
944
+ "epoch": 3.177275689767543,
945
+ "grad_norm": 3.9619264602661133,
946
+ "learning_rate": 9.421773638241583e-05,
947
+ "loss": 0.3267,
948
+ "step": 58500
949
+ },
950
+ {
951
+ "epoch": 3.2005213990875516,
952
+ "eval_loss": 0.4501725733280182,
953
+ "eval_runtime": 23.0376,
954
+ "eval_samples_per_second": 16.017,
955
+ "eval_steps_per_second": 8.03,
956
+ "step": 58928
957
+ },
958
+ {
959
+ "epoch": 3.2044318922441883,
960
+ "grad_norm": 2.5098698139190674,
961
+ "learning_rate": 9.331170949153772e-05,
962
+ "loss": 0.3365,
963
+ "step": 59000
964
+ },
965
+ {
966
+ "epoch": 3.231588094720834,
967
+ "grad_norm": 2.2651731967926025,
968
+ "learning_rate": 9.24056826006596e-05,
969
+ "loss": 0.3285,
970
+ "step": 59500
971
+ },
972
+ {
973
+ "epoch": 3.25874429719748,
974
+ "grad_norm": 2.573915958404541,
975
+ "learning_rate": 9.150146776356322e-05,
976
+ "loss": 0.3421,
977
+ "step": 60000
978
+ },
979
+ {
980
+ "epoch": 3.2859004996741255,
981
+ "grad_norm": 3.5748302936553955,
982
+ "learning_rate": 9.059544087268512e-05,
983
+ "loss": 0.3267,
984
+ "step": 60500
985
+ },
986
+ {
987
+ "epoch": 3.3130567021507713,
988
+ "grad_norm": 2.8185431957244873,
989
+ "learning_rate": 8.968941398180698e-05,
990
+ "loss": 0.3225,
991
+ "step": 61000
992
+ },
993
+ {
994
+ "epoch": 3.3402129046274167,
995
+ "grad_norm": 6.555810451507568,
996
+ "learning_rate": 8.878338709092886e-05,
997
+ "loss": 0.3174,
998
+ "step": 61500
999
+ },
1000
+ {
1001
+ "epoch": 3.3673691071040626,
1002
+ "grad_norm": 3.8243870735168457,
1003
+ "learning_rate": 8.787736020005073e-05,
1004
+ "loss": 0.3249,
1005
+ "step": 62000
1006
+ },
1007
+ {
1008
+ "epoch": 3.3945253095807084,
1009
+ "grad_norm": 1.514364242553711,
1010
+ "learning_rate": 8.697314536295438e-05,
1011
+ "loss": 0.3346,
1012
+ "step": 62500
1013
+ },
1014
+ {
1015
+ "epoch": 3.4005539865305234,
1016
+ "eval_loss": 0.4442519247531891,
1017
+ "eval_runtime": 22.857,
1018
+ "eval_samples_per_second": 16.144,
1019
+ "eval_steps_per_second": 8.094,
1020
+ "step": 62611
1021
+ },
1022
+ {
1023
+ "epoch": 3.421681512057354,
1024
+ "grad_norm": 2.1374149322509766,
1025
+ "learning_rate": 8.606711847207625e-05,
1026
+ "loss": 0.3231,
1027
+ "step": 63000
1028
+ },
1029
+ {
1030
+ "epoch": 3.4488377145339997,
1031
+ "grad_norm": 2.8971145153045654,
1032
+ "learning_rate": 8.516109158119814e-05,
1033
+ "loss": 0.3376,
1034
+ "step": 63500
1035
+ },
1036
+ {
1037
+ "epoch": 3.475993917010645,
1038
+ "grad_norm": 2.860117197036743,
1039
+ "learning_rate": 8.425506469032002e-05,
1040
+ "loss": 0.3295,
1041
+ "step": 64000
1042
+ },
1043
+ {
1044
+ "epoch": 3.503150119487291,
1045
+ "grad_norm": 1.976477026939392,
1046
+ "learning_rate": 8.335084985322365e-05,
1047
+ "loss": 0.3236,
1048
+ "step": 64500
1049
+ },
1050
+ {
1051
+ "epoch": 3.530306321963937,
1052
+ "grad_norm": 2.6291637420654297,
1053
+ "learning_rate": 8.244482296234553e-05,
1054
+ "loss": 0.3201,
1055
+ "step": 65000
1056
+ },
1057
+ {
1058
+ "epoch": 3.557462524440582,
1059
+ "grad_norm": 2.5785484313964844,
1060
+ "learning_rate": 8.15387960714674e-05,
1061
+ "loss": 0.3354,
1062
+ "step": 65500
1063
+ },
1064
+ {
1065
+ "epoch": 3.584618726917228,
1066
+ "grad_norm": 2.3802502155303955,
1067
+ "learning_rate": 8.063276918058928e-05,
1068
+ "loss": 0.3363,
1069
+ "step": 66000
1070
+ },
1071
+ {
1072
+ "epoch": 3.6005865739734957,
1073
+ "eval_loss": 0.43394023180007935,
1074
+ "eval_runtime": 23.107,
1075
+ "eval_samples_per_second": 15.969,
1076
+ "eval_steps_per_second": 8.006,
1077
+ "step": 66294
1078
+ },
1079
+ {
1080
+ "epoch": 3.6117749293938735,
1081
+ "grad_norm": 3.012232542037964,
1082
+ "learning_rate": 7.972674228971116e-05,
1083
+ "loss": 0.323,
1084
+ "step": 66500
1085
+ },
1086
+ {
1087
+ "epoch": 3.6389311318705193,
1088
+ "grad_norm": 2.5260913372039795,
1089
+ "learning_rate": 7.88225274526148e-05,
1090
+ "loss": 0.3316,
1091
+ "step": 67000
1092
+ },
1093
+ {
1094
+ "epoch": 3.666087334347165,
1095
+ "grad_norm": 3.0673775672912598,
1096
+ "learning_rate": 7.791650056173668e-05,
1097
+ "loss": 0.3194,
1098
+ "step": 67500
1099
+ },
1100
+ {
1101
+ "epoch": 3.6932435368238106,
1102
+ "grad_norm": 1.782955527305603,
1103
+ "learning_rate": 7.701047367085855e-05,
1104
+ "loss": 0.3268,
1105
+ "step": 68000
1106
+ },
1107
+ {
1108
+ "epoch": 3.720399739300456,
1109
+ "grad_norm": 3.0327773094177246,
1110
+ "learning_rate": 7.610444677998043e-05,
1111
+ "loss": 0.327,
1112
+ "step": 68500
1113
+ },
1114
+ {
1115
+ "epoch": 3.747555941777102,
1116
+ "grad_norm": 4.625910758972168,
1117
+ "learning_rate": 7.520023194288407e-05,
1118
+ "loss": 0.3231,
1119
+ "step": 69000
1120
+ },
1121
+ {
1122
+ "epoch": 3.7747121442537477,
1123
+ "grad_norm": 2.987931966781616,
1124
+ "learning_rate": 7.429420505200595e-05,
1125
+ "loss": 0.3321,
1126
+ "step": 69500
1127
+ },
1128
+ {
1129
+ "epoch": 3.8006191614164675,
1130
+ "eval_loss": 0.43500107526779175,
1131
+ "eval_runtime": 22.946,
1132
+ "eval_samples_per_second": 16.081,
1133
+ "eval_steps_per_second": 8.062,
1134
+ "step": 69977
1135
+ },
1136
+ {
1137
+ "epoch": 3.801868346730393,
1138
+ "grad_norm": 3.8928215503692627,
1139
+ "learning_rate": 7.338817816112783e-05,
1140
+ "loss": 0.3387,
1141
+ "step": 70000
1142
+ },
1143
+ {
1144
+ "epoch": 3.829024549207039,
1145
+ "grad_norm": 2.32753586769104,
1146
+ "learning_rate": 7.24821512702497e-05,
1147
+ "loss": 0.3327,
1148
+ "step": 70500
1149
+ },
1150
+ {
1151
+ "epoch": 3.8561807516836843,
1152
+ "grad_norm": 2.5396571159362793,
1153
+ "learning_rate": 7.157793643315333e-05,
1154
+ "loss": 0.3251,
1155
+ "step": 71000
1156
+ },
1157
+ {
1158
+ "epoch": 3.88333695416033,
1159
+ "grad_norm": 2.509148597717285,
1160
+ "learning_rate": 7.067190954227521e-05,
1161
+ "loss": 0.3225,
1162
+ "step": 71500
1163
+ },
1164
+ {
1165
+ "epoch": 3.910493156636976,
1166
+ "grad_norm": 1.7930841445922852,
1167
+ "learning_rate": 6.97658826513971e-05,
1168
+ "loss": 0.3392,
1169
+ "step": 72000
1170
+ },
1171
+ {
1172
+ "epoch": 3.9376493591136215,
1173
+ "grad_norm": 2.579759120941162,
1174
+ "learning_rate": 6.885985576051898e-05,
1175
+ "loss": 0.3415,
1176
+ "step": 72500
1177
+ },
1178
+ {
1179
+ "epoch": 3.9648055615902673,
1180
+ "grad_norm": 4.053764820098877,
1181
+ "learning_rate": 6.795564092342262e-05,
1182
+ "loss": 0.3373,
1183
+ "step": 73000
1184
+ },
1185
+ {
1186
+ "epoch": 3.9919617640669127,
1187
+ "grad_norm": 2.3885462284088135,
1188
+ "learning_rate": 6.70496140325445e-05,
1189
+ "loss": 0.3423,
1190
+ "step": 73500
1191
+ },
1192
+ {
1193
+ "epoch": 4.00065174885944,
1194
+ "eval_loss": 0.42881426215171814,
1195
+ "eval_runtime": 23.0588,
1196
+ "eval_samples_per_second": 16.003,
1197
+ "eval_steps_per_second": 8.023,
1198
+ "step": 73660
1199
+ },
1200
+ {
1201
+ "epoch": 4.019117966543559,
1202
+ "grad_norm": 1.8718838691711426,
1203
+ "learning_rate": 6.614358714166636e-05,
1204
+ "loss": 0.2902,
1205
+ "step": 74000
1206
+ },
1207
+ {
1208
+ "epoch": 4.046274169020204,
1209
+ "grad_norm": 3.1479783058166504,
1210
+ "learning_rate": 6.523756025078824e-05,
1211
+ "loss": 0.2817,
1212
+ "step": 74500
1213
+ },
1214
+ {
1215
+ "epoch": 4.07343037149685,
1216
+ "grad_norm": 2.8043808937072754,
1217
+ "learning_rate": 6.433153335991013e-05,
1218
+ "loss": 0.28,
1219
+ "step": 75000
1220
+ },
1221
+ {
1222
+ "epoch": 4.100586573973495,
1223
+ "grad_norm": 0.6163878440856934,
1224
+ "learning_rate": 6.342550646903201e-05,
1225
+ "loss": 0.283,
1226
+ "step": 75500
1227
+ },
1228
+ {
1229
+ "epoch": 4.127742776450141,
1230
+ "grad_norm": 1.6441878080368042,
1231
+ "learning_rate": 6.252129163193563e-05,
1232
+ "loss": 0.2731,
1233
+ "step": 76000
1234
+ },
1235
+ {
1236
+ "epoch": 4.154898978926787,
1237
+ "grad_norm": 3.012065887451172,
1238
+ "learning_rate": 6.161526474105753e-05,
1239
+ "loss": 0.2757,
1240
+ "step": 76500
1241
+ },
1242
+ {
1243
+ "epoch": 4.182055181403433,
1244
+ "grad_norm": 2.1326332092285156,
1245
+ "learning_rate": 6.07092378501794e-05,
1246
+ "loss": 0.2789,
1247
+ "step": 77000
1248
+ },
1249
+ {
1250
+ "epoch": 4.200684336302412,
1251
+ "eval_loss": 0.44576430320739746,
1252
+ "eval_runtime": 23.0355,
1253
+ "eval_samples_per_second": 16.019,
1254
+ "eval_steps_per_second": 8.031,
1255
+ "step": 77343
1256
+ },
1257
+ {
1258
+ "epoch": 4.209211383880078,
1259
+ "grad_norm": 3.3734445571899414,
1260
+ "learning_rate": 5.9803210959301273e-05,
1261
+ "loss": 0.2729,
1262
+ "step": 77500
1263
+ },
1264
+ {
1265
+ "epoch": 4.236367586356724,
1266
+ "grad_norm": 2.7482869625091553,
1267
+ "learning_rate": 5.889718406842315e-05,
1268
+ "loss": 0.2924,
1269
+ "step": 78000
1270
+ },
1271
+ {
1272
+ "epoch": 4.2635237888333695,
1273
+ "grad_norm": 2.5796825885772705,
1274
+ "learning_rate": 5.799115717754503e-05,
1275
+ "loss": 0.2843,
1276
+ "step": 78500
1277
+ },
1278
+ {
1279
+ "epoch": 4.290679991310015,
1280
+ "grad_norm": 3.74029541015625,
1281
+ "learning_rate": 5.708513028666691e-05,
1282
+ "loss": 0.2889,
1283
+ "step": 79000
1284
+ },
1285
+ {
1286
+ "epoch": 4.317836193786661,
1287
+ "grad_norm": 3.763978958129883,
1288
+ "learning_rate": 5.617910339578879e-05,
1289
+ "loss": 0.2812,
1290
+ "step": 79500
1291
+ },
1292
+ {
1293
+ "epoch": 4.344992396263306,
1294
+ "grad_norm": 2.851184844970703,
1295
+ "learning_rate": 5.527488855869243e-05,
1296
+ "loss": 0.283,
1297
+ "step": 80000
1298
+ },
1299
+ {
1300
+ "epoch": 4.372148598739952,
1301
+ "grad_norm": 3.071202278137207,
1302
+ "learning_rate": 5.436886166781431e-05,
1303
+ "loss": 0.2911,
1304
+ "step": 80500
1305
+ },
1306
+ {
1307
+ "epoch": 4.399304801216598,
1308
+ "grad_norm": 3.962803602218628,
1309
+ "learning_rate": 5.3464646830717936e-05,
1310
+ "loss": 0.2928,
1311
+ "step": 81000
1312
+ },
1313
+ {
1314
+ "epoch": 4.400716923745383,
1315
+ "eval_loss": 0.4378789961338043,
1316
+ "eval_runtime": 22.9566,
1317
+ "eval_samples_per_second": 16.074,
1318
+ "eval_steps_per_second": 8.059,
1319
+ "step": 81026
1320
+ },
1321
+ {
1322
+ "epoch": 4.426461003693244,
1323
+ "grad_norm": 2.5465190410614014,
1324
+ "learning_rate": 5.2558619939839814e-05,
1325
+ "loss": 0.269,
1326
+ "step": 81500
1327
+ },
1328
+ {
1329
+ "epoch": 4.4536172061698895,
1330
+ "grad_norm": 3.322237491607666,
1331
+ "learning_rate": 5.16525930489617e-05,
1332
+ "loss": 0.2883,
1333
+ "step": 82000
1334
+ },
1335
+ {
1336
+ "epoch": 4.4807734086465345,
1337
+ "grad_norm": 1.5292987823486328,
1338
+ "learning_rate": 5.0746566158083575e-05,
1339
+ "loss": 0.2796,
1340
+ "step": 82500
1341
+ },
1342
+ {
1343
+ "epoch": 4.50792961112318,
1344
+ "grad_norm": 2.0258724689483643,
1345
+ "learning_rate": 4.984053926720545e-05,
1346
+ "loss": 0.2766,
1347
+ "step": 83000
1348
+ },
1349
+ {
1350
+ "epoch": 4.535085813599826,
1351
+ "grad_norm": 2.583266019821167,
1352
+ "learning_rate": 4.893451237632733e-05,
1353
+ "loss": 0.2975,
1354
+ "step": 83500
1355
+ },
1356
+ {
1357
+ "epoch": 4.562242016076472,
1358
+ "grad_norm": 2.7614002227783203,
1359
+ "learning_rate": 4.802848548544921e-05,
1360
+ "loss": 0.2846,
1361
+ "step": 84000
1362
+ },
1363
+ {
1364
+ "epoch": 4.589398218553118,
1365
+ "grad_norm": 4.259634971618652,
1366
+ "learning_rate": 4.712245859457109e-05,
1367
+ "loss": 0.2963,
1368
+ "step": 84500
1369
+ },
1370
+ {
1371
+ "epoch": 4.600749511188355,
1372
+ "eval_loss": 0.43254056572914124,
1373
+ "eval_runtime": 22.8989,
1374
+ "eval_samples_per_second": 16.114,
1375
+ "eval_steps_per_second": 8.079,
1376
+ "step": 84709
1377
+ },
1378
+ {
1379
+ "epoch": 4.616554421029763,
1380
+ "grad_norm": 1.8035340309143066,
1381
+ "learning_rate": 4.621643170369297e-05,
1382
+ "loss": 0.2854,
1383
+ "step": 85000
1384
+ },
1385
+ {
1386
+ "epoch": 4.643710623506409,
1387
+ "grad_norm": 3.2322275638580322,
1388
+ "learning_rate": 4.53122168665966e-05,
1389
+ "loss": 0.287,
1390
+ "step": 85500
1391
+ },
1392
+ {
1393
+ "epoch": 4.670866825983055,
1394
+ "grad_norm": 7.430004119873047,
1395
+ "learning_rate": 4.440618997571848e-05,
1396
+ "loss": 0.2805,
1397
+ "step": 86000
1398
+ },
1399
+ {
1400
+ "epoch": 4.6980230284597,
1401
+ "grad_norm": 2.2691986560821533,
1402
+ "learning_rate": 4.3500163084840364e-05,
1403
+ "loss": 0.2874,
1404
+ "step": 86500
1405
+ },
1406
+ {
1407
+ "epoch": 4.725179230936346,
1408
+ "grad_norm": 2.7627906799316406,
1409
+ "learning_rate": 4.2594136193962235e-05,
1410
+ "loss": 0.2818,
1411
+ "step": 87000
1412
+ },
1413
+ {
1414
+ "epoch": 4.752335433412991,
1415
+ "grad_norm": 3.7362864017486572,
1416
+ "learning_rate": 4.1689921356865876e-05,
1417
+ "loss": 0.2827,
1418
+ "step": 87500
1419
+ },
1420
+ {
1421
+ "epoch": 4.779491635889637,
1422
+ "grad_norm": 4.409236907958984,
1423
+ "learning_rate": 4.0783894465987754e-05,
1424
+ "loss": 0.2887,
1425
+ "step": 88000
1426
+ },
1427
+ {
1428
+ "epoch": 4.800782098631327,
1429
+ "eval_loss": 0.42746320366859436,
1430
+ "eval_runtime": 23.0563,
1431
+ "eval_samples_per_second": 16.004,
1432
+ "eval_steps_per_second": 8.024,
1433
+ "step": 88392
1434
+ },
1435
+ {
1436
+ "epoch": 4.806647838366283,
1437
+ "grad_norm": 4.065585136413574,
1438
+ "learning_rate": 3.987786757510963e-05,
1439
+ "loss": 0.2905,
1440
+ "step": 88500
1441
+ },
1442
+ {
1443
+ "epoch": 4.833804040842929,
1444
+ "grad_norm": 3.655996799468994,
1445
+ "learning_rate": 3.897184068423151e-05,
1446
+ "loss": 0.2716,
1447
+ "step": 89000
1448
+ },
1449
+ {
1450
+ "epoch": 4.860960243319575,
1451
+ "grad_norm": 4.297955513000488,
1452
+ "learning_rate": 3.806762584713515e-05,
1453
+ "loss": 0.29,
1454
+ "step": 89500
1455
+ },
1456
+ {
1457
+ "epoch": 4.88811644579622,
1458
+ "grad_norm": 3.1703717708587646,
1459
+ "learning_rate": 3.716159895625702e-05,
1460
+ "loss": 0.2754,
1461
+ "step": 90000
1462
+ },
1463
+ {
1464
+ "epoch": 4.9152726482728655,
1465
+ "grad_norm": 3.771336078643799,
1466
+ "learning_rate": 3.62555720653789e-05,
1467
+ "loss": 0.2839,
1468
+ "step": 90500
1469
+ },
1470
+ {
1471
+ "epoch": 4.942428850749511,
1472
+ "grad_norm": 3.908500909805298,
1473
+ "learning_rate": 3.534954517450078e-05,
1474
+ "loss": 0.2744,
1475
+ "step": 91000
1476
+ },
1477
+ {
1478
+ "epoch": 4.969585053226157,
1479
+ "grad_norm": 3.199415445327759,
1480
+ "learning_rate": 3.444351828362266e-05,
1481
+ "loss": 0.2834,
1482
+ "step": 91500
1483
+ },
1484
+ {
1485
+ "epoch": 4.996741255702802,
1486
+ "grad_norm": 3.1083319187164307,
1487
+ "learning_rate": 3.3539303446526294e-05,
1488
+ "loss": 0.2949,
1489
+ "step": 92000
1490
+ },
1491
+ {
1492
+ "epoch": 5.0008146860743,
1493
+ "eval_loss": 0.4291832447052002,
1494
+ "eval_runtime": 23.525,
1495
+ "eval_samples_per_second": 15.685,
1496
+ "eval_steps_per_second": 7.864,
1497
+ "step": 92075
1498
+ },
1499
+ {
1500
+ "epoch": 5.023897458179448,
1501
+ "grad_norm": 6.121253490447998,
1502
+ "learning_rate": 3.263327655564817e-05,
1503
+ "loss": 0.2289,
1504
+ "step": 92500
1505
+ },
1506
+ {
1507
+ "epoch": 5.051053660656094,
1508
+ "grad_norm": 2.5016486644744873,
1509
+ "learning_rate": 3.1727249664770055e-05,
1510
+ "loss": 0.248,
1511
+ "step": 93000
1512
+ },
1513
+ {
1514
+ "epoch": 5.07820986313274,
1515
+ "grad_norm": 2.344914197921753,
1516
+ "learning_rate": 3.0821222773891926e-05,
1517
+ "loss": 0.2315,
1518
+ "step": 93500
1519
+ },
1520
+ {
1521
+ "epoch": 5.1053660656093856,
1522
+ "grad_norm": 3.519299268722534,
1523
+ "learning_rate": 2.9917007936795567e-05,
1524
+ "loss": 0.2516,
1525
+ "step": 94000
1526
+ },
1527
+ {
1528
+ "epoch": 5.1325222680860305,
1529
+ "grad_norm": 3.192281484603882,
1530
+ "learning_rate": 2.9010981045917445e-05,
1531
+ "loss": 0.2368,
1532
+ "step": 94500
1533
+ },
1534
+ {
1535
+ "epoch": 5.159678470562676,
1536
+ "grad_norm": 3.7645487785339355,
1537
+ "learning_rate": 2.8104954155039322e-05,
1538
+ "loss": 0.2573,
1539
+ "step": 95000
1540
+ },
1541
+ {
1542
+ "epoch": 5.186834673039322,
1543
+ "grad_norm": 4.5175275802612305,
1544
+ "learning_rate": 2.71989272641612e-05,
1545
+ "loss": 0.2437,
1546
+ "step": 95500
1547
+ },
1548
+ {
1549
+ "epoch": 5.2008472735172715,
1550
+ "eval_loss": 0.4366357922554016,
1551
+ "eval_runtime": 23.1107,
1552
+ "eval_samples_per_second": 15.967,
1553
+ "eval_steps_per_second": 8.005,
1554
+ "step": 95758
1555
+ },
1556
+ {
1557
+ "epoch": 5.213990875515968,
1558
+ "grad_norm": 4.234988212585449,
1559
+ "learning_rate": 2.629290037328308e-05,
1560
+ "loss": 0.2439,
1561
+ "step": 96000
1562
+ },
1563
+ {
1564
+ "epoch": 5.241147077992614,
1565
+ "grad_norm": 3.174309492111206,
1566
+ "learning_rate": 2.538687348240496e-05,
1567
+ "loss": 0.2523,
1568
+ "step": 96500
1569
+ },
1570
+ {
1571
+ "epoch": 5.268303280469259,
1572
+ "grad_norm": 3.7519733905792236,
1573
+ "learning_rate": 2.4480846591526838e-05,
1574
+ "loss": 0.2463,
1575
+ "step": 97000
1576
+ },
1577
+ {
1578
+ "epoch": 5.295459482945905,
1579
+ "grad_norm": 2.9701130390167236,
1580
+ "learning_rate": 2.357481970064872e-05,
1581
+ "loss": 0.2519,
1582
+ "step": 97500
1583
+ },
1584
+ {
1585
+ "epoch": 5.322615685422551,
1586
+ "grad_norm": 5.130082130432129,
1587
+ "learning_rate": 2.2672416917334107e-05,
1588
+ "loss": 0.2486,
1589
+ "step": 98000
1590
+ },
1591
+ {
1592
+ "epoch": 5.349771887899196,
1593
+ "grad_norm": 3.390826463699341,
1594
+ "learning_rate": 2.1766390026455985e-05,
1595
+ "loss": 0.2478,
1596
+ "step": 98500
1597
+ },
1598
+ {
1599
+ "epoch": 5.376928090375841,
1600
+ "grad_norm": 2.6151483058929443,
1601
+ "learning_rate": 2.0860363135577865e-05,
1602
+ "loss": 0.2424,
1603
+ "step": 99000
1604
+ },
1605
+ {
1606
+ "epoch": 5.400879860960243,
1607
+ "eval_loss": 0.43580135703086853,
1608
+ "eval_runtime": 23.7346,
1609
+ "eval_samples_per_second": 15.547,
1610
+ "eval_steps_per_second": 7.795,
1611
+ "step": 99441
1612
+ },
1613
+ {
1614
+ "epoch": 5.404084292852487,
1615
+ "grad_norm": 3.701735496520996,
1616
+ "learning_rate": 1.9954336244699743e-05,
1617
+ "loss": 0.2443,
1618
+ "step": 99500
1619
+ },
1620
+ {
1621
+ "epoch": 5.431240495329133,
1622
+ "grad_norm": 3.8400754928588867,
1623
+ "learning_rate": 1.9048309353821623e-05,
1624
+ "loss": 0.2276,
1625
+ "step": 100000
1626
+ },
1627
+ {
1628
+ "epoch": 5.458396697805779,
1629
+ "grad_norm": 2.5460264682769775,
1630
+ "learning_rate": 1.81422824629435e-05,
1631
+ "loss": 0.2313,
1632
+ "step": 100500
1633
+ },
1634
+ {
1635
+ "epoch": 5.485552900282425,
1636
+ "grad_norm": 5.040457725524902,
1637
+ "learning_rate": 1.7236255572065378e-05,
1638
+ "loss": 0.238,
1639
+ "step": 101000
1640
+ },
1641
+ {
1642
+ "epoch": 5.51270910275907,
1643
+ "grad_norm": 4.061932563781738,
1644
+ "learning_rate": 1.633022868118726e-05,
1645
+ "loss": 0.2558,
1646
+ "step": 101500
1647
+ },
1648
+ {
1649
+ "epoch": 5.539865305235716,
1650
+ "grad_norm": 4.28571081161499,
1651
+ "learning_rate": 1.5424201790309136e-05,
1652
+ "loss": 0.2531,
1653
+ "step": 102000
1654
+ },
1655
+ {
1656
+ "epoch": 5.5670215077123615,
1657
+ "grad_norm": 4.26746129989624,
1658
+ "learning_rate": 1.4519986953212772e-05,
1659
+ "loss": 0.2487,
1660
+ "step": 102500
1661
+ },
1662
+ {
1663
+ "epoch": 5.594177710189007,
1664
+ "grad_norm": 1.4005869626998901,
1665
+ "learning_rate": 1.3613960062334651e-05,
1666
+ "loss": 0.2528,
1667
+ "step": 103000
1668
+ },
1669
+ {
1670
+ "epoch": 5.600912448403215,
1671
+ "eval_loss": 0.4331228733062744,
1672
+ "eval_runtime": 25.1727,
1673
+ "eval_samples_per_second": 14.659,
1674
+ "eval_steps_per_second": 7.349,
1675
+ "step": 103124
1676
+ },
1677
+ {
1678
+ "epoch": 5.621333912665653,
1679
+ "grad_norm": 3.8620026111602783,
1680
+ "learning_rate": 1.2707933171456529e-05,
1681
+ "loss": 0.248,
1682
+ "step": 103500
1683
+ },
1684
+ {
1685
+ "epoch": 5.648490115142298,
1686
+ "grad_norm": 4.398037433624268,
1687
+ "learning_rate": 1.1803718334360163e-05,
1688
+ "loss": 0.2394,
1689
+ "step": 104000
1690
+ },
1691
+ {
1692
+ "epoch": 5.675646317618944,
1693
+ "grad_norm": 2.4203145503997803,
1694
+ "learning_rate": 1.0897691443482042e-05,
1695
+ "loss": 0.2344,
1696
+ "step": 104500
1697
+ },
1698
+ {
1699
+ "epoch": 5.70280252009559,
1700
+ "grad_norm": 3.2735469341278076,
1701
+ "learning_rate": 9.991664552603922e-06,
1702
+ "loss": 0.2391,
1703
+ "step": 105000
1704
+ },
1705
+ {
1706
+ "epoch": 5.729958722572236,
1707
+ "grad_norm": 3.202352523803711,
1708
+ "learning_rate": 9.0856376617258e-06,
1709
+ "loss": 0.2503,
1710
+ "step": 105500
1711
+ },
1712
+ {
1713
+ "epoch": 5.757114925048882,
1714
+ "grad_norm": 2.457843065261841,
1715
+ "learning_rate": 8.17961077084768e-06,
1716
+ "loss": 0.233,
1717
+ "step": 106000
1718
+ },
1719
+ {
1720
+ "epoch": 5.7842711275255265,
1721
+ "grad_norm": 2.1440610885620117,
1722
+ "learning_rate": 7.273583879969558e-06,
1723
+ "loss": 0.2477,
1724
+ "step": 106500
1725
+ },
1726
+ {
1727
+ "epoch": 5.800945035846187,
1728
+ "eval_loss": 0.43289270997047424,
1729
+ "eval_runtime": 25.7135,
1730
+ "eval_samples_per_second": 14.35,
1731
+ "eval_steps_per_second": 7.195,
1732
+ "step": 106807
1733
+ },
1734
+ {
1735
+ "epoch": 5.811427330002172,
1736
+ "grad_norm": 2.6855876445770264,
1737
+ "learning_rate": 6.367556989091436e-06,
1738
+ "loss": 0.231,
1739
+ "step": 107000
1740
+ },
1741
+ {
1742
+ "epoch": 5.838583532478818,
1743
+ "grad_norm": 5.511388778686523,
1744
+ "learning_rate": 5.461530098213316e-06,
1745
+ "loss": 0.2399,
1746
+ "step": 107500
1747
+ },
1748
+ {
1749
+ "epoch": 5.865739734955464,
1750
+ "grad_norm": 2.992866277694702,
1751
+ "learning_rate": 4.555503207335194e-06,
1752
+ "loss": 0.2367,
1753
+ "step": 108000
1754
+ },
1755
+ {
1756
+ "epoch": 5.89289593743211,
1757
+ "grad_norm": 2.2536861896514893,
1758
+ "learning_rate": 3.651288370238829e-06,
1759
+ "loss": 0.2545,
1760
+ "step": 108500
1761
+ },
1762
+ {
1763
+ "epoch": 5.920052139908755,
1764
+ "grad_norm": 3.6174511909484863,
1765
+ "learning_rate": 2.745261479360707e-06,
1766
+ "loss": 0.2576,
1767
+ "step": 109000
1768
+ },
1769
+ {
1770
+ "epoch": 5.947208342385401,
1771
+ "grad_norm": 2.4859135150909424,
1772
+ "learning_rate": 1.8392345884825864e-06,
1773
+ "loss": 0.2448,
1774
+ "step": 109500
1775
+ },
1776
+ {
1777
+ "epoch": 5.974364544862047,
1778
+ "grad_norm": 1.783007025718689,
1779
+ "learning_rate": 9.350197513862211e-07,
1780
+ "loss": 0.2347,
1781
+ "step": 110000
1782
+ },
1783
+ {
1784
+ "epoch": 6.0,
1785
+ "step": 110472,
1786
+ "total_flos": 7.299634402197504e+17,
1787
+ "train_loss": 0.3804842073002838,
1788
+ "train_runtime": 59722.3514,
1789
+ "train_samples_per_second": 3.699,
1790
+ "train_steps_per_second": 1.85
1791
+ }
1792
+ ],
1793
+ "logging_steps": 500,
1794
+ "max_steps": 110472,
1795
+ "num_input_tokens_seen": 0,
1796
+ "num_train_epochs": 6,
1797
+ "save_steps": 500,
1798
+ "stateful_callbacks": {
1799
+ "TrainerControl": {
1800
+ "args": {
1801
+ "should_epoch_stop": false,
1802
+ "should_evaluate": false,
1803
+ "should_log": false,
1804
+ "should_save": true,
1805
+ "should_training_stop": true
1806
+ },
1807
+ "attributes": {}
1808
+ }
1809
+ },
1810
+ "total_flos": 7.299634402197504e+17,
1811
+ "train_batch_size": 2,
1812
+ "trial_name": null,
1813
+ "trial_params": null
1814
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec7d042b93d73031dccc443da2cb5446fc90da6638fc7798f7d6525c7d5af74e
3
+ size 5496