nielsbantilan committed on
Commit
45c5b3c
·
1 Parent(s): 873008c

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,16 +1,34 @@
1
  ---
2
- datasets:
3
- - yahma/alpaca-cleaned
4
- language:
5
- - en
6
- license: apache-2.0
7
- tags:
8
- - pytorch
9
- - causal-lm
10
- - llama2
11
- - fine-tuning
12
- - alpaca
13
-
14
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- # Llama-2-13b fine-tuned on LoRA alpaca-cleaned
 
1
  ---
2
+ library_name: peft
 
 
 
 
 
 
 
 
 
 
 
3
  ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - load_in_8bit: False
10
+ - load_in_4bit: True
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: True
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: nf4
16
+ - bnb_4bit_use_double_quant: True
17
+ - bnb_4bit_compute_dtype: bfloat16
18
+
19
+ The following `bitsandbytes` quantization config was used during training:
20
+ - quant_method: bitsandbytes
21
+ - load_in_8bit: False
22
+ - load_in_4bit: True
23
+ - llm_int8_threshold: 6.0
24
+ - llm_int8_skip_modules: None
25
+ - llm_int8_enable_fp32_cpu_offload: True
26
+ - llm_int8_has_fp16_weight: False
27
+ - bnb_4bit_quant_type: nf4
28
+ - bnb_4bit_use_double_quant: True
29
+ - bnb_4bit_compute_dtype: bfloat16
30
+ ### Framework versions
31
+
32
+ - PEFT 0.5.0.dev0
33
 
34
+ - PEFT 0.5.0.dev0
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e97c31326c925c036c6a23bebeb87b2b240c8a9f474c195749040c6e09879feb
3
- size 39407821
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406
3
+ size 443
checkpoint-300/README.md CHANGED
@@ -5,15 +5,16 @@ library_name: peft
5
 
6
 
7
  The following `bitsandbytes` quantization config was used during training:
8
- - load_in_8bit: True
9
- - load_in_4bit: False
 
10
  - llm_int8_threshold: 6.0
11
  - llm_int8_skip_modules: None
12
- - llm_int8_enable_fp32_cpu_offload: False
13
  - llm_int8_has_fp16_weight: False
14
- - bnb_4bit_quant_type: fp4
15
- - bnb_4bit_use_double_quant: False
16
- - bnb_4bit_compute_dtype: float32
17
  ### Framework versions
18
 
19
 
 
5
 
6
 
7
  The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - load_in_8bit: False
10
+ - load_in_4bit: True
11
  - llm_int8_threshold: 6.0
12
  - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: True
14
  - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: nf4
16
+ - bnb_4bit_use_double_quant: True
17
+ - bnb_4bit_compute_dtype: bfloat16
18
  ### Framework versions
19
 
20
 
checkpoint-300/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb5f0344200dca6ae9b113347b56e736eeab03fc504490e51e36ed8fd4ca483f
3
- size 78844165
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be9efd61012bf13cedff400fcbb8a2a70e1d6dfb96b4b9c96b0f2082b4421403
3
+ size 19991557
checkpoint-300/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6ce95605ffdbe9bae7671ddac045840f537281836f2e0bab431da4fb2c0daa6
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d58a2d49c8bd88ae73de10c23b44dda16a3841706f663470cc7b0116797d3b2
3
  size 14575
checkpoint-300/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9850941e7721c4485c4be6cdc71482f604e4a686e50a540c13dcd9ef580d226b
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:503d141bd03d55a69c94b428e328d3b060b988ace448865bb4b7d17216ac0c68
3
  size 627
checkpoint-300/trainer_state.json CHANGED
@@ -1,7 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.09273570324574962,
 
5
  "global_step": 300,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
@@ -9,1808 +10,1810 @@
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
- "learning_rate": 3.333333333333333e-07,
13
- "loss": 1.1201,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
- "learning_rate": 6.666666666666666e-07,
19
- "loss": 1.0439,
20
  "step": 2
21
  },
22
  {
23
  "epoch": 0.0,
24
- "learning_rate": 1e-06,
25
- "loss": 1.0928,
26
  "step": 3
27
  },
28
  {
29
  "epoch": 0.0,
30
- "learning_rate": 1.3333333333333332e-06,
31
- "loss": 1.2591,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 0.0,
36
- "learning_rate": 1.6666666666666669e-06,
37
- "loss": 1.2586,
38
  "step": 5
39
  },
40
  {
41
  "epoch": 0.0,
42
- "learning_rate": 2e-06,
43
- "loss": 1.1372,
44
  "step": 6
45
  },
46
  {
47
  "epoch": 0.0,
48
- "learning_rate": 2.3333333333333336e-06,
49
- "loss": 1.2434,
50
  "step": 7
51
  },
52
  {
53
  "epoch": 0.0,
54
- "learning_rate": 2.6666666666666664e-06,
55
- "loss": 1.2124,
56
  "step": 8
57
  },
58
  {
59
- "epoch": 0.0,
60
- "learning_rate": 3e-06,
61
- "loss": 1.2344,
62
  "step": 9
63
  },
64
  {
65
- "epoch": 0.0,
66
- "learning_rate": 2.9999125880491853e-06,
67
- "loss": 1.2335,
68
  "step": 10
69
  },
70
  {
71
- "epoch": 0.0,
72
- "learning_rate": 2.9996503623845394e-06,
73
- "loss": 1.3104,
74
  "step": 11
75
  },
76
  {
77
- "epoch": 0.0,
78
- "learning_rate": 2.9992133535682728e-06,
79
- "loss": 1.3061,
80
  "step": 12
81
  },
82
  {
83
- "epoch": 0.0,
84
- "learning_rate": 2.998601612533441e-06,
85
- "loss": 1.2788,
86
  "step": 13
87
  },
88
  {
89
- "epoch": 0.0,
90
- "learning_rate": 2.9978152105780155e-06,
91
- "loss": 1.4151,
92
  "step": 14
93
  },
94
  {
95
- "epoch": 0.0,
96
- "learning_rate": 2.9968542393565676e-06,
97
- "loss": 1.4191,
98
  "step": 15
99
  },
100
  {
101
- "epoch": 0.0,
102
- "learning_rate": 2.9957188108695897e-06,
103
- "loss": 1.3953,
104
  "step": 16
105
  },
106
  {
107
  "epoch": 0.01,
108
- "learning_rate": 2.99440905745044e-06,
109
- "loss": 1.3752,
110
  "step": 17
111
  },
112
  {
113
  "epoch": 0.01,
114
- "learning_rate": 2.992925131749921e-06,
115
- "loss": 1.4986,
116
  "step": 18
117
  },
118
  {
119
  "epoch": 0.01,
120
- "learning_rate": 2.9912672067184863e-06,
121
- "loss": 1.5255,
122
  "step": 19
123
  },
124
  {
125
  "epoch": 0.01,
126
- "learning_rate": 2.9894354755860847e-06,
127
- "loss": 1.5467,
128
  "step": 20
129
  },
130
  {
131
  "epoch": 0.01,
132
- "learning_rate": 2.9874301518396377e-06,
133
- "loss": 1.5445,
134
  "step": 21
135
  },
136
  {
137
  "epoch": 0.01,
138
- "learning_rate": 2.98525146919816e-06,
139
- "loss": 1.6161,
140
  "step": 22
141
  },
142
  {
143
  "epoch": 0.01,
144
- "learning_rate": 2.982899681585518e-06,
145
- "loss": 1.6153,
146
  "step": 23
147
  },
148
  {
149
  "epoch": 0.01,
150
- "learning_rate": 2.980375063100836e-06,
151
- "loss": 1.5874,
152
  "step": 24
153
  },
154
  {
155
- "epoch": 0.01,
156
- "learning_rate": 2.9776779079865498e-06,
157
- "loss": 1.6868,
158
  "step": 25
159
  },
160
  {
161
- "epoch": 0.01,
162
- "learning_rate": 2.9748085305941124e-06,
163
- "loss": 1.7119,
164
  "step": 26
165
  },
166
  {
167
- "epoch": 0.01,
168
- "learning_rate": 2.9717672653473587e-06,
169
- "loss": 1.7338,
170
  "step": 27
171
  },
172
  {
173
- "epoch": 0.01,
174
- "learning_rate": 2.9685544667035257e-06,
175
- "loss": 1.8348,
176
  "step": 28
177
  },
178
  {
179
- "epoch": 0.01,
180
- "learning_rate": 2.9651705091119422e-06,
181
- "loss": 1.7692,
182
  "step": 29
183
  },
184
  {
185
- "epoch": 0.01,
186
- "learning_rate": 2.9616157869703894e-06,
187
- "loss": 1.7577,
188
  "step": 30
189
  },
190
  {
191
- "epoch": 0.01,
192
- "learning_rate": 2.957890714579128e-06,
193
- "loss": 1.8723,
194
  "step": 31
195
  },
196
  {
197
- "epoch": 0.01,
198
- "learning_rate": 2.9539957260926184e-06,
199
- "loss": 1.9175,
200
  "step": 32
201
  },
202
  {
203
- "epoch": 0.01,
204
- "learning_rate": 2.949931275468917e-06,
205
- "loss": 1.9534,
206
  "step": 33
207
  },
208
  {
209
- "epoch": 0.01,
210
- "learning_rate": 2.9456978364167667e-06,
211
- "loss": 2.049,
212
  "step": 34
213
  },
214
  {
215
- "epoch": 0.01,
216
- "learning_rate": 2.9412959023403906e-06,
217
- "loss": 2.0493,
218
  "step": 35
219
  },
220
  {
221
- "epoch": 0.01,
222
- "learning_rate": 2.9367259862819805e-06,
223
- "loss": 2.1604,
224
  "step": 36
225
  },
226
  {
227
- "epoch": 0.01,
228
- "learning_rate": 2.931988620861908e-06,
229
- "loss": 2.2148,
230
  "step": 37
231
  },
232
  {
233
- "epoch": 0.01,
234
- "learning_rate": 2.9270843582166428e-06,
235
- "loss": 2.2224,
236
  "step": 38
237
  },
238
  {
239
- "epoch": 0.01,
240
- "learning_rate": 2.9220137699344057e-06,
241
- "loss": 2.2689,
242
  "step": 39
243
  },
244
  {
245
- "epoch": 0.01,
246
- "learning_rate": 2.9167774469885483e-06,
247
- "loss": 2.2726,
248
  "step": 40
249
  },
250
  {
251
- "epoch": 0.01,
252
- "learning_rate": 2.911375999668675e-06,
253
- "loss": 2.345,
254
  "step": 41
255
  },
256
  {
257
- "epoch": 0.01,
258
- "learning_rate": 2.905810057509516e-06,
259
- "loss": 2.3685,
260
  "step": 42
261
  },
262
  {
263
- "epoch": 0.01,
264
- "learning_rate": 2.900080269217554e-06,
265
- "loss": 2.3902,
266
  "step": 43
267
  },
268
  {
269
- "epoch": 0.01,
270
- "learning_rate": 2.8941873025954193e-06,
271
- "loss": 2.3829,
272
  "step": 44
273
  },
274
  {
275
- "epoch": 0.01,
276
- "learning_rate": 2.8881318444640566e-06,
277
- "loss": 2.4634,
278
  "step": 45
279
  },
280
  {
281
- "epoch": 0.01,
282
- "learning_rate": 2.881914600582677e-06,
283
- "loss": 2.5071,
284
  "step": 46
285
  },
286
  {
287
- "epoch": 0.01,
288
- "learning_rate": 2.8755362955665014e-06,
289
- "loss": 2.5543,
290
  "step": 47
291
  },
292
  {
293
- "epoch": 0.01,
294
- "learning_rate": 2.8689976728023105e-06,
295
- "loss": 2.5088,
296
  "step": 48
297
  },
298
  {
299
- "epoch": 0.02,
300
- "learning_rate": 2.8622994943617984e-06,
301
- "loss": 2.5907,
302
  "step": 49
303
  },
304
  {
305
- "epoch": 0.02,
306
- "learning_rate": 2.855442540912758e-06,
307
- "loss": 2.6968,
308
  "step": 50
309
  },
310
  {
311
- "epoch": 0.02,
312
- "learning_rate": 2.848427611628093e-06,
313
- "loss": 1.1239,
314
  "step": 51
315
  },
316
  {
317
- "epoch": 0.02,
318
- "learning_rate": 2.8412555240926745e-06,
319
- "loss": 1.1649,
320
  "step": 52
321
  },
322
  {
323
- "epoch": 0.02,
324
- "learning_rate": 2.8339271142080537e-06,
325
- "loss": 1.086,
326
  "step": 53
327
  },
328
  {
329
- "epoch": 0.02,
330
- "learning_rate": 2.8264432360950353e-06,
331
- "loss": 1.1588,
332
  "step": 54
333
  },
334
  {
335
- "epoch": 0.02,
336
- "learning_rate": 2.8188047619941344e-06,
337
- "loss": 1.1331,
338
  "step": 55
339
  },
340
  {
341
- "epoch": 0.02,
342
- "learning_rate": 2.8110125821639135e-06,
343
- "loss": 1.1617,
344
  "step": 56
345
  },
346
  {
347
- "epoch": 0.02,
348
- "learning_rate": 2.803067604777227e-06,
349
- "loss": 1.1953,
350
  "step": 57
351
  },
352
  {
353
- "epoch": 0.02,
354
- "learning_rate": 2.7949707558153703e-06,
355
- "loss": 1.1326,
356
  "step": 58
357
  },
358
  {
359
- "epoch": 0.02,
360
- "learning_rate": 2.7867229789601615e-06,
361
- "loss": 1.278,
362
  "step": 59
363
  },
364
  {
365
- "epoch": 0.02,
366
- "learning_rate": 2.778325235483954e-06,
367
- "loss": 1.1767,
368
  "step": 60
369
  },
370
  {
371
- "epoch": 0.02,
372
- "learning_rate": 2.7697785041376006e-06,
373
- "loss": 1.1836,
374
  "step": 61
375
  },
376
  {
377
- "epoch": 0.02,
378
- "learning_rate": 2.7610837810363814e-06,
379
- "loss": 1.2632,
380
  "step": 62
381
  },
382
  {
383
- "epoch": 0.02,
384
- "learning_rate": 2.752242079543907e-06,
385
- "loss": 1.2792,
386
  "step": 63
387
  },
388
  {
389
- "epoch": 0.02,
390
- "learning_rate": 2.743254430154012e-06,
391
- "loss": 1.3675,
392
  "step": 64
393
  },
394
  {
395
- "epoch": 0.02,
396
- "learning_rate": 2.734121880370652e-06,
397
- "loss": 1.2054,
398
  "step": 65
399
  },
400
  {
401
- "epoch": 0.02,
402
- "learning_rate": 2.7248454945858163e-06,
403
- "loss": 1.3353,
404
  "step": 66
405
  },
406
  {
407
- "epoch": 0.02,
408
- "learning_rate": 2.7154263539554765e-06,
409
- "loss": 1.4393,
410
  "step": 67
411
  },
412
  {
413
- "epoch": 0.02,
414
- "learning_rate": 2.7058655562735753e-06,
415
- "loss": 1.4269,
416
  "step": 68
417
  },
418
  {
419
- "epoch": 0.02,
420
- "learning_rate": 2.696164215844081e-06,
421
- "loss": 1.362,
422
  "step": 69
423
  },
424
  {
425
- "epoch": 0.02,
426
- "learning_rate": 2.6863234633511186e-06,
427
- "loss": 1.4146,
428
  "step": 70
429
  },
430
  {
431
- "epoch": 0.02,
432
- "learning_rate": 2.6763444457271836e-06,
433
- "loss": 1.4821,
434
  "step": 71
435
  },
436
  {
437
- "epoch": 0.02,
438
- "learning_rate": 2.666228326019474e-06,
439
- "loss": 1.5489,
440
  "step": 72
441
  },
442
  {
443
- "epoch": 0.02,
444
- "learning_rate": 2.655976283254334e-06,
445
- "loss": 1.5923,
446
  "step": 73
447
  },
448
  {
449
- "epoch": 0.02,
450
- "learning_rate": 2.6455895122998405e-06,
451
- "loss": 1.4415,
452
  "step": 74
453
  },
454
  {
455
- "epoch": 0.02,
456
- "learning_rate": 2.6350692237265428e-06,
457
- "loss": 1.6504,
458
  "step": 75
459
  },
460
  {
461
- "epoch": 0.02,
462
- "learning_rate": 2.624416643666371e-06,
463
- "loss": 1.5234,
464
  "step": 76
465
  },
466
  {
467
- "epoch": 0.02,
468
- "learning_rate": 2.6136330136697304e-06,
469
- "loss": 1.5802,
470
  "step": 77
471
  },
472
  {
473
- "epoch": 0.02,
474
- "learning_rate": 2.602719590560801e-06,
475
- "loss": 1.6365,
476
  "step": 78
477
  },
478
  {
479
- "epoch": 0.02,
480
- "learning_rate": 2.591677646291054e-06,
481
- "loss": 1.6234,
482
  "step": 79
483
  },
484
  {
485
- "epoch": 0.02,
486
- "learning_rate": 2.58050846779101e-06,
487
- "loss": 1.7628,
488
  "step": 80
489
  },
490
  {
491
- "epoch": 0.03,
492
- "learning_rate": 2.569213356820244e-06,
493
- "loss": 1.9083,
494
  "step": 81
495
  },
496
  {
497
- "epoch": 0.03,
498
- "learning_rate": 2.557793629815669e-06,
499
- "loss": 1.859,
500
  "step": 82
501
  },
502
  {
503
- "epoch": 0.03,
504
- "learning_rate": 2.5462506177381045e-06,
505
- "loss": 2.0742,
506
  "step": 83
507
  },
508
  {
509
- "epoch": 0.03,
510
- "learning_rate": 2.5345856659171565e-06,
511
- "loss": 1.9934,
512
  "step": 84
513
  },
514
  {
515
- "epoch": 0.03,
516
- "learning_rate": 2.522800133894418e-06,
517
- "loss": 2.0762,
518
  "step": 85
519
  },
520
  {
521
- "epoch": 0.03,
522
- "learning_rate": 2.510895395265016e-06,
523
- "loss": 2.1801,
524
  "step": 86
525
  },
526
  {
527
- "epoch": 0.03,
528
- "learning_rate": 2.498872837517522e-06,
529
- "loss": 2.1072,
530
  "step": 87
531
  },
532
  {
533
- "epoch": 0.03,
534
- "learning_rate": 2.486733861872236e-06,
535
- "loss": 2.2246,
536
  "step": 88
537
  },
538
  {
539
- "epoch": 0.03,
540
- "learning_rate": 2.4744798831178817e-06,
541
- "loss": 2.2589,
542
  "step": 89
543
  },
544
  {
545
- "epoch": 0.03,
546
- "learning_rate": 2.4621123294467098e-06,
547
- "loss": 2.2699,
548
  "step": 90
549
  },
550
  {
551
- "epoch": 0.03,
552
- "learning_rate": 2.449632642288045e-06,
553
- "loss": 2.3417,
554
  "step": 91
555
  },
556
  {
557
- "epoch": 0.03,
558
- "learning_rate": 2.437042276140287e-06,
559
- "loss": 2.2745,
560
  "step": 92
561
  },
562
  {
563
- "epoch": 0.03,
564
- "learning_rate": 2.424342698401391e-06,
565
- "loss": 2.3438,
566
  "step": 93
567
  },
568
  {
569
- "epoch": 0.03,
570
- "learning_rate": 2.4115353891978432e-06,
571
- "loss": 2.324,
572
  "step": 94
573
  },
574
  {
575
- "epoch": 0.03,
576
- "learning_rate": 2.398621841212154e-06,
577
- "loss": 2.3608,
578
  "step": 95
579
  },
580
  {
581
- "epoch": 0.03,
582
- "learning_rate": 2.3856035595088842e-06,
583
- "loss": 2.4066,
584
  "step": 96
585
  },
586
  {
587
- "epoch": 0.03,
588
- "learning_rate": 2.372482061359234e-06,
589
- "loss": 2.4345,
590
  "step": 97
591
  },
592
  {
593
- "epoch": 0.03,
594
- "learning_rate": 2.3592588760642046e-06,
595
- "loss": 2.4411,
596
  "step": 98
597
  },
598
  {
599
- "epoch": 0.03,
600
- "learning_rate": 2.34593554477636e-06,
601
- "loss": 2.4336,
602
  "step": 99
603
  },
604
  {
605
- "epoch": 0.03,
606
- "learning_rate": 2.332513620320205e-06,
607
- "loss": 2.5427,
608
  "step": 100
609
  },
610
  {
611
- "epoch": 0.03,
612
- "learning_rate": 2.318994667011207e-06,
613
- "loss": 1.0668,
614
  "step": 101
615
  },
616
  {
617
- "epoch": 0.03,
618
- "learning_rate": 2.305380260473476e-06,
619
- "loss": 1.1508,
620
  "step": 102
621
  },
622
  {
623
- "epoch": 0.03,
624
- "learning_rate": 2.2916719874561227e-06,
625
- "loss": 1.1392,
626
  "step": 103
627
  },
628
  {
629
- "epoch": 0.03,
630
- "learning_rate": 2.277871445648332e-06,
631
- "loss": 1.1332,
632
  "step": 104
633
  },
634
  {
635
- "epoch": 0.03,
636
- "learning_rate": 2.2639802434931445e-06,
637
- "loss": 1.1816,
638
  "step": 105
639
  },
640
  {
641
- "epoch": 0.03,
642
- "learning_rate": 2.25e-06,
643
- "loss": 1.1044,
644
  "step": 106
645
  },
646
  {
647
- "epoch": 0.03,
648
- "learning_rate": 2.2359323445560408e-06,
649
- "loss": 1.0874,
650
  "step": 107
651
  },
652
  {
653
- "epoch": 0.03,
654
- "learning_rate": 2.221778916736208e-06,
655
- "loss": 1.1705,
656
  "step": 108
657
  },
658
  {
659
- "epoch": 0.03,
660
- "learning_rate": 2.2075413661121492e-06,
661
- "loss": 1.1666,
662
  "step": 109
663
  },
664
  {
665
- "epoch": 0.03,
666
- "learning_rate": 2.1932213520599652e-06,
667
- "loss": 1.2341,
668
  "step": 110
669
  },
670
  {
671
- "epoch": 0.03,
672
- "learning_rate": 2.1788205435668085e-06,
673
- "loss": 1.267,
674
  "step": 111
675
  },
676
  {
677
- "epoch": 0.03,
678
- "learning_rate": 2.1643406190363625e-06,
679
- "loss": 1.293,
680
  "step": 112
681
  },
682
  {
683
- "epoch": 0.03,
684
- "learning_rate": 2.1497832660932298e-06,
685
- "loss": 1.3119,
686
  "step": 113
687
  },
688
  {
689
- "epoch": 0.04,
690
- "learning_rate": 2.135150181386236e-06,
691
- "loss": 1.3284,
692
  "step": 114
693
  },
694
  {
695
- "epoch": 0.04,
696
- "learning_rate": 2.1204430703906874e-06,
697
- "loss": 1.2481,
698
  "step": 115
699
  },
700
  {
701
- "epoch": 0.04,
702
- "learning_rate": 2.1056636472096025e-06,
703
- "loss": 1.2963,
704
  "step": 116
705
  },
706
  {
707
- "epoch": 0.04,
708
- "learning_rate": 2.090813634373931e-06,
709
- "loss": 1.2788,
710
  "step": 117
711
  },
712
  {
713
- "epoch": 0.04,
714
- "learning_rate": 2.0758947626417945e-06,
715
- "loss": 1.4015,
716
  "step": 118
717
  },
718
  {
719
- "epoch": 0.04,
720
- "learning_rate": 2.060908770796769e-06,
721
- "loss": 1.4401,
722
  "step": 119
723
  },
724
  {
725
- "epoch": 0.04,
726
- "learning_rate": 2.0458574054452316e-06,
727
- "loss": 1.5218,
728
  "step": 120
729
  },
730
  {
731
- "epoch": 0.04,
732
- "learning_rate": 2.0307424208127912e-06,
733
- "loss": 1.547,
734
  "step": 121
735
  },
736
  {
737
- "epoch": 0.04,
738
- "learning_rate": 2.0155655785398396e-06,
739
- "loss": 1.5848,
740
  "step": 122
741
  },
742
  {
743
- "epoch": 0.04,
744
- "learning_rate": 2.000328647476231e-06,
745
- "loss": 1.5504,
746
  "step": 123
747
  },
748
  {
749
- "epoch": 0.04,
750
- "learning_rate": 1.985033403475123e-06,
751
- "loss": 1.5956,
752
  "step": 124
753
  },
754
  {
755
- "epoch": 0.04,
756
- "learning_rate": 1.969681629186004e-06,
757
- "loss": 1.5685,
758
  "step": 125
759
  },
760
  {
761
- "epoch": 0.04,
762
- "learning_rate": 1.954275113846926e-06,
763
- "loss": 1.6568,
764
  "step": 126
765
  },
766
  {
767
- "epoch": 0.04,
768
- "learning_rate": 1.9388156530759715e-06,
769
- "loss": 1.7607,
770
  "step": 127
771
  },
772
  {
773
- "epoch": 0.04,
774
- "learning_rate": 1.9233050486619715e-06,
775
- "loss": 1.7214,
776
  "step": 128
777
  },
778
  {
779
- "epoch": 0.04,
780
- "learning_rate": 1.9077451083545143e-06,
781
- "loss": 1.6867,
782
  "step": 129
783
  },
784
  {
785
- "epoch": 0.04,
786
- "learning_rate": 1.8921376456532485e-06,
787
- "loss": 1.7445,
788
  "step": 130
789
  },
790
  {
791
- "epoch": 0.04,
792
- "learning_rate": 1.8764844795965232e-06,
793
- "loss": 1.8066,
794
  "step": 131
795
  },
796
  {
797
- "epoch": 0.04,
798
- "learning_rate": 1.8607874345493807e-06,
799
- "loss": 1.881,
800
  "step": 132
801
  },
802
  {
803
- "epoch": 0.04,
804
- "learning_rate": 1.8450483399909265e-06,
805
- "loss": 1.9269,
806
  "step": 133
807
  },
808
  {
809
- "epoch": 0.04,
810
- "learning_rate": 1.8292690303011076e-06,
811
- "loss": 2.0067,
812
  "step": 134
813
  },
814
  {
815
- "epoch": 0.04,
816
- "learning_rate": 1.813451344546913e-06,
817
- "loss": 2.0112,
818
  "step": 135
819
  },
820
  {
821
- "epoch": 0.04,
822
- "learning_rate": 1.7975971262680348e-06,
823
- "loss": 2.0712,
824
  "step": 136
825
  },
826
  {
827
- "epoch": 0.04,
828
- "learning_rate": 1.7817082232620054e-06,
829
- "loss": 2.0998,
830
  "step": 137
831
  },
832
  {
833
- "epoch": 0.04,
834
- "learning_rate": 1.7657864873688345e-06,
835
- "loss": 2.0603,
836
  "step": 138
837
  },
838
  {
839
- "epoch": 0.04,
840
- "learning_rate": 1.7498337742551817e-06,
841
- "loss": 2.1719,
842
  "step": 139
843
  },
844
  {
845
- "epoch": 0.04,
846
- "learning_rate": 1.7338519431980798e-06,
847
- "loss": 2.103,
848
  "step": 140
849
  },
850
  {
851
- "epoch": 0.04,
852
- "learning_rate": 1.7178428568682356e-06,
853
- "loss": 2.233,
854
  "step": 141
855
  },
856
  {
857
- "epoch": 0.04,
858
- "learning_rate": 1.701808381112938e-06,
859
- "loss": 2.2755,
860
  "step": 142
861
  },
862
  {
863
- "epoch": 0.04,
864
- "learning_rate": 1.6857503847385956e-06,
865
- "loss": 2.2513,
866
  "step": 143
867
  },
868
  {
869
- "epoch": 0.04,
870
- "learning_rate": 1.6696707392929268e-06,
871
- "loss": 2.2176,
872
  "step": 144
873
  },
874
  {
875
- "epoch": 0.04,
876
- "learning_rate": 1.653571318846834e-06,
877
- "loss": 2.2541,
878
  "step": 145
879
  },
880
  {
881
- "epoch": 0.05,
882
- "learning_rate": 1.6374539997759822e-06,
883
- "loss": 2.2916,
884
  "step": 146
885
  },
886
  {
887
- "epoch": 0.05,
888
- "learning_rate": 1.6213206605421064e-06,
889
- "loss": 2.3988,
890
  "step": 147
891
  },
892
  {
893
- "epoch": 0.05,
894
- "learning_rate": 1.605173181474081e-06,
895
- "loss": 2.3258,
896
  "step": 148
897
  },
898
  {
899
- "epoch": 0.05,
900
- "learning_rate": 1.5890134445487679e-06,
901
- "loss": 2.389,
902
  "step": 149
903
  },
904
  {
905
- "epoch": 0.05,
906
- "learning_rate": 1.5728433331716726e-06,
907
- "loss": 2.5375,
908
  "step": 150
909
  },
910
  {
911
- "epoch": 0.05,
912
- "learning_rate": 1.5566647319574351e-06,
913
- "loss": 1.0571,
914
  "step": 151
915
  },
916
  {
917
- "epoch": 0.05,
918
- "learning_rate": 1.5404795265101808e-06,
919
- "loss": 1.0796,
920
  "step": 152
921
  },
922
  {
923
- "epoch": 0.05,
924
- "learning_rate": 1.5242896032037523e-06,
925
- "loss": 1.0492,
926
  "step": 153
927
  },
928
  {
929
- "epoch": 0.05,
930
- "learning_rate": 1.5080968489618567e-06,
931
- "loss": 1.0444,
932
  "step": 154
933
  },
934
  {
935
- "epoch": 0.05,
936
- "learning_rate": 1.4919031510381438e-06,
937
- "loss": 1.0879,
938
  "step": 155
939
  },
940
  {
941
- "epoch": 0.05,
942
- "learning_rate": 1.4757103967962477e-06,
943
- "loss": 1.1583,
944
  "step": 156
945
  },
946
  {
947
- "epoch": 0.05,
948
- "learning_rate": 1.4595204734898199e-06,
949
- "loss": 1.2424,
950
  "step": 157
951
  },
952
  {
953
- "epoch": 0.05,
954
- "learning_rate": 1.4433352680425654e-06,
955
- "loss": 1.213,
956
  "step": 158
957
  },
958
  {
959
- "epoch": 0.05,
960
- "learning_rate": 1.4271566668283281e-06,
961
- "loss": 1.1966,
962
  "step": 159
963
  },
964
  {
965
- "epoch": 0.05,
966
- "learning_rate": 1.410986555451232e-06,
967
- "loss": 1.2829,
968
  "step": 160
969
  },
970
  {
971
- "epoch": 0.05,
972
- "learning_rate": 1.3948268185259188e-06,
973
- "loss": 1.235,
974
  "step": 161
975
  },
976
  {
977
- "epoch": 0.05,
978
- "learning_rate": 1.3786793394578939e-06,
979
- "loss": 1.2561,
980
  "step": 162
981
  },
982
  {
983
- "epoch": 0.05,
984
- "learning_rate": 1.362546000224018e-06,
985
- "loss": 1.2367,
986
  "step": 163
987
  },
988
  {
989
- "epoch": 0.05,
990
- "learning_rate": 1.3464286811531663e-06,
991
- "loss": 1.2332,
992
  "step": 164
993
  },
994
  {
995
- "epoch": 0.05,
996
- "learning_rate": 1.3303292607070737e-06,
997
- "loss": 1.3269,
998
  "step": 165
999
  },
1000
  {
1001
- "epoch": 0.05,
1002
- "learning_rate": 1.314249615261405e-06,
1003
- "loss": 1.3558,
1004
  "step": 166
1005
  },
1006
  {
1007
- "epoch": 0.05,
1008
- "learning_rate": 1.2981916188870622e-06,
1009
- "loss": 1.349,
1010
  "step": 167
1011
  },
1012
  {
1013
- "epoch": 0.05,
1014
- "learning_rate": 1.282157143131765e-06,
1015
- "loss": 1.328,
1016
  "step": 168
1017
  },
1018
  {
1019
- "epoch": 0.05,
1020
- "learning_rate": 1.2661480568019203e-06,
1021
- "loss": 1.4605,
1022
  "step": 169
1023
  },
1024
  {
1025
- "epoch": 0.05,
1026
- "learning_rate": 1.2501662257448184e-06,
1027
- "loss": 1.4983,
1028
  "step": 170
1029
  },
1030
  {
1031
- "epoch": 0.05,
1032
- "learning_rate": 1.234213512631166e-06,
1033
- "loss": 1.4655,
1034
  "step": 171
1035
  },
1036
  {
1037
- "epoch": 0.05,
1038
- "learning_rate": 1.218291776737995e-06,
1039
- "loss": 1.5223,
1040
  "step": 172
1041
  },
1042
  {
1043
- "epoch": 0.05,
1044
- "learning_rate": 1.2024028737319653e-06,
1045
- "loss": 1.5357,
1046
  "step": 173
1047
  },
1048
  {
1049
- "epoch": 0.05,
1050
- "learning_rate": 1.1865486554530874e-06,
1051
- "loss": 1.5622,
1052
  "step": 174
1053
  },
1054
  {
1055
- "epoch": 0.05,
1056
- "learning_rate": 1.170730969698893e-06,
1057
- "loss": 1.613,
1058
  "step": 175
1059
  },
1060
  {
1061
- "epoch": 0.05,
1062
- "learning_rate": 1.154951660009074e-06,
1063
- "loss": 1.5815,
1064
  "step": 176
1065
  },
1066
  {
1067
- "epoch": 0.05,
1068
- "learning_rate": 1.13921256545062e-06,
1069
- "loss": 1.585,
1070
  "step": 177
1071
  },
1072
  {
1073
- "epoch": 0.06,
1074
- "learning_rate": 1.1235155204034768e-06,
1075
- "loss": 1.7356,
1076
  "step": 178
1077
  },
1078
  {
1079
- "epoch": 0.06,
1080
- "learning_rate": 1.1078623543467518e-06,
1081
- "loss": 1.642,
1082
  "step": 179
1083
  },
1084
  {
1085
- "epoch": 0.06,
1086
- "learning_rate": 1.0922548916454855e-06,
1087
- "loss": 1.7566,
1088
  "step": 180
1089
  },
1090
  {
1091
- "epoch": 0.06,
1092
- "learning_rate": 1.0766949513380286e-06,
1093
- "loss": 1.7018,
1094
  "step": 181
1095
  },
1096
  {
1097
- "epoch": 0.06,
1098
- "learning_rate": 1.061184346924029e-06,
1099
- "loss": 1.8531,
1100
  "step": 182
1101
  },
1102
  {
1103
- "epoch": 0.06,
1104
- "learning_rate": 1.0457248861530742e-06,
1105
- "loss": 1.8846,
1106
  "step": 183
1107
  },
1108
  {
1109
- "epoch": 0.06,
1110
- "learning_rate": 1.0303183708139966e-06,
1111
- "loss": 1.8961,
1112
  "step": 184
1113
  },
1114
  {
1115
- "epoch": 0.06,
1116
- "learning_rate": 1.0149665965248775e-06,
1117
- "loss": 1.8999,
1118
  "step": 185
1119
  },
1120
  {
1121
- "epoch": 0.06,
1122
- "learning_rate": 9.996713525237694e-07,
1123
- "loss": 1.9636,
1124
  "step": 186
1125
  },
1126
  {
1127
- "epoch": 0.06,
1128
- "learning_rate": 9.8443442146016e-07,
1129
- "loss": 2.0039,
1130
  "step": 187
1131
  },
1132
  {
1133
- "epoch": 0.06,
1134
- "learning_rate": 9.69257579187209e-07,
1135
- "loss": 2.0982,
1136
  "step": 188
1137
  },
1138
  {
1139
- "epoch": 0.06,
1140
- "learning_rate": 9.54142594554769e-07,
1141
- "loss": 2.0402,
1142
  "step": 189
1143
  },
1144
  {
1145
- "epoch": 0.06,
1146
- "learning_rate": 9.39091229203231e-07,
1147
- "loss": 2.1466,
1148
  "step": 190
1149
  },
1150
  {
1151
- "epoch": 0.06,
1152
- "learning_rate": 9.241052373582058e-07,
1153
- "loss": 2.1888,
1154
  "step": 191
1155
  },
1156
  {
1157
- "epoch": 0.06,
1158
- "learning_rate": 9.091863656260696e-07,
1159
- "loss": 2.1426,
1160
  "step": 192
1161
  },
1162
  {
1163
- "epoch": 0.06,
1164
- "learning_rate": 8.943363527903977e-07,
1165
- "loss": 2.1605,
1166
  "step": 193
1167
  },
1168
  {
1169
- "epoch": 0.06,
1170
- "learning_rate": 8.795569296093133e-07,
1171
- "loss": 2.2008,
1172
  "step": 194
1173
  },
1174
  {
1175
- "epoch": 0.06,
1176
- "learning_rate": 8.648498186137653e-07,
1177
- "loss": 2.2754,
1178
  "step": 195
1179
  },
1180
  {
1181
- "epoch": 0.06,
1182
- "learning_rate": 8.502167339067705e-07,
1183
- "loss": 2.2318,
1184
  "step": 196
1185
  },
1186
  {
1187
- "epoch": 0.06,
1188
- "learning_rate": 8.356593809636371e-07,
1189
- "loss": 2.3144,
1190
  "step": 197
1191
  },
1192
  {
1193
- "epoch": 0.06,
1194
- "learning_rate": 8.211794564331918e-07,
1195
- "loss": 2.3606,
1196
  "step": 198
1197
  },
1198
  {
1199
- "epoch": 0.06,
1200
- "learning_rate": 8.067786479400346e-07,
1201
- "loss": 2.3714,
1202
  "step": 199
1203
  },
1204
  {
1205
- "epoch": 0.06,
1206
- "learning_rate": 7.924586338878512e-07,
1207
- "loss": 2.3866,
1208
  "step": 200
1209
  },
1210
  {
1211
- "epoch": 0.06,
1212
- "learning_rate": 7.782210832637924e-07,
1213
- "loss": 1.0666,
1214
  "step": 201
1215
  },
1216
  {
1217
- "epoch": 0.06,
1218
- "learning_rate": 7.640676554439594e-07,
1219
- "loss": 1.1337,
1220
  "step": 202
1221
  },
1222
  {
1223
- "epoch": 0.06,
1224
- "learning_rate": 7.500000000000003e-07,
1225
- "loss": 1.0069,
1226
  "step": 203
1227
  },
1228
  {
1229
- "epoch": 0.06,
1230
- "learning_rate": 7.360197565068561e-07,
1231
- "loss": 1.0938,
1232
  "step": 204
1233
  },
1234
  {
1235
- "epoch": 0.06,
1236
- "learning_rate": 7.22128554351668e-07,
1237
- "loss": 1.1479,
1238
  "step": 205
1239
  },
1240
  {
1241
- "epoch": 0.06,
1242
- "learning_rate": 7.083280125438766e-07,
1243
- "loss": 1.136,
1244
  "step": 206
1245
  },
1246
  {
1247
- "epoch": 0.06,
1248
- "learning_rate": 6.946197395265243e-07,
1249
- "loss": 1.1862,
1250
  "step": 207
1251
  },
1252
  {
1253
- "epoch": 0.06,
1254
- "learning_rate": 6.810053329887929e-07,
1255
- "loss": 1.1319,
1256
  "step": 208
1257
  },
1258
  {
1259
- "epoch": 0.06,
1260
- "learning_rate": 6.674863796797954e-07,
1261
- "loss": 1.1757,
1262
  "step": 209
1263
  },
1264
  {
1265
- "epoch": 0.06,
1266
- "learning_rate": 6.540644552236401e-07,
1267
- "loss": 1.262,
1268
  "step": 210
1269
  },
1270
  {
1271
- "epoch": 0.07,
1272
- "learning_rate": 6.407411239357954e-07,
1273
- "loss": 1.222,
1274
  "step": 211
1275
  },
1276
  {
1277
- "epoch": 0.07,
1278
- "learning_rate": 6.275179386407663e-07,
1279
- "loss": 1.2991,
1280
  "step": 212
1281
  },
1282
  {
1283
- "epoch": 0.07,
1284
- "learning_rate": 6.143964404911165e-07,
1285
- "loss": 1.2677,
1286
  "step": 213
1287
  },
1288
  {
1289
- "epoch": 0.07,
1290
- "learning_rate": 6.013781587878464e-07,
1291
- "loss": 1.2905,
1292
  "step": 214
1293
  },
1294
  {
1295
- "epoch": 0.07,
1296
- "learning_rate": 5.884646108021563e-07,
1297
- "loss": 1.1892,
1298
  "step": 215
1299
  },
1300
  {
1301
- "epoch": 0.07,
1302
- "learning_rate": 5.756573015986089e-07,
1303
- "loss": 1.2595,
1304
  "step": 216
1305
  },
1306
  {
1307
- "epoch": 0.07,
1308
- "learning_rate": 5.629577238597132e-07,
1309
- "loss": 1.267,
1310
  "step": 217
1311
  },
1312
  {
1313
- "epoch": 0.07,
1314
- "learning_rate": 5.503673577119552e-07,
1315
- "loss": 1.3492,
1316
  "step": 218
1317
  },
1318
  {
1319
- "epoch": 0.07,
1320
- "learning_rate": 5.378876705532904e-07,
1321
- "loss": 1.357,
1322
  "step": 219
1323
  },
1324
  {
1325
- "epoch": 0.07,
1326
- "learning_rate": 5.255201168821183e-07,
1327
- "loss": 1.4069,
1328
  "step": 220
1329
  },
1330
  {
1331
- "epoch": 0.07,
1332
- "learning_rate": 5.132661381277644e-07,
1333
- "loss": 1.3945,
1334
  "step": 221
1335
  },
1336
  {
1337
- "epoch": 0.07,
1338
- "learning_rate": 5.011271624824787e-07,
1339
- "loss": 1.4184,
1340
  "step": 222
1341
  },
1342
  {
1343
- "epoch": 0.07,
1344
- "learning_rate": 4.891046047349837e-07,
1345
- "loss": 1.51,
1346
  "step": 223
1347
  },
1348
  {
1349
- "epoch": 0.07,
1350
- "learning_rate": 4.771998661055823e-07,
1351
- "loss": 1.6056,
1352
  "step": 224
1353
  },
1354
  {
1355
- "epoch": 0.07,
1356
- "learning_rate": 4.6541433408284356e-07,
1357
- "loss": 1.5254,
1358
  "step": 225
1359
  },
1360
  {
1361
- "epoch": 0.07,
1362
- "learning_rate": 4.5374938226189584e-07,
1363
- "loss": 1.4541,
1364
  "step": 226
1365
  },
1366
  {
1367
- "epoch": 0.07,
1368
- "learning_rate": 4.4220637018433163e-07,
1369
- "loss": 1.5449,
1370
  "step": 227
1371
  },
1372
  {
1373
- "epoch": 0.07,
1374
- "learning_rate": 4.3078664317975654e-07,
1375
- "loss": 1.6804,
1376
  "step": 228
1377
  },
1378
  {
1379
- "epoch": 0.07,
1380
- "learning_rate": 4.1949153220898987e-07,
1381
- "loss": 1.6257,
1382
  "step": 229
1383
  },
1384
  {
1385
- "epoch": 0.07,
1386
- "learning_rate": 4.0832235370894604e-07,
1387
- "loss": 1.656,
1388
  "step": 230
1389
  },
1390
  {
1391
- "epoch": 0.07,
1392
- "learning_rate": 3.972804094391998e-07,
1393
- "loss": 1.8701,
1394
  "step": 231
1395
  },
1396
  {
1397
- "epoch": 0.07,
1398
- "learning_rate": 3.863669863302698e-07,
1399
- "loss": 1.7619,
1400
  "step": 232
1401
  },
1402
  {
1403
- "epoch": 0.07,
1404
- "learning_rate": 3.755833563336293e-07,
1405
- "loss": 1.8791,
1406
  "step": 233
1407
  },
1408
  {
1409
- "epoch": 0.07,
1410
- "learning_rate": 3.64930776273457e-07,
1411
- "loss": 1.9227,
1412
  "step": 234
1413
  },
1414
  {
1415
- "epoch": 0.07,
1416
- "learning_rate": 3.544104877001596e-07,
1417
- "loss": 1.9138,
1418
  "step": 235
1419
  },
1420
  {
1421
- "epoch": 0.07,
1422
- "learning_rate": 3.440237167456663e-07,
1423
- "loss": 2.0062,
1424
  "step": 236
1425
  },
1426
  {
1427
- "epoch": 0.07,
1428
- "learning_rate": 3.337716739805264e-07,
1429
- "loss": 2.0386,
1430
  "step": 237
1431
  },
1432
  {
1433
- "epoch": 0.07,
1434
- "learning_rate": 3.2365555427281634e-07,
1435
- "loss": 2.0127,
1436
  "step": 238
1437
  },
1438
  {
1439
- "epoch": 0.07,
1440
- "learning_rate": 3.1367653664888173e-07,
1441
- "loss": 2.0965,
1442
  "step": 239
1443
  },
1444
  {
1445
- "epoch": 0.07,
1446
- "learning_rate": 3.0383578415591913e-07,
1447
- "loss": 2.1195,
1448
  "step": 240
1449
  },
1450
  {
1451
- "epoch": 0.07,
1452
- "learning_rate": 2.9413444372642496e-07,
1453
- "loss": 2.097,
1454
  "step": 241
1455
  },
1456
  {
1457
- "epoch": 0.07,
1458
- "learning_rate": 2.8457364604452376e-07,
1459
- "loss": 2.2226,
1460
  "step": 242
1461
  },
1462
  {
1463
- "epoch": 0.08,
1464
- "learning_rate": 2.751545054141834e-07,
1465
- "loss": 2.1617,
1466
  "step": 243
1467
  },
1468
  {
1469
- "epoch": 0.08,
1470
- "learning_rate": 2.6587811962934823e-07,
1471
- "loss": 2.1102,
1472
  "step": 244
1473
  },
1474
  {
1475
- "epoch": 0.08,
1476
- "learning_rate": 2.567455698459882e-07,
1477
- "loss": 2.1771,
1478
  "step": 245
1479
  },
1480
  {
1481
- "epoch": 0.08,
1482
- "learning_rate": 2.4775792045609353e-07,
1483
- "loss": 2.1973,
1484
  "step": 246
1485
  },
1486
  {
1487
- "epoch": 0.08,
1488
- "learning_rate": 2.389162189636188e-07,
1489
- "loss": 2.2832,
1490
  "step": 247
1491
  },
1492
  {
1493
- "epoch": 0.08,
1494
- "learning_rate": 2.3022149586239972e-07,
1495
- "loss": 2.1945,
1496
  "step": 248
1497
  },
1498
  {
1499
- "epoch": 0.08,
1500
- "learning_rate": 2.2167476451604624e-07,
1501
- "loss": 2.3652,
1502
  "step": 249
1503
  },
1504
  {
1505
- "epoch": 0.08,
1506
- "learning_rate": 2.1327702103983864e-07,
1507
- "loss": 2.4325,
1508
  "step": 250
1509
  },
1510
  {
1511
- "epoch": 0.08,
1512
- "learning_rate": 2.0502924418463014e-07,
1513
- "loss": 1.069,
1514
  "step": 251
1515
  },
1516
  {
1517
- "epoch": 0.08,
1518
- "learning_rate": 1.9693239522277327e-07,
1519
- "loss": 1.1529,
1520
  "step": 252
1521
  },
1522
  {
1523
- "epoch": 0.08,
1524
- "learning_rate": 1.8898741783608642e-07,
1525
- "loss": 1.0939,
1526
  "step": 253
1527
  },
1528
  {
1529
- "epoch": 0.08,
1530
- "learning_rate": 1.811952380058657e-07,
1531
- "loss": 1.1747,
1532
  "step": 254
1533
  },
1534
  {
1535
- "epoch": 0.08,
1536
- "learning_rate": 1.7355676390496482e-07,
1537
- "loss": 1.1416,
1538
  "step": 255
1539
  },
1540
  {
1541
- "epoch": 0.08,
1542
- "learning_rate": 1.660728857919464e-07,
1543
- "loss": 1.1001,
1544
  "step": 256
1545
  },
1546
  {
1547
- "epoch": 0.08,
1548
- "learning_rate": 1.5874447590732537e-07,
1549
- "loss": 1.1736,
1550
  "step": 257
1551
  },
1552
  {
1553
- "epoch": 0.08,
1554
- "learning_rate": 1.5157238837190719e-07,
1555
- "loss": 1.1635,
1556
  "step": 258
1557
  },
1558
  {
1559
- "epoch": 0.08,
1560
- "learning_rate": 1.4455745908724226e-07,
1561
- "loss": 1.1446,
1562
  "step": 259
1563
  },
1564
  {
1565
- "epoch": 0.08,
1566
- "learning_rate": 1.377005056382018e-07,
1567
- "loss": 1.2487,
1568
  "step": 260
1569
  },
1570
  {
1571
- "epoch": 0.08,
1572
- "learning_rate": 1.3100232719768996e-07,
1573
- "loss": 1.2044,
1574
  "step": 261
1575
  },
1576
  {
1577
- "epoch": 0.08,
1578
- "learning_rate": 1.2446370443349863e-07,
1579
- "loss": 1.313,
1580
  "step": 262
1581
  },
1582
  {
1583
- "epoch": 0.08,
1584
- "learning_rate": 1.180853994173236e-07,
1585
- "loss": 1.2448,
1586
  "step": 263
1587
  },
1588
  {
1589
- "epoch": 0.08,
1590
- "learning_rate": 1.1186815553594382e-07,
1591
- "loss": 1.2088,
1592
  "step": 264
1593
  },
1594
  {
1595
- "epoch": 0.08,
1596
- "learning_rate": 1.058126974045811e-07,
1597
- "loss": 1.3052,
1598
  "step": 265
1599
  },
1600
  {
1601
- "epoch": 0.08,
1602
- "learning_rate": 9.991973078244638e-08,
1603
- "loss": 1.2762,
1604
  "step": 266
1605
  },
1606
  {
1607
- "epoch": 0.08,
1608
- "learning_rate": 9.418994249048474e-08,
1609
- "loss": 1.3319,
1610
  "step": 267
1611
  },
1612
  {
1613
- "epoch": 0.08,
1614
- "learning_rate": 8.862400033132573e-08,
1615
- "loss": 1.3724,
1616
  "step": 268
1617
  },
1618
  {
1619
- "epoch": 0.08,
1620
- "learning_rate": 8.322255301145204e-08,
1621
- "loss": 1.3288,
1622
  "step": 269
1623
  },
1624
  {
1625
- "epoch": 0.08,
1626
- "learning_rate": 7.798623006559436e-08,
1627
- "loss": 1.3914,
1628
  "step": 270
1629
  },
1630
  {
1631
- "epoch": 0.08,
1632
- "learning_rate": 7.291564178335719e-08,
1633
- "loss": 1.3964,
1634
  "step": 271
1635
  },
1636
  {
1637
- "epoch": 0.08,
1638
- "learning_rate": 6.801137913809214e-08,
1639
- "loss": 1.4659,
1640
  "step": 272
1641
  },
1642
  {
1643
- "epoch": 0.08,
1644
- "learning_rate": 6.327401371801944e-08,
1645
- "loss": 1.5826,
1646
  "step": 273
1647
  },
1648
  {
1649
- "epoch": 0.08,
1650
- "learning_rate": 5.870409765960966e-08,
1651
- "loss": 1.514,
1652
  "step": 274
1653
  },
1654
  {
1655
- "epoch": 0.09,
1656
- "learning_rate": 5.430216358323309e-08,
1657
- "loss": 1.569,
1658
  "step": 275
1659
  },
1660
  {
1661
- "epoch": 0.09,
1662
- "learning_rate": 5.00687245310833e-08,
1663
- "loss": 1.5725,
1664
  "step": 276
1665
  },
1666
  {
1667
- "epoch": 0.09,
1668
- "learning_rate": 4.60042739073816e-08,
1669
- "loss": 1.5446,
1670
  "step": 277
1671
  },
1672
  {
1673
- "epoch": 0.09,
1674
- "learning_rate": 4.2109285420872055e-08,
1675
- "loss": 1.6477,
1676
  "step": 278
1677
  },
1678
  {
1679
- "epoch": 0.09,
1680
- "learning_rate": 3.838421302961098e-08,
1681
- "loss": 1.784,
1682
  "step": 279
1683
  },
1684
  {
1685
- "epoch": 0.09,
1686
- "learning_rate": 3.4829490888057424e-08,
1687
- "loss": 1.6312,
1688
  "step": 280
1689
  },
1690
  {
1691
- "epoch": 0.09,
1692
- "learning_rate": 3.1445533296474484e-08,
1693
- "loss": 1.6294,
1694
  "step": 281
1695
  },
1696
  {
1697
- "epoch": 0.09,
1698
- "learning_rate": 2.8232734652641424e-08,
1699
- "loss": 1.66,
1700
  "step": 282
1701
  },
1702
  {
1703
- "epoch": 0.09,
1704
- "learning_rate": 2.5191469405887625e-08,
1705
- "loss": 1.767,
1706
  "step": 283
1707
  },
1708
  {
1709
- "epoch": 0.09,
1710
- "learning_rate": 2.2322092013450313e-08,
1711
- "loss": 1.8148,
1712
  "step": 284
1713
  },
1714
  {
1715
- "epoch": 0.09,
1716
- "learning_rate": 1.962493689916395e-08,
1717
- "loss": 1.8453,
1718
  "step": 285
1719
  },
1720
  {
1721
- "epoch": 0.09,
1722
- "learning_rate": 1.7100318414482063e-08,
1723
- "loss": 1.848,
1724
  "step": 286
1725
  },
1726
  {
1727
- "epoch": 0.09,
1728
- "learning_rate": 1.4748530801840076e-08,
1729
- "loss": 1.9895,
1730
  "step": 287
1731
  },
1732
  {
1733
- "epoch": 0.09,
1734
- "learning_rate": 1.2569848160362384e-08,
1735
- "loss": 1.9292,
1736
  "step": 288
1737
  },
1738
  {
1739
- "epoch": 0.09,
1740
- "learning_rate": 1.0564524413915422e-08,
1741
- "loss": 1.9817,
1742
  "step": 289
1743
  },
1744
  {
1745
- "epoch": 0.09,
1746
- "learning_rate": 8.732793281513663e-09,
1747
- "loss": 2.0912,
1748
  "step": 290
1749
  },
1750
  {
1751
- "epoch": 0.09,
1752
- "learning_rate": 7.074868250079081e-09,
1753
- "loss": 2.1077,
1754
  "step": 291
1755
  },
1756
  {
1757
- "epoch": 0.09,
1758
- "learning_rate": 5.590942549560052e-09,
1759
- "loss": 2.1071,
1760
  "step": 292
1761
  },
1762
  {
1763
- "epoch": 0.09,
1764
- "learning_rate": 4.2811891304105345e-09,
1765
- "loss": 2.2567,
1766
  "step": 293
1767
  },
1768
  {
1769
- "epoch": 0.09,
1770
- "learning_rate": 3.145760643432527e-09,
1771
- "loss": 2.1001,
1772
  "step": 294
1773
  },
1774
  {
1775
- "epoch": 0.09,
1776
- "learning_rate": 2.1847894219846343e-09,
1777
- "loss": 2.177,
1778
  "step": 295
1779
  },
1780
  {
1781
- "epoch": 0.09,
1782
- "learning_rate": 1.3983874665589035e-09,
1783
- "loss": 2.1568,
1784
  "step": 296
1785
  },
1786
  {
1787
- "epoch": 0.09,
1788
- "learning_rate": 7.866464317276001e-10,
1789
- "loss": 2.1978,
1790
  "step": 297
1791
  },
1792
  {
1793
- "epoch": 0.09,
1794
- "learning_rate": 3.496376154604186e-10,
1795
- "loss": 2.2076,
1796
  "step": 298
1797
  },
1798
  {
1799
- "epoch": 0.09,
1800
- "learning_rate": 8.741195081479747e-11,
1801
- "loss": 2.3157,
1802
  "step": 299
1803
  },
1804
  {
1805
- "epoch": 0.09,
1806
  "learning_rate": 0.0,
1807
- "loss": 2.4273,
1808
  "step": 300
1809
  }
1810
  ],
 
1811
  "max_steps": 300,
1812
  "num_train_epochs": 1,
1813
- "total_flos": 8.932835919101952e+16,
 
1814
  "trial_name": null,
1815
  "trial_params": null
1816
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.18547140649149924,
5
+ "eval_steps": 500,
6
  "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 3.333333333333333e-05,
14
+ "loss": 1.1619,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.0,
19
+ "learning_rate": 6.666666666666666e-05,
20
+ "loss": 1.1173,
21
  "step": 2
22
  },
23
  {
24
  "epoch": 0.0,
25
+ "learning_rate": 9.999999999999999e-05,
26
+ "loss": 1.1862,
27
  "step": 3
28
  },
29
  {
30
  "epoch": 0.0,
31
+ "learning_rate": 0.0001333333333333333,
32
+ "loss": 1.243,
33
  "step": 4
34
  },
35
  {
36
  "epoch": 0.0,
37
+ "learning_rate": 0.00016666666666666666,
38
+ "loss": 1.2164,
39
  "step": 5
40
  },
41
  {
42
  "epoch": 0.0,
43
+ "learning_rate": 0.00019999999999999998,
44
+ "loss": 1.1963,
45
  "step": 6
46
  },
47
  {
48
  "epoch": 0.0,
49
+ "learning_rate": 0.0002333333333333333,
50
+ "loss": 1.19,
51
  "step": 7
52
  },
53
  {
54
  "epoch": 0.0,
55
+ "learning_rate": 0.0002666666666666666,
56
+ "loss": 1.1195,
57
  "step": 8
58
  },
59
  {
60
+ "epoch": 0.01,
61
+ "learning_rate": 0.0003,
62
+ "loss": 1.1389,
63
  "step": 9
64
  },
65
  {
66
+ "epoch": 0.01,
67
+ "learning_rate": 0.00029999125880491846,
68
+ "loss": 1.0748,
69
  "step": 10
70
  },
71
  {
72
+ "epoch": 0.01,
73
+ "learning_rate": 0.00029996503623845393,
74
+ "loss": 1.0508,
75
  "step": 11
76
  },
77
  {
78
+ "epoch": 0.01,
79
+ "learning_rate": 0.00029992133535682725,
80
+ "loss": 0.9739,
81
  "step": 12
82
  },
83
  {
84
+ "epoch": 0.01,
85
+ "learning_rate": 0.00029986016125334406,
86
+ "loss": 0.9708,
87
  "step": 13
88
  },
89
  {
90
+ "epoch": 0.01,
91
+ "learning_rate": 0.0002997815210578015,
92
+ "loss": 0.9641,
93
  "step": 14
94
  },
95
  {
96
+ "epoch": 0.01,
97
+ "learning_rate": 0.0002996854239356567,
98
+ "loss": 0.8698,
99
  "step": 15
100
  },
101
  {
102
+ "epoch": 0.01,
103
+ "learning_rate": 0.0002995718810869589,
104
+ "loss": 0.9405,
105
  "step": 16
106
  },
107
  {
108
  "epoch": 0.01,
109
+ "learning_rate": 0.00029944090574504395,
110
+ "loss": 0.9377,
111
  "step": 17
112
  },
113
  {
114
  "epoch": 0.01,
115
+ "learning_rate": 0.0002992925131749921,
116
+ "loss": 0.8775,
117
  "step": 18
118
  },
119
  {
120
  "epoch": 0.01,
121
+ "learning_rate": 0.0002991267206718486,
122
+ "loss": 0.8682,
123
  "step": 19
124
  },
125
  {
126
  "epoch": 0.01,
127
+ "learning_rate": 0.00029894354755860845,
128
+ "loss": 0.8541,
129
  "step": 20
130
  },
131
  {
132
  "epoch": 0.01,
133
+ "learning_rate": 0.00029874301518396376,
134
+ "loss": 0.8653,
135
  "step": 21
136
  },
137
  {
138
  "epoch": 0.01,
139
+ "learning_rate": 0.000298525146919816,
140
+ "loss": 0.9036,
141
  "step": 22
142
  },
143
  {
144
  "epoch": 0.01,
145
+ "learning_rate": 0.0002982899681585518,
146
+ "loss": 0.8277,
147
  "step": 23
148
  },
149
  {
150
  "epoch": 0.01,
151
+ "learning_rate": 0.00029803750631008356,
152
+ "loss": 0.8784,
153
  "step": 24
154
  },
155
  {
156
+ "epoch": 0.02,
157
+ "learning_rate": 0.00029776779079865496,
158
+ "loss": 0.7871,
159
  "step": 25
160
  },
161
  {
162
+ "epoch": 0.02,
163
+ "learning_rate": 0.00029748085305941123,
164
+ "loss": 0.8347,
165
  "step": 26
166
  },
167
  {
168
+ "epoch": 0.02,
169
+ "learning_rate": 0.0002971767265347358,
170
+ "loss": 0.8515,
171
  "step": 27
172
  },
173
  {
174
+ "epoch": 0.02,
175
+ "learning_rate": 0.0002968554466703525,
176
+ "loss": 0.7445,
177
  "step": 28
178
  },
179
  {
180
+ "epoch": 0.02,
181
+ "learning_rate": 0.0002965170509111942,
182
+ "loss": 0.7724,
183
  "step": 29
184
  },
185
  {
186
+ "epoch": 0.02,
187
+ "learning_rate": 0.0002961615786970389,
188
+ "loss": 0.8284,
189
  "step": 30
190
  },
191
  {
192
+ "epoch": 0.02,
193
+ "learning_rate": 0.00029578907145791274,
194
+ "loss": 0.8293,
195
  "step": 31
196
  },
197
  {
198
+ "epoch": 0.02,
199
+ "learning_rate": 0.00029539957260926183,
200
+ "loss": 0.7876,
201
  "step": 32
202
  },
203
  {
204
+ "epoch": 0.02,
205
+ "learning_rate": 0.0002949931275468917,
206
+ "loss": 0.7748,
207
  "step": 33
208
  },
209
  {
210
+ "epoch": 0.02,
211
+ "learning_rate": 0.0002945697836416767,
212
+ "loss": 0.7591,
213
  "step": 34
214
  },
215
  {
216
+ "epoch": 0.02,
217
+ "learning_rate": 0.000294129590234039,
218
+ "loss": 0.73,
219
  "step": 35
220
  },
221
  {
222
+ "epoch": 0.02,
223
+ "learning_rate": 0.00029367259862819804,
224
+ "loss": 0.7324,
225
  "step": 36
226
  },
227
  {
228
+ "epoch": 0.02,
229
+ "learning_rate": 0.00029319886208619073,
230
+ "loss": 0.7659,
231
  "step": 37
232
  },
233
  {
234
+ "epoch": 0.02,
235
+ "learning_rate": 0.00029270843582166427,
236
+ "loss": 0.7069,
237
  "step": 38
238
  },
239
  {
240
+ "epoch": 0.02,
241
+ "learning_rate": 0.00029220137699344055,
242
+ "loss": 0.7087,
243
  "step": 39
244
  },
245
  {
246
+ "epoch": 0.02,
247
+ "learning_rate": 0.0002916777446988548,
248
+ "loss": 0.6674,
249
  "step": 40
250
  },
251
  {
252
+ "epoch": 0.03,
253
+ "learning_rate": 0.00029113759996686743,
254
+ "loss": 0.5521,
255
  "step": 41
256
  },
257
  {
258
+ "epoch": 0.03,
259
+ "learning_rate": 0.0002905810057509515,
260
+ "loss": 0.6268,
261
  "step": 42
262
  },
263
  {
264
+ "epoch": 0.03,
265
+ "learning_rate": 0.00029000802692175537,
266
+ "loss": 0.666,
267
  "step": 43
268
  },
269
  {
270
+ "epoch": 0.03,
271
+ "learning_rate": 0.0002894187302595419,
272
+ "loss": 0.6509,
273
  "step": 44
274
  },
275
  {
276
+ "epoch": 0.03,
277
+ "learning_rate": 0.0002888131844464056,
278
+ "loss": 0.5752,
279
  "step": 45
280
  },
281
  {
282
+ "epoch": 0.03,
283
+ "learning_rate": 0.0002881914600582676,
284
+ "loss": 0.5702,
285
  "step": 46
286
  },
287
  {
288
+ "epoch": 0.03,
289
+ "learning_rate": 0.0002875536295566501,
290
+ "loss": 0.5882,
291
  "step": 47
292
  },
293
  {
294
+ "epoch": 0.03,
295
+ "learning_rate": 0.000286899767280231,
296
+ "loss": 0.5374,
297
  "step": 48
298
  },
299
  {
300
+ "epoch": 0.03,
301
+ "learning_rate": 0.0002862299494361798,
302
+ "loss": 0.4725,
303
  "step": 49
304
  },
305
  {
306
+ "epoch": 0.03,
307
+ "learning_rate": 0.0002855442540912758,
308
+ "loss": 0.4424,
309
  "step": 50
310
  },
311
  {
312
+ "epoch": 0.03,
313
+ "learning_rate": 0.00028484276116280926,
314
+ "loss": 0.9098,
315
  "step": 51
316
  },
317
  {
318
+ "epoch": 0.03,
319
+ "learning_rate": 0.0002841255524092674,
320
+ "loss": 0.8235,
321
  "step": 52
322
  },
323
  {
324
+ "epoch": 0.03,
325
+ "learning_rate": 0.00028339271142080534,
326
+ "loss": 0.8614,
327
  "step": 53
328
  },
329
  {
330
+ "epoch": 0.03,
331
+ "learning_rate": 0.00028264432360950353,
332
+ "loss": 0.8358,
333
  "step": 54
334
  },
335
  {
336
+ "epoch": 0.03,
337
+ "learning_rate": 0.00028188047619941343,
338
+ "loss": 0.8151,
339
  "step": 55
340
  },
341
  {
342
+ "epoch": 0.03,
343
+ "learning_rate": 0.0002811012582163913,
344
+ "loss": 0.8758,
345
  "step": 56
346
  },
347
  {
348
+ "epoch": 0.04,
349
+ "learning_rate": 0.00028030676047772265,
350
+ "loss": 0.8098,
351
  "step": 57
352
  },
353
  {
354
+ "epoch": 0.04,
355
+ "learning_rate": 0.000279497075581537,
356
+ "loss": 0.8003,
357
  "step": 58
358
  },
359
  {
360
+ "epoch": 0.04,
361
+ "learning_rate": 0.0002786722978960161,
362
+ "loss": 0.7646,
363
  "step": 59
364
  },
365
  {
366
+ "epoch": 0.04,
367
+ "learning_rate": 0.0002778325235483954,
368
+ "loss": 0.7734,
369
  "step": 60
370
  },
371
  {
372
+ "epoch": 0.04,
373
+ "learning_rate": 0.00027697785041376006,
374
+ "loss": 0.7807,
375
  "step": 61
376
  },
377
  {
378
+ "epoch": 0.04,
379
+ "learning_rate": 0.0002761083781036381,
380
+ "loss": 0.8398,
381
  "step": 62
382
  },
383
  {
384
+ "epoch": 0.04,
385
+ "learning_rate": 0.00027522420795439065,
386
+ "loss": 0.7767,
387
  "step": 63
388
  },
389
  {
390
+ "epoch": 0.04,
391
+ "learning_rate": 0.0002743254430154012,
392
+ "loss": 0.8138,
393
  "step": 64
394
  },
395
  {
396
+ "epoch": 0.04,
397
+ "learning_rate": 0.0002734121880370652,
398
+ "loss": 0.7638,
399
  "step": 65
400
  },
401
  {
402
+ "epoch": 0.04,
403
+ "learning_rate": 0.0002724845494585816,
404
+ "loss": 0.8572,
405
  "step": 66
406
  },
407
  {
408
+ "epoch": 0.04,
409
+ "learning_rate": 0.0002715426353955476,
410
+ "loss": 0.8552,
411
  "step": 67
412
  },
413
  {
414
+ "epoch": 0.04,
415
+ "learning_rate": 0.0002705865556273575,
416
+ "loss": 0.7779,
417
  "step": 68
418
  },
419
  {
420
+ "epoch": 0.04,
421
+ "learning_rate": 0.0002696164215844081,
422
+ "loss": 0.8717,
423
  "step": 69
424
  },
425
  {
426
+ "epoch": 0.04,
427
+ "learning_rate": 0.00026863234633511183,
428
+ "loss": 0.8209,
429
  "step": 70
430
  },
431
  {
432
+ "epoch": 0.04,
433
+ "learning_rate": 0.00026763444457271837,
434
+ "loss": 0.8173,
435
  "step": 71
436
  },
437
  {
438
+ "epoch": 0.04,
439
+ "learning_rate": 0.0002666228326019474,
440
+ "loss": 0.8009,
441
  "step": 72
442
  },
443
  {
444
+ "epoch": 0.05,
445
+ "learning_rate": 0.00026559762832543336,
446
+ "loss": 0.8182,
447
  "step": 73
448
  },
449
  {
450
+ "epoch": 0.05,
451
+ "learning_rate": 0.000264558951229984,
452
+ "loss": 0.8317,
453
  "step": 74
454
  },
455
  {
456
+ "epoch": 0.05,
457
+ "learning_rate": 0.00026350692237265427,
458
+ "loss": 0.7987,
459
  "step": 75
460
  },
461
  {
462
+ "epoch": 0.05,
463
+ "learning_rate": 0.0002624416643666371,
464
+ "loss": 0.7929,
465
  "step": 76
466
  },
467
  {
468
+ "epoch": 0.05,
469
+ "learning_rate": 0.000261363301366973,
470
+ "loss": 0.7416,
471
  "step": 77
472
  },
473
  {
474
+ "epoch": 0.05,
475
+ "learning_rate": 0.00026027195905608006,
476
+ "loss": 0.799,
477
  "step": 78
478
  },
479
  {
480
+ "epoch": 0.05,
481
+ "learning_rate": 0.0002591677646291054,
482
+ "loss": 0.7643,
483
  "step": 79
484
  },
485
  {
486
+ "epoch": 0.05,
487
+ "learning_rate": 0.00025805084677910095,
488
+ "loss": 0.7045,
489
  "step": 80
490
  },
491
  {
492
+ "epoch": 0.05,
493
+ "learning_rate": 0.0002569213356820244,
494
+ "loss": 0.7651,
495
  "step": 81
496
  },
497
  {
498
+ "epoch": 0.05,
499
+ "learning_rate": 0.0002557793629815669,
500
+ "loss": 0.727,
501
  "step": 82
502
  },
503
  {
504
+ "epoch": 0.05,
505
+ "learning_rate": 0.00025462506177381043,
506
+ "loss": 0.7623,
507
  "step": 83
508
  },
509
  {
510
+ "epoch": 0.05,
511
+ "learning_rate": 0.00025345856659171563,
512
+ "loss": 0.7209,
513
  "step": 84
514
  },
515
  {
516
+ "epoch": 0.05,
517
+ "learning_rate": 0.00025228001338944175,
518
+ "loss": 0.7104,
519
  "step": 85
520
  },
521
  {
522
+ "epoch": 0.05,
523
+ "learning_rate": 0.0002510895395265016,
524
+ "loss": 0.6989,
525
  "step": 86
526
  },
527
  {
528
+ "epoch": 0.05,
529
+ "learning_rate": 0.00024988728375175214,
530
+ "loss": 0.7347,
531
  "step": 87
532
  },
533
  {
534
+ "epoch": 0.05,
535
+ "learning_rate": 0.00024867338618722357,
536
+ "loss": 0.6649,
537
  "step": 88
538
  },
539
  {
540
+ "epoch": 0.06,
541
+ "learning_rate": 0.0002474479883117882,
542
+ "loss": 0.6432,
543
  "step": 89
544
  },
545
  {
546
+ "epoch": 0.06,
547
+ "learning_rate": 0.00024621123294467096,
548
+ "loss": 0.6257,
549
  "step": 90
550
  },
551
  {
552
+ "epoch": 0.06,
553
+ "learning_rate": 0.0002449632642288045,
554
+ "loss": 0.5743,
555
  "step": 91
556
  },
557
  {
558
+ "epoch": 0.06,
559
+ "learning_rate": 0.00024370422761402867,
560
+ "loss": 0.6499,
561
  "step": 92
562
  },
563
  {
564
+ "epoch": 0.06,
565
+ "learning_rate": 0.0002424342698401391,
566
+ "loss": 0.5454,
567
  "step": 93
568
  },
569
  {
570
+ "epoch": 0.06,
571
+ "learning_rate": 0.00024115353891978431,
572
+ "loss": 0.561,
573
  "step": 94
574
  },
575
  {
576
+ "epoch": 0.06,
577
+ "learning_rate": 0.00023986218412121537,
578
+ "loss": 0.6056,
579
  "step": 95
580
  },
581
  {
582
+ "epoch": 0.06,
583
+ "learning_rate": 0.00023856035595088839,
584
+ "loss": 0.518,
585
  "step": 96
586
  },
587
  {
588
+ "epoch": 0.06,
589
+ "learning_rate": 0.00023724820613592337,
590
+ "loss": 0.5195,
591
  "step": 97
592
  },
593
  {
594
+ "epoch": 0.06,
595
+ "learning_rate": 0.00023592588760642044,
596
+ "loss": 0.5386,
597
  "step": 98
598
  },
599
  {
600
+ "epoch": 0.06,
601
+ "learning_rate": 0.00023459355447763596,
602
+ "loss": 0.4816,
603
  "step": 99
604
  },
605
  {
606
+ "epoch": 0.06,
607
+ "learning_rate": 0.00023325136203202049,
608
+ "loss": 0.3958,
609
  "step": 100
610
  },
611
  {
612
+ "epoch": 0.06,
613
+ "learning_rate": 0.00023189946670112069,
614
+ "loss": 0.7837,
615
  "step": 101
616
  },
617
  {
618
+ "epoch": 0.06,
619
+ "learning_rate": 0.00023053802604734757,
620
+ "loss": 0.8046,
621
  "step": 102
622
  },
623
  {
624
+ "epoch": 0.06,
625
+ "learning_rate": 0.00022916719874561226,
626
+ "loss": 0.8202,
627
  "step": 103
628
  },
629
  {
630
+ "epoch": 0.06,
631
+ "learning_rate": 0.0002277871445648332,
632
+ "loss": 0.8035,
633
  "step": 104
634
  },
635
  {
636
+ "epoch": 0.06,
637
+ "learning_rate": 0.00022639802434931444,
638
+ "loss": 0.8827,
639
  "step": 105
640
  },
641
  {
642
+ "epoch": 0.07,
643
+ "learning_rate": 0.000225,
644
+ "loss": 0.7623,
645
  "step": 106
646
  },
647
  {
648
+ "epoch": 0.07,
649
+ "learning_rate": 0.00022359323445560406,
650
+ "loss": 0.8376,
651
  "step": 107
652
  },
653
  {
654
+ "epoch": 0.07,
655
+ "learning_rate": 0.00022217789167362073,
656
+ "loss": 0.8254,
657
  "step": 108
658
  },
659
  {
660
+ "epoch": 0.07,
661
+ "learning_rate": 0.00022075413661121492,
662
+ "loss": 0.8591,
663
  "step": 109
664
  },
665
  {
666
+ "epoch": 0.07,
667
+ "learning_rate": 0.00021932213520599653,
668
+ "loss": 0.77,
669
  "step": 110
670
  },
671
  {
672
+ "epoch": 0.07,
673
+ "learning_rate": 0.00021788205435668083,
674
+ "loss": 0.8327,
675
  "step": 111
676
  },
677
  {
678
+ "epoch": 0.07,
679
+ "learning_rate": 0.00021643406190363624,
680
+ "loss": 0.8196,
681
  "step": 112
682
  },
683
  {
684
+ "epoch": 0.07,
685
+ "learning_rate": 0.00021497832660932295,
686
+ "loss": 0.7589,
687
  "step": 113
688
  },
689
  {
690
+ "epoch": 0.07,
691
+ "learning_rate": 0.00021351501813862356,
692
+ "loss": 0.8008,
693
  "step": 114
694
  },
695
  {
696
+ "epoch": 0.07,
697
+ "learning_rate": 0.0002120443070390687,
698
+ "loss": 0.7663,
699
  "step": 115
700
  },
701
  {
702
+ "epoch": 0.07,
703
+ "learning_rate": 0.00021056636472096025,
704
+ "loss": 0.8268,
705
  "step": 116
706
  },
707
  {
708
+ "epoch": 0.07,
709
+ "learning_rate": 0.00020908136343739307,
710
+ "loss": 0.7696,
711
  "step": 117
712
  },
713
  {
714
+ "epoch": 0.07,
715
+ "learning_rate": 0.00020758947626417943,
716
+ "loss": 0.8556,
717
  "step": 118
718
  },
719
  {
720
+ "epoch": 0.07,
721
+ "learning_rate": 0.0002060908770796769,
722
+ "loss": 0.8164,
723
  "step": 119
724
  },
725
  {
726
+ "epoch": 0.07,
727
+ "learning_rate": 0.00020458574054452313,
728
+ "loss": 0.8059,
729
  "step": 120
730
  },
731
  {
732
+ "epoch": 0.07,
733
+ "learning_rate": 0.00020307424208127912,
734
+ "loss": 0.8276,
735
  "step": 121
736
  },
737
  {
738
+ "epoch": 0.08,
739
+ "learning_rate": 0.00020155655785398393,
740
+ "loss": 0.7899,
741
  "step": 122
742
  },
743
  {
744
+ "epoch": 0.08,
745
+ "learning_rate": 0.0002000328647476231,
746
+ "loss": 0.7504,
747
  "step": 123
748
  },
749
  {
750
+ "epoch": 0.08,
751
+ "learning_rate": 0.00019850334034751226,
752
+ "loss": 0.8724,
753
  "step": 124
754
  },
755
  {
756
+ "epoch": 0.08,
757
+ "learning_rate": 0.00019696816291860038,
758
+ "loss": 0.8076,
759
  "step": 125
760
  },
761
  {
762
+ "epoch": 0.08,
763
+ "learning_rate": 0.0001954275113846926,
764
+ "loss": 0.7999,
765
  "step": 126
766
  },
767
  {
768
+ "epoch": 0.08,
769
+ "learning_rate": 0.00019388156530759712,
770
+ "loss": 0.7523,
771
  "step": 127
772
  },
773
  {
774
+ "epoch": 0.08,
775
+ "learning_rate": 0.00019233050486619713,
776
+ "loss": 0.7789,
777
  "step": 128
778
  },
779
  {
780
+ "epoch": 0.08,
781
+ "learning_rate": 0.0001907745108354514,
782
+ "loss": 0.7342,
783
  "step": 129
784
  },
785
  {
786
+ "epoch": 0.08,
787
+ "learning_rate": 0.00018921376456532482,
788
+ "loss": 0.7801,
789
  "step": 130
790
  },
791
  {
792
+ "epoch": 0.08,
793
+ "learning_rate": 0.00018764844795965229,
794
+ "loss": 0.7579,
795
  "step": 131
796
  },
797
  {
798
+ "epoch": 0.08,
799
+ "learning_rate": 0.00018607874345493805,
800
+ "loss": 0.6844,
801
  "step": 132
802
  },
803
  {
804
+ "epoch": 0.08,
805
+ "learning_rate": 0.00018450483399909263,
806
+ "loss": 0.7052,
807
  "step": 133
808
  },
809
  {
810
+ "epoch": 0.08,
811
+ "learning_rate": 0.00018292690303011076,
812
+ "loss": 0.6515,
813
  "step": 134
814
  },
815
  {
816
+ "epoch": 0.08,
817
+ "learning_rate": 0.00018134513445469127,
818
+ "loss": 0.6846,
819
  "step": 135
820
  },
821
  {
822
+ "epoch": 0.08,
823
+ "learning_rate": 0.00017975971262680347,
824
+ "loss": 0.6469,
825
  "step": 136
826
  },
827
  {
828
+ "epoch": 0.08,
829
+ "learning_rate": 0.00017817082232620052,
830
+ "loss": 0.6703,
831
  "step": 137
832
  },
833
  {
834
+ "epoch": 0.09,
835
+ "learning_rate": 0.00017657864873688343,
836
+ "loss": 0.6966,
837
  "step": 138
838
  },
839
  {
840
+ "epoch": 0.09,
841
+ "learning_rate": 0.00017498337742551817,
842
+ "loss": 0.6274,
843
  "step": 139
844
  },
845
  {
846
+ "epoch": 0.09,
847
+ "learning_rate": 0.00017338519431980796,
848
+ "loss": 0.6189,
849
  "step": 140
850
  },
851
  {
852
+ "epoch": 0.09,
853
+ "learning_rate": 0.00017178428568682353,
854
+ "loss": 0.6881,
855
  "step": 141
856
  },
857
  {
858
+ "epoch": 0.09,
859
+ "learning_rate": 0.0001701808381112938,
860
+ "loss": 0.6194,
861
  "step": 142
862
  },
863
  {
864
+ "epoch": 0.09,
865
+ "learning_rate": 0.00016857503847385953,
866
+ "loss": 0.5674,
867
  "step": 143
868
  },
869
  {
870
+ "epoch": 0.09,
871
+ "learning_rate": 0.00016696707392929266,
872
+ "loss": 0.5878,
873
  "step": 144
874
  },
875
  {
876
+ "epoch": 0.09,
877
+ "learning_rate": 0.0001653571318846834,
878
+ "loss": 0.5229,
879
  "step": 145
880
  },
881
  {
882
+ "epoch": 0.09,
883
+ "learning_rate": 0.00016374539997759821,
884
+ "loss": 0.535,
885
  "step": 146
886
  },
887
  {
888
+ "epoch": 0.09,
889
+ "learning_rate": 0.00016213206605421063,
890
+ "loss": 0.4889,
891
  "step": 147
892
  },
893
  {
894
+ "epoch": 0.09,
895
+ "learning_rate": 0.0001605173181474081,
896
+ "loss": 0.5009,
897
  "step": 148
898
  },
899
  {
900
+ "epoch": 0.09,
901
+ "learning_rate": 0.00015890134445487676,
902
+ "loss": 0.4467,
903
  "step": 149
904
  },
905
  {
906
+ "epoch": 0.09,
907
+ "learning_rate": 0.00015728433331716724,
908
+ "loss": 0.4275,
909
  "step": 150
910
  },
911
  {
912
+ "epoch": 0.09,
913
+ "learning_rate": 0.0001556664731957435,
914
+ "loss": 0.7781,
915
  "step": 151
916
  },
917
  {
918
+ "epoch": 0.09,
919
+ "learning_rate": 0.00015404795265101806,
920
+ "loss": 0.8081,
921
  "step": 152
922
  },
923
  {
924
+ "epoch": 0.09,
925
+ "learning_rate": 0.00015242896032037522,
926
+ "loss": 0.8014,
927
  "step": 153
928
  },
929
  {
930
+ "epoch": 0.1,
931
+ "learning_rate": 0.00015080968489618565,
932
+ "loss": 0.8031,
933
  "step": 154
934
  },
935
  {
936
+ "epoch": 0.1,
937
+ "learning_rate": 0.00014919031510381435,
938
+ "loss": 0.8138,
939
  "step": 155
940
  },
941
  {
942
+ "epoch": 0.1,
943
+ "learning_rate": 0.00014757103967962475,
944
+ "loss": 0.7578,
945
  "step": 156
946
  },
947
  {
948
+ "epoch": 0.1,
949
+ "learning_rate": 0.00014595204734898197,
950
+ "loss": 0.8272,
951
  "step": 157
952
  },
953
  {
954
+ "epoch": 0.1,
955
+ "learning_rate": 0.0001443335268042565,
956
+ "loss": 0.809,
957
  "step": 158
958
  },
959
  {
960
+ "epoch": 0.1,
961
+ "learning_rate": 0.0001427156666828328,
962
+ "loss": 0.8103,
963
  "step": 159
964
  },
965
  {
966
+ "epoch": 0.1,
967
+ "learning_rate": 0.00014109865554512319,
968
+ "loss": 0.7904,
969
  "step": 160
970
  },
971
  {
972
+ "epoch": 0.1,
973
+ "learning_rate": 0.00013948268185259188,
974
+ "loss": 0.7967,
975
  "step": 161
976
  },
977
  {
978
+ "epoch": 0.1,
979
+ "learning_rate": 0.00013786793394578937,
980
+ "loss": 0.8111,
981
  "step": 162
982
  },
983
  {
984
+ "epoch": 0.1,
985
+ "learning_rate": 0.0001362546000224018,
986
+ "loss": 0.787,
987
  "step": 163
988
  },
989
  {
990
+ "epoch": 0.1,
991
+ "learning_rate": 0.00013464286811531661,
992
+ "loss": 0.8282,
993
  "step": 164
994
  },
995
  {
996
+ "epoch": 0.1,
997
+ "learning_rate": 0.00013303292607070737,
998
+ "loss": 0.7792,
999
  "step": 165
1000
  },
1001
  {
1002
+ "epoch": 0.1,
1003
+ "learning_rate": 0.0001314249615261405,
1004
+ "loss": 0.7955,
1005
  "step": 166
1006
  },
1007
  {
1008
+ "epoch": 0.1,
1009
+ "learning_rate": 0.0001298191618887062,
1010
+ "loss": 0.7874,
1011
  "step": 167
1012
  },
1013
  {
1014
+ "epoch": 0.1,
1015
+ "learning_rate": 0.00012821571431317647,
1016
+ "loss": 0.7637,
1017
  "step": 168
1018
  },
1019
  {
1020
+ "epoch": 0.1,
1021
+ "learning_rate": 0.00012661480568019201,
1022
+ "loss": 0.7641,
1023
  "step": 169
1024
  },
1025
  {
1026
+ "epoch": 0.11,
1027
+ "learning_rate": 0.0001250166225744818,
1028
+ "loss": 0.8431,
1029
  "step": 170
1030
  },
1031
  {
1032
+ "epoch": 0.11,
1033
+ "learning_rate": 0.0001234213512631166,
1034
+ "loss": 0.8353,
1035
  "step": 171
1036
  },
1037
  {
1038
+ "epoch": 0.11,
1039
+ "learning_rate": 0.00012182917767379948,
1040
+ "loss": 0.7732,
1041
  "step": 172
1042
  },
1043
  {
1044
+ "epoch": 0.11,
1045
+ "learning_rate": 0.00012024028737319652,
1046
+ "loss": 0.789,
1047
  "step": 173
1048
  },
1049
  {
1050
+ "epoch": 0.11,
1051
+ "learning_rate": 0.00011865486554530873,
1052
+ "loss": 0.7473,
1053
  "step": 174
1054
  },
1055
  {
1056
+ "epoch": 0.11,
1057
+ "learning_rate": 0.0001170730969698893,
1058
+ "loss": 0.8313,
1059
  "step": 175
1060
  },
1061
  {
1062
+ "epoch": 0.11,
1063
+ "learning_rate": 0.00011549516600090737,
1064
+ "loss": 0.855,
1065
  "step": 176
1066
  },
1067
  {
1068
+ "epoch": 0.11,
1069
+ "learning_rate": 0.00011392125654506198,
1070
+ "loss": 0.7981,
1071
  "step": 177
1072
  },
1073
  {
1074
+ "epoch": 0.11,
1075
+ "learning_rate": 0.00011235155204034767,
1076
+ "loss": 0.7148,
1077
  "step": 178
1078
  },
1079
  {
1080
+ "epoch": 0.11,
1081
+ "learning_rate": 0.00011078623543467518,
1082
+ "loss": 0.7012,
1083
  "step": 179
1084
  },
1085
  {
1086
+ "epoch": 0.11,
1087
+ "learning_rate": 0.00010922548916454855,
1088
+ "loss": 0.7313,
1089
  "step": 180
1090
  },
1091
  {
1092
+ "epoch": 0.11,
1093
+ "learning_rate": 0.00010766949513380284,
1094
+ "loss": 0.6691,
1095
  "step": 181
1096
  },
1097
  {
1098
+ "epoch": 0.11,
1099
+ "learning_rate": 0.00010611843469240288,
1100
+ "loss": 0.658,
1101
  "step": 182
1102
  },
1103
  {
1104
+ "epoch": 0.11,
1105
+ "learning_rate": 0.00010457248861530741,
1106
+ "loss": 0.7276,
1107
  "step": 183
1108
  },
1109
  {
1110
+ "epoch": 0.11,
1111
+ "learning_rate": 0.00010303183708139964,
1112
+ "loss": 0.732,
1113
  "step": 184
1114
  },
1115
  {
1116
+ "epoch": 0.11,
1117
+ "learning_rate": 0.00010149665965248775,
1118
+ "loss": 0.6751,
1119
  "step": 185
1120
  },
1121
  {
1122
+ "epoch": 0.11,
1123
+ "learning_rate": 9.996713525237694e-05,
1124
+ "loss": 0.6968,
1125
  "step": 186
1126
  },
1127
  {
1128
+ "epoch": 0.12,
1129
+ "learning_rate": 9.8443442146016e-05,
1130
+ "loss": 0.6131,
1131
  "step": 187
1132
  },
1133
  {
1134
+ "epoch": 0.12,
1135
+ "learning_rate": 9.692575791872089e-05,
1136
+ "loss": 0.6497,
1137
  "step": 188
1138
  },
1139
  {
1140
+ "epoch": 0.12,
1141
+ "learning_rate": 9.541425945547687e-05,
1142
+ "loss": 0.5895,
1143
  "step": 189
1144
  },
1145
  {
1146
+ "epoch": 0.12,
1147
+ "learning_rate": 9.390912292032309e-05,
1148
+ "loss": 0.6346,
1149
  "step": 190
1150
  },
1151
  {
1152
+ "epoch": 0.12,
1153
+ "learning_rate": 9.241052373582057e-05,
1154
+ "loss": 0.5706,
1155
  "step": 191
1156
  },
1157
  {
1158
+ "epoch": 0.12,
1159
+ "learning_rate": 9.091863656260695e-05,
1160
+ "loss": 0.5917,
1161
  "step": 192
1162
  },
1163
  {
1164
+ "epoch": 0.12,
1165
+ "learning_rate": 8.943363527903976e-05,
1166
+ "loss": 0.5934,
1167
  "step": 193
1168
  },
1169
  {
1170
+ "epoch": 0.12,
1171
+ "learning_rate": 8.795569296093132e-05,
1172
+ "loss": 0.5427,
1173
  "step": 194
1174
  },
1175
  {
1176
+ "epoch": 0.12,
1177
+ "learning_rate": 8.648498186137653e-05,
1178
+ "loss": 0.5419,
1179
  "step": 195
1180
  },
1181
  {
1182
+ "epoch": 0.12,
1183
+ "learning_rate": 8.502167339067705e-05,
1184
+ "loss": 0.5431,
1185
  "step": 196
1186
  },
1187
  {
1188
+ "epoch": 0.12,
1189
+ "learning_rate": 8.356593809636371e-05,
1190
+ "loss": 0.4831,
1191
  "step": 197
1192
  },
1193
  {
1194
+ "epoch": 0.12,
1195
+ "learning_rate": 8.211794564331917e-05,
1196
+ "loss": 0.5207,
1197
  "step": 198
1198
  },
1199
  {
1200
+ "epoch": 0.12,
1201
+ "learning_rate": 8.067786479400346e-05,
1202
+ "loss": 0.4644,
1203
  "step": 199
1204
  },
1205
  {
1206
+ "epoch": 0.12,
1207
+ "learning_rate": 7.924586338878511e-05,
1208
+ "loss": 0.5012,
1209
  "step": 200
1210
  },
1211
  {
1212
+ "epoch": 0.12,
1213
+ "learning_rate": 7.782210832637923e-05,
1214
+ "loss": 0.7555,
1215
  "step": 201
1216
  },
1217
  {
1218
+ "epoch": 0.12,
1219
+ "learning_rate": 7.640676554439594e-05,
1220
+ "loss": 0.7989,
1221
  "step": 202
1222
  },
1223
  {
1224
+ "epoch": 0.13,
1225
+ "learning_rate": 7.500000000000002e-05,
1226
+ "loss": 0.7734,
1227
  "step": 203
1228
  },
1229
  {
1230
+ "epoch": 0.13,
1231
+ "learning_rate": 7.36019756506856e-05,
1232
+ "loss": 0.7744,
1233
  "step": 204
1234
  },
1235
  {
1236
+ "epoch": 0.13,
1237
+ "learning_rate": 7.22128554351668e-05,
1238
+ "loss": 0.768,
1239
  "step": 205
1240
  },
1241
  {
1242
+ "epoch": 0.13,
1243
+ "learning_rate": 7.083280125438766e-05,
1244
+ "loss": 0.775,
1245
  "step": 206
1246
  },
1247
  {
1248
+ "epoch": 0.13,
1249
+ "learning_rate": 6.946197395265242e-05,
1250
+ "loss": 0.8156,
1251
  "step": 207
1252
  },
1253
  {
1254
+ "epoch": 0.13,
1255
+ "learning_rate": 6.810053329887928e-05,
1256
+ "loss": 0.7672,
1257
  "step": 208
1258
  },
1259
  {
1260
+ "epoch": 0.13,
1261
+ "learning_rate": 6.674863796797953e-05,
1262
+ "loss": 0.8119,
1263
  "step": 209
1264
  },
1265
  {
1266
+ "epoch": 0.13,
1267
+ "learning_rate": 6.540644552236401e-05,
1268
+ "loss": 0.8254,
1269
  "step": 210
1270
  },
1271
  {
1272
+ "epoch": 0.13,
1273
+ "learning_rate": 6.407411239357953e-05,
1274
+ "loss": 0.7838,
1275
  "step": 211
1276
  },
1277
  {
1278
+ "epoch": 0.13,
1279
+ "learning_rate": 6.275179386407663e-05,
1280
+ "loss": 0.7809,
1281
  "step": 212
1282
  },
1283
  {
1284
+ "epoch": 0.13,
1285
+ "learning_rate": 6.143964404911164e-05,
1286
+ "loss": 0.7387,
1287
  "step": 213
1288
  },
1289
  {
1290
+ "epoch": 0.13,
1291
+ "learning_rate": 6.013781587878463e-05,
1292
+ "loss": 0.7791,
1293
  "step": 214
1294
  },
1295
  {
1296
+ "epoch": 0.13,
1297
+ "learning_rate": 5.8846461080215626e-05,
1298
+ "loss": 0.8441,
1299
  "step": 215
1300
  },
1301
  {
1302
+ "epoch": 0.13,
1303
+ "learning_rate": 5.756573015986089e-05,
1304
+ "loss": 0.8518,
1305
  "step": 216
1306
  },
1307
  {
1308
+ "epoch": 0.13,
1309
+ "learning_rate": 5.629577238597132e-05,
1310
+ "loss": 0.7721,
1311
  "step": 217
1312
  },
1313
  {
1314
+ "epoch": 0.13,
1315
+ "learning_rate": 5.503673577119552e-05,
1316
+ "loss": 0.7412,
1317
  "step": 218
1318
  },
1319
  {
1320
+ "epoch": 0.14,
1321
+ "learning_rate": 5.378876705532904e-05,
1322
+ "loss": 0.8937,
1323
  "step": 219
1324
  },
1325
  {
1326
+ "epoch": 0.14,
1327
+ "learning_rate": 5.2552011688211835e-05,
1328
+ "loss": 0.823,
1329
  "step": 220
1330
  },
1331
  {
1332
+ "epoch": 0.14,
1333
+ "learning_rate": 5.1326613812776434e-05,
1334
+ "loss": 0.7921,
1335
  "step": 221
1336
  },
1337
  {
1338
+ "epoch": 0.14,
1339
+ "learning_rate": 5.011271624824786e-05,
1340
+ "loss": 0.7972,
1341
  "step": 222
1342
  },
1343
  {
1344
+ "epoch": 0.14,
1345
+ "learning_rate": 4.891046047349837e-05,
1346
+ "loss": 0.8309,
1347
  "step": 223
1348
  },
1349
  {
1350
+ "epoch": 0.14,
1351
+ "learning_rate": 4.7719986610558234e-05,
1352
+ "loss": 0.7832,
1353
  "step": 224
1354
  },
1355
  {
1356
+ "epoch": 0.14,
1357
+ "learning_rate": 4.654143340828435e-05,
1358
+ "loss": 0.7451,
1359
  "step": 225
1360
  },
1361
  {
1362
+ "epoch": 0.14,
1363
+ "learning_rate": 4.537493822618958e-05,
1364
+ "loss": 0.8234,
1365
  "step": 226
1366
  },
1367
  {
1368
+ "epoch": 0.14,
1369
+ "learning_rate": 4.422063701843316e-05,
1370
+ "loss": 0.7882,
1371
  "step": 227
1372
  },
1373
  {
1374
+ "epoch": 0.14,
1375
+ "learning_rate": 4.3078664317975646e-05,
1376
+ "loss": 0.8251,
1377
  "step": 228
1378
  },
1379
  {
1380
+ "epoch": 0.14,
1381
+ "learning_rate": 4.194915322089898e-05,
1382
+ "loss": 0.8103,
1383
  "step": 229
1384
  },
1385
  {
1386
+ "epoch": 0.14,
1387
+ "learning_rate": 4.08322353708946e-05,
1388
+ "loss": 0.7449,
1389
  "step": 230
1390
  },
1391
  {
1392
+ "epoch": 0.14,
1393
+ "learning_rate": 3.972804094391998e-05,
1394
+ "loss": 0.75,
1395
  "step": 231
1396
  },
1397
  {
1398
+ "epoch": 0.14,
1399
+ "learning_rate": 3.863669863302697e-05,
1400
+ "loss": 0.7377,
1401
  "step": 232
1402
  },
1403
  {
1404
+ "epoch": 0.14,
1405
+ "learning_rate": 3.755833563336293e-05,
1406
+ "loss": 0.7428,
1407
  "step": 233
1408
  },
1409
  {
1410
+ "epoch": 0.14,
1411
+ "learning_rate": 3.64930776273457e-05,
1412
+ "loss": 0.7281,
1413
  "step": 234
1414
  },
1415
  {
1416
+ "epoch": 0.15,
1417
+ "learning_rate": 3.5441048770015954e-05,
1418
+ "loss": 0.676,
1419
  "step": 235
1420
  },
1421
  {
1422
+ "epoch": 0.15,
1423
+ "learning_rate": 3.4402371674566626e-05,
1424
+ "loss": 0.6793,
1425
  "step": 236
1426
  },
1427
  {
1428
+ "epoch": 0.15,
1429
+ "learning_rate": 3.3377167398052636e-05,
1430
+ "loss": 0.6211,
1431
  "step": 237
1432
  },
1433
  {
1434
+ "epoch": 0.15,
1435
+ "learning_rate": 3.2365555427281634e-05,
1436
+ "loss": 0.5825,
1437
  "step": 238
1438
  },
1439
  {
1440
+ "epoch": 0.15,
1441
+ "learning_rate": 3.136765366488817e-05,
1442
+ "loss": 0.6659,
1443
  "step": 239
1444
  },
1445
  {
1446
+ "epoch": 0.15,
1447
+ "learning_rate": 3.038357841559191e-05,
1448
+ "loss": 0.6726,
1449
  "step": 240
1450
  },
1451
  {
1452
+ "epoch": 0.15,
1453
+ "learning_rate": 2.941344437264249e-05,
1454
+ "loss": 0.595,
1455
  "step": 241
1456
  },
1457
  {
1458
+ "epoch": 0.15,
1459
+ "learning_rate": 2.8457364604452372e-05,
1460
+ "loss": 0.5769,
1461
  "step": 242
1462
  },
1463
  {
1464
+ "epoch": 0.15,
1465
+ "learning_rate": 2.7515450541418338e-05,
1466
+ "loss": 0.5748,
1467
  "step": 243
1468
  },
1469
  {
1470
+ "epoch": 0.15,
1471
+ "learning_rate": 2.658781196293482e-05,
1472
+ "loss": 0.5505,
1473
  "step": 244
1474
  },
1475
  {
1476
+ "epoch": 0.15,
1477
+ "learning_rate": 2.5674556984598822e-05,
1478
+ "loss": 0.5472,
1479
  "step": 245
1480
  },
1481
  {
1482
+ "epoch": 0.15,
1483
+ "learning_rate": 2.477579204560935e-05,
1484
+ "loss": 0.518,
1485
  "step": 246
1486
  },
1487
  {
1488
+ "epoch": 0.15,
1489
+ "learning_rate": 2.389162189636188e-05,
1490
+ "loss": 0.5002,
1491
  "step": 247
1492
  },
1493
  {
1494
+ "epoch": 0.15,
1495
+ "learning_rate": 2.3022149586239968e-05,
1496
+ "loss": 0.4444,
1497
  "step": 248
1498
  },
1499
  {
1500
+ "epoch": 0.15,
1501
+ "learning_rate": 2.216747645160462e-05,
1502
+ "loss": 0.5107,
1503
  "step": 249
1504
  },
1505
  {
1506
+ "epoch": 0.15,
1507
+ "learning_rate": 2.1327702103983863e-05,
1508
+ "loss": 0.4729,
1509
  "step": 250
1510
  },
1511
  {
1512
+ "epoch": 0.16,
1513
+ "learning_rate": 2.0502924418463013e-05,
1514
+ "loss": 0.7721,
1515
  "step": 251
1516
  },
1517
  {
1518
+ "epoch": 0.16,
1519
+ "learning_rate": 1.9693239522277327e-05,
1520
+ "loss": 0.774,
1521
  "step": 252
1522
  },
1523
  {
1524
+ "epoch": 0.16,
1525
+ "learning_rate": 1.889874178360864e-05,
1526
+ "loss": 0.8274,
1527
  "step": 253
1528
  },
1529
  {
1530
+ "epoch": 0.16,
1531
+ "learning_rate": 1.8119523800586568e-05,
1532
+ "loss": 0.7989,
1533
  "step": 254
1534
  },
1535
  {
1536
+ "epoch": 0.16,
1537
+ "learning_rate": 1.735567639049648e-05,
1538
+ "loss": 0.7691,
1539
  "step": 255
1540
  },
1541
  {
1542
+ "epoch": 0.16,
1543
+ "learning_rate": 1.6607288579194638e-05,
1544
+ "loss": 0.7938,
1545
  "step": 256
1546
  },
1547
  {
1548
+ "epoch": 0.16,
1549
+ "learning_rate": 1.5874447590732538e-05,
1550
+ "loss": 0.8506,
1551
  "step": 257
1552
  },
1553
  {
1554
+ "epoch": 0.16,
1555
+ "learning_rate": 1.5157238837190716e-05,
1556
+ "loss": 0.7695,
1557
  "step": 258
1558
  },
1559
  {
1560
+ "epoch": 0.16,
1561
+ "learning_rate": 1.4455745908724226e-05,
1562
+ "loss": 0.819,
1563
  "step": 259
1564
  },
1565
  {
1566
+ "epoch": 0.16,
1567
+ "learning_rate": 1.3770050563820179e-05,
1568
+ "loss": 0.7904,
1569
  "step": 260
1570
  },
1571
  {
1572
+ "epoch": 0.16,
1573
+ "learning_rate": 1.3100232719768994e-05,
1574
+ "loss": 0.8647,
1575
  "step": 261
1576
  },
1577
  {
1578
+ "epoch": 0.16,
1579
+ "learning_rate": 1.2446370443349863e-05,
1580
+ "loss": 0.777,
1581
  "step": 262
1582
  },
1583
  {
1584
+ "epoch": 0.16,
1585
+ "learning_rate": 1.180853994173236e-05,
1586
+ "loss": 0.7898,
1587
  "step": 263
1588
  },
1589
  {
1590
+ "epoch": 0.16,
1591
+ "learning_rate": 1.118681555359438e-05,
1592
+ "loss": 0.7466,
1593
  "step": 264
1594
  },
1595
  {
1596
+ "epoch": 0.16,
1597
+ "learning_rate": 1.058126974045811e-05,
1598
+ "loss": 0.8193,
1599
  "step": 265
1600
  },
1601
  {
1602
+ "epoch": 0.16,
1603
+ "learning_rate": 9.991973078244636e-06,
1604
+ "loss": 0.8072,
1605
  "step": 266
1606
  },
1607
  {
1608
+ "epoch": 0.17,
1609
+ "learning_rate": 9.418994249048472e-06,
1610
+ "loss": 0.8395,
1611
  "step": 267
1612
  },
1613
  {
1614
+ "epoch": 0.17,
1615
+ "learning_rate": 8.862400033132571e-06,
1616
+ "loss": 0.8109,
1617
  "step": 268
1618
  },
1619
  {
1620
+ "epoch": 0.17,
1621
+ "learning_rate": 8.322255301145204e-06,
1622
+ "loss": 0.8379,
1623
  "step": 269
1624
  },
1625
  {
1626
+ "epoch": 0.17,
1627
+ "learning_rate": 7.798623006559435e-06,
1628
+ "loss": 0.7609,
1629
  "step": 270
1630
  },
1631
  {
1632
+ "epoch": 0.17,
1633
+ "learning_rate": 7.291564178335718e-06,
1634
+ "loss": 0.7881,
1635
  "step": 271
1636
  },
1637
  {
1638
+ "epoch": 0.17,
1639
+ "learning_rate": 6.801137913809213e-06,
1640
+ "loss": 0.8359,
1641
  "step": 272
1642
  },
1643
  {
1644
+ "epoch": 0.17,
1645
+ "learning_rate": 6.3274013718019434e-06,
1646
+ "loss": 0.8019,
1647
  "step": 273
1648
  },
1649
  {
1650
+ "epoch": 0.17,
1651
+ "learning_rate": 5.870409765960965e-06,
1652
+ "loss": 0.7971,
1653
  "step": 274
1654
  },
1655
  {
1656
+ "epoch": 0.17,
1657
+ "learning_rate": 5.430216358323309e-06,
1658
+ "loss": 0.7786,
1659
  "step": 275
1660
  },
1661
  {
1662
+ "epoch": 0.17,
1663
+ "learning_rate": 5.006872453108329e-06,
1664
+ "loss": 0.7921,
1665
  "step": 276
1666
  },
1667
  {
1668
+ "epoch": 0.17,
1669
+ "learning_rate": 4.600427390738159e-06,
1670
+ "loss": 0.7785,
1671
  "step": 277
1672
  },
1673
  {
1674
+ "epoch": 0.17,
1675
+ "learning_rate": 4.210928542087206e-06,
1676
+ "loss": 0.7637,
1677
  "step": 278
1678
  },
1679
  {
1680
+ "epoch": 0.17,
1681
+ "learning_rate": 3.838421302961098e-06,
1682
+ "loss": 0.7673,
1683
  "step": 279
1684
  },
1685
  {
1686
+ "epoch": 0.17,
1687
+ "learning_rate": 3.482949088805742e-06,
1688
+ "loss": 0.7417,
1689
  "step": 280
1690
  },
1691
  {
1692
+ "epoch": 0.17,
1693
+ "learning_rate": 3.1445533296474478e-06,
1694
+ "loss": 0.7175,
1695
  "step": 281
1696
  },
1697
  {
1698
+ "epoch": 0.17,
1699
+ "learning_rate": 2.823273465264142e-06,
1700
+ "loss": 0.6789,
1701
  "step": 282
1702
  },
1703
  {
1704
+ "epoch": 0.17,
1705
+ "learning_rate": 2.519146940588762e-06,
1706
+ "loss": 0.6716,
1707
  "step": 283
1708
  },
1709
  {
1710
+ "epoch": 0.18,
1711
+ "learning_rate": 2.232209201345031e-06,
1712
+ "loss": 0.6743,
1713
  "step": 284
1714
  },
1715
  {
1716
+ "epoch": 0.18,
1717
+ "learning_rate": 1.9624936899163945e-06,
1718
+ "loss": 0.673,
1719
  "step": 285
1720
  },
1721
  {
1722
+ "epoch": 0.18,
1723
+ "learning_rate": 1.7100318414482061e-06,
1724
+ "loss": 0.6362,
1725
  "step": 286
1726
  },
1727
  {
1728
+ "epoch": 0.18,
1729
+ "learning_rate": 1.4748530801840074e-06,
1730
+ "loss": 0.673,
1731
  "step": 287
1732
  },
1733
  {
1734
+ "epoch": 0.18,
1735
+ "learning_rate": 1.2569848160362384e-06,
1736
+ "loss": 0.6348,
1737
  "step": 288
1738
  },
1739
  {
1740
+ "epoch": 0.18,
1741
+ "learning_rate": 1.056452441391542e-06,
1742
+ "loss": 0.6603,
1743
  "step": 289
1744
  },
1745
  {
1746
+ "epoch": 0.18,
1747
+ "learning_rate": 8.732793281513661e-07,
1748
+ "loss": 0.5877,
1749
  "step": 290
1750
  },
1751
  {
1752
+ "epoch": 0.18,
1753
+ "learning_rate": 7.07486825007908e-07,
1754
+ "loss": 0.6369,
1755
  "step": 291
1756
  },
1757
  {
1758
+ "epoch": 0.18,
1759
+ "learning_rate": 5.590942549560051e-07,
1760
+ "loss": 0.5664,
1761
  "step": 292
1762
  },
1763
  {
1764
+ "epoch": 0.18,
1765
+ "learning_rate": 4.281189130410534e-07,
1766
+ "loss": 0.5721,
1767
  "step": 293
1768
  },
1769
  {
1770
+ "epoch": 0.18,
1771
+ "learning_rate": 3.1457606434325266e-07,
1772
+ "loss": 0.5755,
1773
  "step": 294
1774
  },
1775
  {
1776
+ "epoch": 0.18,
1777
+ "learning_rate": 2.184789421984634e-07,
1778
+ "loss": 0.5584,
1779
  "step": 295
1780
  },
1781
  {
1782
+ "epoch": 0.18,
1783
+ "learning_rate": 1.3983874665589035e-07,
1784
+ "loss": 0.5746,
1785
  "step": 296
1786
  },
1787
  {
1788
+ "epoch": 0.18,
1789
+ "learning_rate": 7.866464317276e-08,
1790
+ "loss": 0.4979,
1791
  "step": 297
1792
  },
1793
  {
1794
+ "epoch": 0.18,
1795
+ "learning_rate": 3.4963761546041855e-08,
1796
+ "loss": 0.5017,
1797
  "step": 298
1798
  },
1799
  {
1800
+ "epoch": 0.18,
1801
+ "learning_rate": 8.741195081479747e-09,
1802
+ "loss": 0.5112,
1803
  "step": 299
1804
  },
1805
  {
1806
+ "epoch": 0.19,
1807
  "learning_rate": 0.0,
1808
+ "loss": 0.453,
1809
  "step": 300
1810
  }
1811
  ],
1812
+ "logging_steps": 1,
1813
  "max_steps": 300,
1814
  "num_train_epochs": 1,
1815
+ "save_steps": 50,
1816
+ "total_flos": 8.93034880303104e+16,
1817
  "trial_name": null,
1818
  "trial_params": null
1819
  }
checkpoint-300/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be007ae36129116338a68f438eb003e2347780a6625c4d4bcccce4e17179bba8
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1c02534982789891a108fd8845b01f09dd0f60e3bcbbfb171714be6598e3a93
3
  size 4027
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:484b0ce0401bc6add3f19fbb477c0135596aee4bb34588b55aa0d86a0936c0e5
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1c02534982789891a108fd8845b01f09dd0f60e3bcbbfb171714be6598e3a93
3
  size 4027