tanliboy commited on
Commit
acc6b79
·
verified ·
1 Parent(s): 6566486

Model save

Browse files
README.md CHANGED
@@ -2,15 +2,11 @@
2
  license: gemma
3
  base_model: google/gemma-2-9b
4
  tags:
5
- - alignment-handbook
6
- - trl
7
- - sft
8
- - generated_from_trainer
9
  - trl
10
  - sft
11
  - generated_from_trainer
12
  datasets:
13
- - HuggingFaceH4/ultrachat_200k
14
  model-index:
15
  - name: zephyr-gemma-2-9b-sft
16
  results: []
@@ -19,12 +15,12 @@ model-index:
19
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
20
  should probably proofread and complete it, then remove this comment. -->
21
 
22
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/tanliboy/huggingface/runs/v8t9mu31)
23
  # zephyr-gemma-2-9b-sft
24
 
25
- This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on the HuggingFaceH4/ultrachat_200k dataset.
26
  It achieves the following results on the evaluation set:
27
- - Loss: 1.0638
28
 
29
  ## Model description
30
 
@@ -61,12 +57,12 @@ The following hyperparameters were used during training:
61
 
62
  | Training Loss | Epoch | Step | Validation Loss |
63
  |:-------------:|:------:|:----:|:---------------:|
64
- | 1.096 | 0.9995 | 951 | 1.0638 |
65
 
66
 
67
  ### Framework versions
68
 
69
- - Transformers 4.42.4
70
  - Pytorch 2.3.1+cu121
71
  - Datasets 2.19.1
72
  - Tokenizers 0.19.1
 
2
  license: gemma
3
  base_model: google/gemma-2-9b
4
  tags:
 
 
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
  datasets:
9
+ - generator
10
  model-index:
11
  - name: zephyr-gemma-2-9b-sft
12
  results: []
 
15
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
16
  should probably proofread and complete it, then remove this comment. -->
17
 
18
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/tanliboy/huggingface/runs/lk9oo65j)
19
  # zephyr-gemma-2-9b-sft
20
 
21
+ This model is a fine-tuned version of [google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.0639
24
 
25
  ## Model description
26
 
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.0962 | 0.9995 | 951 | 1.0639 |
61
 
62
 
63
  ### Framework versions
64
 
65
+ - Transformers 4.43.0.dev0
66
  - Pytorch 2.3.1+cu121
67
  - Datasets 2.19.1
68
  - Tokenizers 0.19.1
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 0.999474513925381,
3
- "eval_loss": 1.0638214349746704,
4
- "eval_runtime": 592.7206,
5
- "eval_samples": 23109,
6
- "eval_samples_per_second": 22.734,
7
- "eval_steps_per_second": 0.712,
8
  "total_flos": 905758069751808.0,
9
- "train_loss": 1.1023603343061092,
10
- "train_runtime": 21224.38,
11
  "train_samples": 207864,
12
- "train_samples_per_second": 5.738,
13
- "train_steps_per_second": 0.045
14
  }
 
1
  {
2
  "epoch": 0.999474513925381,
 
 
 
 
 
3
  "total_flos": 905758069751808.0,
4
+ "train_loss": 1.102385032064155,
5
+ "train_runtime": 21864.5552,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 5.57,
8
+ "train_steps_per_second": 0.043
9
  }
generation_config.json CHANGED
@@ -4,5 +4,5 @@
4
  "cache_implementation": "hybrid",
5
  "eos_token_id": 1,
6
  "pad_token_id": 0,
7
- "transformers_version": "4.42.4"
8
  }
 
4
  "cache_implementation": "hybrid",
5
  "eos_token_id": 1,
6
  "pad_token_id": 0,
7
+ "transformers_version": "4.43.0.dev0"
8
  }
runs/Jul19_14-58-09_action-graph-trainer/events.out.tfevents.1721401122.action-graph-trainer.704471.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:726b9ae38b843a9e75a36bc294f4c0b0b9927e997a375271d2957174f34b6c0d
3
- size 45931
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:899e651843faea4fc5e62ec965e216057a695c914c9f61bddbdcc8c09817fa44
3
+ size 46556
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.999474513925381,
3
  "total_flos": 905758069751808.0,
4
- "train_loss": 1.1023603343061092,
5
- "train_runtime": 21224.38,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 5.738,
8
- "train_steps_per_second": 0.045
9
  }
 
1
  {
2
  "epoch": 0.999474513925381,
3
  "total_flos": 905758069751808.0,
4
+ "train_loss": 1.102385032064155,
5
+ "train_runtime": 21864.5552,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 5.57,
8
+ "train_steps_per_second": 0.043
9
  }
trainer_state.json CHANGED
@@ -150,1217 +150,1217 @@
150
  },
151
  {
152
  "epoch": 0.10509721492380451,
153
- "grad_norm": 0.9462854605403194,
154
  "learning_rate": 2.9998379903275155e-06,
155
  "loss": 1.1108,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.11035207566999475,
160
- "grad_norm": 0.9678742768022235,
161
  "learning_rate": 2.9991798860113893e-06,
162
  "loss": 1.1162,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.11560693641618497,
167
- "grad_norm": 0.9200494964131395,
168
  "learning_rate": 2.998015783397426e-06,
169
  "loss": 1.1091,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.1208617971623752,
174
- "grad_norm": 0.9581044551729833,
175
  "learning_rate": 2.9963460753897363e-06,
176
- "loss": 1.0963,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.1261166579085654,
181
- "grad_norm": 0.911911983689549,
182
  "learning_rate": 2.994171325542714e-06,
183
  "loss": 1.0911,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.13137151865475566,
188
- "grad_norm": 0.8931755968055785,
189
  "learning_rate": 2.991492267870822e-06,
190
  "loss": 1.0917,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.13662637940094588,
195
- "grad_norm": 0.882022590090018,
196
  "learning_rate": 2.9883098066008556e-06,
197
- "loss": 1.088,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.1418812401471361,
202
- "grad_norm": 0.9953724063836157,
203
  "learning_rate": 2.984625015866745e-06,
204
- "loss": 1.0859,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.14713610089332632,
209
- "grad_norm": 1.878615978166485,
210
  "learning_rate": 2.9804391393470235e-06,
211
- "loss": 1.1006,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.15239096163951654,
216
- "grad_norm": 0.970035528231665,
217
  "learning_rate": 2.975753589845059e-06,
218
- "loss": 1.1022,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.1576458223857068,
223
- "grad_norm": 0.9147530382000614,
224
  "learning_rate": 2.970569948812214e-06,
225
  "loss": 1.0691,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.162900683131897,
230
- "grad_norm": 0.9374168562620145,
231
  "learning_rate": 2.9648899658140767e-06,
232
- "loss": 1.0861,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.16815554387808723,
237
- "grad_norm": 0.8832219863831399,
238
  "learning_rate": 2.9587155579399543e-06,
239
  "loss": 1.0823,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.17341040462427745,
244
- "grad_norm": 0.9316518026448157,
245
  "learning_rate": 2.9520488091558225e-06,
246
- "loss": 1.0772,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.17866526537046767,
251
- "grad_norm": 0.9096920364451422,
252
  "learning_rate": 2.944891969600953e-06,
253
  "loss": 1.0875,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.1839201261166579,
258
- "grad_norm": 0.9007871965996922,
259
  "learning_rate": 2.9372474548284537e-06,
260
- "loss": 1.0949,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.18917498686284814,
265
- "grad_norm": 0.876698627967954,
266
  "learning_rate": 2.9291178449899786e-06,
267
  "loss": 1.0934,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.19442984760903836,
272
- "grad_norm": 0.9059798194790952,
273
  "learning_rate": 2.920505883964884e-06,
274
- "loss": 1.0918,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.19968470835522859,
279
- "grad_norm": 0.8939533236216217,
280
  "learning_rate": 2.9114144784341226e-06,
281
  "loss": 1.0854,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.2049395691014188,
286
- "grad_norm": 0.9262446830111234,
287
  "learning_rate": 2.9018466968991914e-06,
288
  "loss": 1.0749,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.21019442984760903,
293
- "grad_norm": 0.8549091150362632,
294
  "learning_rate": 2.8918057686464587e-06,
295
- "loss": 1.0825,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.21544929059379928,
300
- "grad_norm": 0.8724281083548375,
301
  "learning_rate": 2.881295082657229e-06,
302
  "loss": 1.0769,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.2207041513399895,
307
- "grad_norm": 0.8895456534687597,
308
  "learning_rate": 2.8703181864639013e-06,
309
- "loss": 1.0926,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.22595901208617972,
314
- "grad_norm": 0.9539096387018893,
315
  "learning_rate": 2.8588787849526228e-06,
316
- "loss": 1.0733,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.23121387283236994,
321
- "grad_norm": 0.8703158811213173,
322
  "learning_rate": 2.846980739112822e-06,
323
  "loss": 1.0749,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.23646873357856016,
328
- "grad_norm": 0.8785853904575431,
329
  "learning_rate": 2.834628064734065e-06,
330
- "loss": 1.0698,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.2417235943247504,
335
- "grad_norm": 0.889877361216805,
336
  "learning_rate": 2.821824931050655e-06,
337
  "loss": 1.0751,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.24697845507094063,
342
- "grad_norm": 0.8959992372183687,
343
  "learning_rate": 2.8085756593344505e-06,
344
  "loss": 1.07,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.2522333158171308,
349
- "grad_norm": 0.8657778595116195,
350
  "learning_rate": 2.794884721436361e-06,
351
- "loss": 1.0653,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.25748817656332107,
356
- "grad_norm": 0.8631791797614975,
357
  "learning_rate": 2.780756738277021e-06,
358
- "loss": 1.0979,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.2627430373095113,
363
- "grad_norm": 0.9028283942047837,
364
  "learning_rate": 2.766196478287156e-06,
365
  "loss": 1.0907,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.2679978980557015,
370
- "grad_norm": 1.105737597863472,
371
  "learning_rate": 2.751208855798155e-06,
372
  "loss": 1.0801,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.27325275880189176,
377
- "grad_norm": 0.9224613563982136,
378
  "learning_rate": 2.7357989293834005e-06,
379
- "loss": 1.0821,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.27850761954808195,
384
- "grad_norm": 0.8624741728874925,
385
  "learning_rate": 2.7199719001509175e-06,
386
- "loss": 1.075,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.2837624802942722,
391
- "grad_norm": 0.8715022403371375,
392
  "learning_rate": 2.7037331099879117e-06,
393
- "loss": 1.088,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.28901734104046245,
398
- "grad_norm": 0.8409580987018042,
399
  "learning_rate": 2.687088039757792e-06,
400
  "loss": 1.0797,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.29427220178665264,
405
- "grad_norm": 0.9670308072371985,
406
  "learning_rate": 2.6700423074502888e-06,
407
  "loss": 1.0717,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.2995270625328429,
412
- "grad_norm": 0.8449151190647378,
413
  "learning_rate": 2.652601666285289e-06,
414
- "loss": 1.1141,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.3047819232790331,
419
- "grad_norm": 0.911604937769195,
420
  "learning_rate": 2.6347720027710253e-06,
421
  "loss": 1.067,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.31003678402522333,
426
- "grad_norm": 0.8655301197268408,
427
  "learning_rate": 2.6165593347172837e-06,
428
- "loss": 1.0731,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.3152916447714136,
433
- "grad_norm": 0.880039459937268,
434
  "learning_rate": 2.5979698092042925e-06,
435
- "loss": 1.0877,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.3205465055176038,
440
- "grad_norm": 0.9360576412818693,
441
  "learning_rate": 2.5790097005079765e-06,
442
- "loss": 1.0899,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.325801366263794,
447
- "grad_norm": 0.8489030162852722,
448
  "learning_rate": 2.559685407982288e-06,
449
- "loss": 1.0667,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.3310562270099842,
454
- "grad_norm": 0.8638592230026647,
455
  "learning_rate": 2.5400034538993135e-06,
456
- "loss": 1.0867,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.33631108775617446,
461
- "grad_norm": 0.9101865357911341,
462
  "learning_rate": 2.519970481247901e-06,
463
- "loss": 1.0521,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.3415659485023647,
468
- "grad_norm": 0.8626549506195685,
469
  "learning_rate": 2.4995932514915404e-06,
470
- "loss": 1.0684,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.3468208092485549,
475
- "grad_norm": 0.8534791671434315,
476
  "learning_rate": 2.478878642286253e-06,
477
  "loss": 1.0644,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.35207566999474516,
482
- "grad_norm": 0.8353900580922416,
483
  "learning_rate": 2.4578336451592705e-06,
484
- "loss": 1.0813,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.35733053074093535,
489
- "grad_norm": 0.8518466429904108,
490
  "learning_rate": 2.4364653631492774e-06,
491
- "loss": 1.0626,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.3625853914871256,
496
- "grad_norm": 0.8341915866164423,
497
  "learning_rate": 2.414781008409014e-06,
498
  "loss": 1.0737,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.3678402522333158,
503
- "grad_norm": 0.8723412570499911,
504
  "learning_rate": 2.3927878997710575e-06,
505
  "loss": 1.0981,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.37309511297950604,
510
- "grad_norm": 0.841906976458512,
511
  "learning_rate": 2.3704934602775926e-06,
512
  "loss": 1.0827,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.3783499737256963,
517
- "grad_norm": 0.879119551017937,
518
  "learning_rate": 2.347905214675008e-06,
519
  "loss": 1.0713,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.3836048344718865,
524
- "grad_norm": 0.8718882110600734,
525
  "learning_rate": 2.3250307868741717e-06,
526
- "loss": 1.0707,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.38885969521807673,
531
- "grad_norm": 0.8002367155481277,
532
  "learning_rate": 2.3018778973772334e-06,
533
  "loss": 1.0573,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.3941145559642669,
538
- "grad_norm": 0.8221977480285654,
539
  "learning_rate": 2.278454360671823e-06,
540
- "loss": 1.0867,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.39936941671045717,
545
- "grad_norm": 0.8870576322337221,
546
  "learning_rate": 2.2547680825935325e-06,
547
- "loss": 1.0851,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.4046242774566474,
552
- "grad_norm": 0.8153258721073422,
553
  "learning_rate": 2.2308270576575657e-06,
554
  "loss": 1.0683,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.4098791382028376,
559
- "grad_norm": 0.8299248390745442,
560
  "learning_rate": 2.206639366360451e-06,
561
- "loss": 1.0883,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.41513399894902786,
566
- "grad_norm": 0.8891580311632679,
567
  "learning_rate": 2.1822131724527425e-06,
568
- "loss": 1.0587,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.42038885969521805,
573
- "grad_norm": 0.830348041290943,
574
  "learning_rate": 2.157556720183616e-06,
575
  "loss": 1.0542,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.4256437204414083,
580
- "grad_norm": 0.8254218651707308,
581
  "learning_rate": 2.1326783315182984e-06,
582
  "loss": 1.0666,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.43089858118759855,
587
- "grad_norm": 0.8211111371978261,
588
  "learning_rate": 2.1075864033292623e-06,
589
- "loss": 1.0723,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.43615344193378874,
594
- "grad_norm": 0.8308235610863787,
595
  "learning_rate": 2.082289404562144e-06,
596
- "loss": 1.0767,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.441408302679979,
601
- "grad_norm": 0.8127153392787219,
602
  "learning_rate": 2.0567958733773313e-06,
603
- "loss": 1.0614,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.4466631634261692,
608
- "grad_norm": 0.8123707093337108,
609
  "learning_rate": 2.0311144142681904e-06,
610
- "loss": 1.064,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.45191802417235943,
615
- "grad_norm": 0.829943736431802,
616
  "learning_rate": 2.005253695156909e-06,
617
  "loss": 1.0472,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.4571728849185497,
622
- "grad_norm": 0.831220074995737,
623
  "learning_rate": 1.9792224444689222e-06,
624
- "loss": 1.0615,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.4624277456647399,
629
- "grad_norm": 0.8745913548912436,
630
  "learning_rate": 1.9530294481869286e-06,
631
- "loss": 1.0802,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.4676826064109301,
636
- "grad_norm": 0.8449818880595231,
637
  "learning_rate": 1.926683546885469e-06,
638
  "loss": 1.0588,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.4729374671571203,
643
- "grad_norm": 0.8068996721164886,
644
  "learning_rate": 1.9001936327470894e-06,
645
- "loss": 1.0708,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.47819232790331057,
650
- "grad_norm": 0.824592533003207,
651
  "learning_rate": 1.873568646561075e-06,
652
- "loss": 1.0672,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.4834471886495008,
657
- "grad_norm": 0.83022160250907,
658
  "learning_rate": 1.8468175747057898e-06,
659
  "loss": 1.0748,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.488702049395691,
664
- "grad_norm": 0.8333740262644744,
665
  "learning_rate": 1.8199494461156204e-06,
666
- "loss": 1.0532,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.49395691014188126,
671
- "grad_norm": 0.8922005131672666,
672
  "learning_rate": 1.7929733292335591e-06,
673
  "loss": 1.0733,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.49921177088807145,
678
- "grad_norm": 0.9005753696617086,
679
  "learning_rate": 1.765898328950455e-06,
680
- "loss": 1.0647,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.5044666316342616,
685
- "grad_norm": 0.8247436750347265,
686
  "learning_rate": 1.738733583531959e-06,
687
  "loss": 1.0802,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.509721492380452,
692
- "grad_norm": 0.841670784750436,
693
  "learning_rate": 1.7114882615342073e-06,
694
  "loss": 1.056,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.5149763531266421,
699
- "grad_norm": 0.8182688706522747,
700
  "learning_rate": 1.6841715587092798e-06,
701
- "loss": 1.0783,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.5202312138728323,
706
- "grad_norm": 1.239386954886359,
707
  "learning_rate": 1.6567926949014804e-06,
708
  "loss": 1.0745,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.5254860746190226,
713
- "grad_norm": 0.8030922261487655,
714
  "learning_rate": 1.6293609109354836e-06,
715
- "loss": 1.0612,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.5307409353652128,
720
- "grad_norm": 0.8189314991097896,
721
  "learning_rate": 1.601885465497404e-06,
722
- "loss": 1.065,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.535995796111403,
727
- "grad_norm": 0.9050946114254789,
728
  "learning_rate": 1.5743756320098334e-06,
729
  "loss": 1.0643,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.5412506568575933,
734
- "grad_norm": 0.8807760866261877,
735
  "learning_rate": 1.5468406955019059e-06,
736
- "loss": 1.0569,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.5465055176037835,
741
- "grad_norm": 0.8569862964159628,
742
  "learning_rate": 1.5192899494754443e-06,
743
  "loss": 1.0731,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.5517603783499737,
748
- "grad_norm": 0.8091790353711866,
749
  "learning_rate": 1.4917326927682494e-06,
750
- "loss": 1.0703,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.5570152390961639,
755
- "grad_norm": 0.822047948500721,
756
  "learning_rate": 1.4641782264155852e-06,
757
- "loss": 1.0737,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.5622700998423542,
762
- "grad_norm": 0.8252210935515614,
763
  "learning_rate": 1.4366358505109237e-06,
764
  "loss": 1.0857,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.5675249605885444,
769
- "grad_norm": 0.8152512103821862,
770
  "learning_rate": 1.4091148610670098e-06,
771
- "loss": 1.0697,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.5727798213347346,
776
- "grad_norm": 0.8348760975293736,
777
  "learning_rate": 1.3816245468782988e-06,
778
- "loss": 1.0598,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.5780346820809249,
783
- "grad_norm": 0.8008142952039858,
784
  "learning_rate": 1.3541741863858352e-06,
785
- "loss": 1.0532,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.5832895428271151,
790
- "grad_norm": 0.8525665050865836,
791
  "learning_rate": 1.326773044545621e-06,
792
  "loss": 1.0672,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.5885444035733053,
797
- "grad_norm": 0.828489906999064,
798
  "learning_rate": 1.299430369701541e-06,
799
- "loss": 1.0578,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.5937992643194955,
804
- "grad_norm": 0.8071915455528251,
805
  "learning_rate": 1.272155390463889e-06,
806
  "loss": 1.084,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.5990541250656858,
811
- "grad_norm": 0.8213168186088048,
812
  "learning_rate": 1.2449573125945607e-06,
813
- "loss": 1.0678,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.604308985811876,
818
- "grad_norm": 0.808856584314896,
819
  "learning_rate": 1.2178453158999509e-06,
820
- "loss": 1.0705,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.6095638465580662,
825
- "grad_norm": 0.8097966068652359,
826
  "learning_rate": 1.1908285511326195e-06,
827
- "loss": 1.0561,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.6148187073042565,
832
- "grad_norm": 0.8103079561814972,
833
  "learning_rate": 1.1639161369027564e-06,
834
- "loss": 1.0543,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.6200735680504467,
839
- "grad_norm": 0.7934957921328578,
840
  "learning_rate": 1.1371171566004986e-06,
841
- "loss": 1.0507,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.6253284287966369,
846
- "grad_norm": 0.7898748331831235,
847
  "learning_rate": 1.1104406553301357e-06,
848
- "loss": 1.0637,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.6305832895428272,
853
- "grad_norm": 0.8261021360086062,
854
  "learning_rate": 1.0838956368572335e-06,
855
  "loss": 1.0526,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.6358381502890174,
860
- "grad_norm": 0.8086241727940497,
861
  "learning_rate": 1.0574910605697135e-06,
862
  "loss": 1.0715,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.6410930110352075,
867
- "grad_norm": 0.9088888967600997,
868
  "learning_rate": 1.03123583845391e-06,
869
  "loss": 1.0692,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.6463478717813977,
874
- "grad_norm": 0.8079809849272349,
875
  "learning_rate": 1.0051388320866258e-06,
876
  "loss": 1.0583,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.651602732527588,
881
- "grad_norm": 0.8422109756331945,
882
  "learning_rate": 9.792088496441992e-07,
883
- "loss": 1.0819,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.6568575932737782,
888
- "grad_norm": 0.8172134359132237,
889
  "learning_rate": 9.53454642929601e-07,
890
  "loss": 1.0772,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.6621124540199684,
895
- "grad_norm": 0.9233197165497478,
896
  "learning_rate": 9.278849044185509e-07,
897
- "loss": 1.0637,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.6673673147661587,
902
- "grad_norm": 0.9171808214559961,
903
  "learning_rate": 9.025082643256647e-07,
904
- "loss": 1.043,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.6726221755123489,
909
- "grad_norm": 0.8043456622976703,
910
  "learning_rate": 8.77333287691609e-07,
911
- "loss": 1.0584,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.6778770362585391,
916
- "grad_norm": 0.8108655439985347,
917
  "learning_rate": 8.523684714922608e-07,
918
  "loss": 1.0742,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.6831318970047294,
923
- "grad_norm": 0.7968848199287993,
924
  "learning_rate": 8.276222417708309e-07,
925
  "loss": 1.0557,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.6883867577509196,
930
- "grad_norm": 0.8129711567557044,
931
  "learning_rate": 8.031029507939401e-07,
932
- "loss": 1.0548,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.6936416184971098,
937
- "grad_norm": 1.004235271804759,
938
  "learning_rate": 7.788188742325803e-07,
939
- "loss": 1.0612,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.6988964792433,
944
- "grad_norm": 0.8208103251521766,
945
  "learning_rate": 7.547782083689479e-07,
946
  "loss": 1.0643,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.7041513399894903,
951
- "grad_norm": 0.8005037088376794,
952
  "learning_rate": 7.309890673300506e-07,
953
- "loss": 1.045,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.7094062007356805,
958
- "grad_norm": 1.3197940343995904,
959
  "learning_rate": 7.074594803490618e-07,
960
- "loss": 1.0518,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.7146610614818707,
965
- "grad_norm": 0.8555830044827688,
966
  "learning_rate": 6.841973890553168e-07,
967
- "loss": 1.0611,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.719915922228061,
972
- "grad_norm": 0.8099691174064823,
973
  "learning_rate": 6.6121064479388e-07,
974
  "loss": 1.0365,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.7251707829742512,
979
- "grad_norm": 0.8053501843842861,
980
  "learning_rate": 6.385070059755846e-07,
981
- "loss": 1.074,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.7304256437204414,
986
- "grad_norm": 0.8346149088986957,
987
  "learning_rate": 6.160941354584404e-07,
988
- "loss": 1.0609,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.7356805044666316,
993
- "grad_norm": 0.8155402643621729,
994
  "learning_rate": 5.93979597961289e-07,
995
  "loss": 1.0505,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.7409353652128219,
1000
- "grad_norm": 0.7796064768407052,
1001
  "learning_rate": 5.721708575105861e-07,
1002
  "loss": 1.0589,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.7461902259590121,
1007
- "grad_norm": 0.8275975218466692,
1008
  "learning_rate": 5.506752749211673e-07,
1009
  "loss": 1.0546,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.7514450867052023,
1014
- "grad_norm": 0.8163230049703886,
1015
  "learning_rate": 5.295001053118499e-07,
1016
- "loss": 1.0562,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.7566999474513926,
1021
- "grad_norm": 0.985195116035074,
1022
  "learning_rate": 5.086524956567084e-07,
1023
- "loss": 1.0713,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.7619548081975828,
1028
- "grad_norm": 0.96356977644872,
1029
  "learning_rate": 4.88139482372852e-07,
1030
  "loss": 1.0375,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.767209668943773,
1035
- "grad_norm": 0.8163211083671094,
1036
  "learning_rate": 4.679679889455153e-07,
1037
- "loss": 1.0525,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.7724645296899633,
1042
- "grad_norm": 0.7987340798976842,
1043
  "learning_rate": 4.4814482359126713e-07,
1044
- "loss": 1.082,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.7777193904361535,
1049
- "grad_norm": 0.8117501301968848,
1050
  "learning_rate": 4.2867667696012255e-07,
1051
- "loss": 1.0626,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.7829742511823437,
1056
- "grad_norm": 0.8929835779245712,
1057
  "learning_rate": 4.0957011987733655e-07,
1058
- "loss": 1.076,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.7882291119285338,
1063
- "grad_norm": 0.8088648176966625,
1064
  "learning_rate": 3.908316011256419e-07,
1065
  "loss": 1.0744,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.7934839726747241,
1070
- "grad_norm": 0.7980716806439362,
1071
  "learning_rate": 3.7246744526867525e-07,
1072
- "loss": 1.0566,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.7987388334209143,
1077
- "grad_norm": 0.8089436395697871,
1078
  "learning_rate": 3.5448385051633225e-07,
1079
- "loss": 1.0465,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.8039936941671045,
1084
- "grad_norm": 0.8015612514297874,
1085
  "learning_rate": 3.368868866327678e-07,
1086
- "loss": 1.0611,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.8092485549132948,
1091
- "grad_norm": 0.7877310543664608,
1092
  "learning_rate": 3.1968249288774887e-07,
1093
- "loss": 1.0604,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.814503415659485,
1098
- "grad_norm": 0.8023241081523638,
1099
  "learning_rate": 3.0287647605205155e-07,
1100
- "loss": 1.0723,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.8197582764056752,
1105
- "grad_norm": 0.8051580485406397,
1106
  "learning_rate": 2.86474508437579e-07,
1107
  "loss": 1.0579,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.8250131371518655,
1112
- "grad_norm": 0.8081087810817446,
1113
  "learning_rate": 2.704821259828608e-07,
1114
  "loss": 1.0684,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.8302679978980557,
1119
- "grad_norm": 0.8036521909829868,
1120
  "learning_rate": 2.5490472638458195e-07,
1121
- "loss": 1.0484,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.8355228586442459,
1126
- "grad_norm": 0.8069169918745295,
1127
  "learning_rate": 2.3974756727576886e-07,
1128
  "loss": 1.0698,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.8407777193904361,
1133
- "grad_norm": 0.8010632009065842,
1134
  "learning_rate": 2.2501576445125077e-07,
1135
- "loss": 1.0592,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.8460325801366264,
1140
- "grad_norm": 0.8112748895796706,
1141
  "learning_rate": 2.1071429014099365e-07,
1142
- "loss": 1.063,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.8512874408828166,
1147
- "grad_norm": 0.7987887307844111,
1148
  "learning_rate": 1.9684797133188865e-07,
1149
- "loss": 1.0396,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.8565423016290068,
1154
- "grad_norm": 0.7974743093384434,
1155
  "learning_rate": 1.8342148813856414e-07,
1156
- "loss": 1.0497,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 0.8617971623751971,
1161
- "grad_norm": 0.9606632973010945,
1162
  "learning_rate": 1.7043937222376766e-07,
1163
  "loss": 1.0484,
1164
  "step": 820
1165
  },
1166
  {
1167
  "epoch": 0.8670520231213873,
1168
- "grad_norm": 0.8027436257960178,
1169
  "learning_rate": 1.579060052688548e-07,
1170
- "loss": 1.0674,
1171
  "step": 825
1172
  },
1173
  {
1174
  "epoch": 0.8723068838675775,
1175
- "grad_norm": 0.8077058467523247,
1176
  "learning_rate": 1.4582561749489847e-07,
1177
  "loss": 1.0658,
1178
  "step": 830
1179
  },
1180
  {
1181
  "epoch": 0.8775617446137677,
1182
- "grad_norm": 0.796090164625106,
1183
  "learning_rate": 1.3420228623491742e-07,
1184
- "loss": 1.0339,
1185
  "step": 835
1186
  },
1187
  {
1188
  "epoch": 0.882816605359958,
1189
- "grad_norm": 0.7956190292731135,
1190
  "learning_rate": 1.2303993455770946e-07,
1191
- "loss": 1.0678,
1192
  "step": 840
1193
  },
1194
  {
1195
  "epoch": 0.8880714661061482,
1196
- "grad_norm": 0.8060093191984734,
1197
  "learning_rate": 1.1234232994374916e-07,
1198
- "loss": 1.0562,
1199
  "step": 845
1200
  },
1201
  {
1202
  "epoch": 0.8933263268523384,
1203
- "grad_norm": 0.8052954491456666,
1204
  "learning_rate": 1.0211308301360039e-07,
1205
  "loss": 1.0635,
1206
  "step": 850
1207
  },
1208
  {
1209
  "epoch": 0.8985811875985287,
1210
- "grad_norm": 0.789706910907941,
1211
  "learning_rate": 9.235564630927196e-08,
1212
- "loss": 1.0497,
1213
  "step": 855
1214
  },
1215
  {
1216
  "epoch": 0.9038360483447189,
1217
- "grad_norm": 0.9407705619653095,
1218
  "learning_rate": 8.307331312892601e-08,
1219
  "loss": 1.0573,
1220
  "step": 860
1221
  },
1222
  {
1223
  "epoch": 0.9090909090909091,
1224
- "grad_norm": 0.7849160686891212,
1225
  "learning_rate": 7.426921641533562e-08,
1226
- "loss": 1.0535,
1227
  "step": 865
1228
  },
1229
  {
1230
  "epoch": 0.9143457698370994,
1231
- "grad_norm": 0.7819582818532375,
1232
  "learning_rate": 6.594632769846354e-08,
1233
  "loss": 1.0605,
1234
  "step": 870
1235
  },
1236
  {
1237
  "epoch": 0.9196006305832896,
1238
- "grad_norm": 0.8235274235744744,
1239
  "learning_rate": 5.810745609252166e-08,
1240
- "loss": 1.0481,
1241
  "step": 875
1242
  },
1243
  {
1244
  "epoch": 0.9248554913294798,
1245
- "grad_norm": 0.8326201665614436,
1246
  "learning_rate": 5.0755247347847814e-08,
1247
- "loss": 1.05,
1248
  "step": 880
1249
  },
1250
  {
1251
  "epoch": 0.9301103520756699,
1252
- "grad_norm": 0.8084764417080952,
1253
  "learning_rate": 4.389218295792002e-08,
1254
- "loss": 1.0649,
1255
  "step": 885
1256
  },
1257
  {
1258
  "epoch": 0.9353652128218602,
1259
- "grad_norm": 0.8589755183334531,
1260
  "learning_rate": 3.7520579321812186e-08,
1261
- "loss": 1.073,
1262
  "step": 890
1263
  },
1264
  {
1265
  "epoch": 0.9406200735680504,
1266
- "grad_norm": 0.8084706845759793,
1267
  "learning_rate": 3.1642586962369765e-08,
1268
  "loss": 1.0363,
1269
  "step": 895
1270
  },
1271
  {
1272
  "epoch": 0.9458749343142406,
1273
- "grad_norm": 0.7879594862220682,
1274
  "learning_rate": 2.6260189800372757e-08,
1275
  "loss": 1.0517,
1276
  "step": 900
1277
  },
1278
  {
1279
  "epoch": 0.9511297950604309,
1280
- "grad_norm": 0.7751896271813346,
1281
  "learning_rate": 2.13752044849288e-08,
1282
- "loss": 1.0625,
1283
  "step": 905
1284
  },
1285
  {
1286
  "epoch": 0.9563846558066211,
1287
- "grad_norm": 0.8037723865057425,
1288
  "learning_rate": 1.698927978032383e-08,
1289
- "loss": 1.0486,
1290
  "step": 910
1291
  },
1292
  {
1293
  "epoch": 0.9616395165528113,
1294
- "grad_norm": 0.7949419082977591,
1295
  "learning_rate": 1.3103896009537208e-08,
1296
  "loss": 1.0476,
1297
  "step": 915
1298
  },
1299
  {
1300
  "epoch": 0.9668943772990016,
1301
- "grad_norm": 0.7711064252783653,
1302
  "learning_rate": 9.720364554606898e-09,
1303
- "loss": 1.0553,
1304
  "step": 920
1305
  },
1306
  {
1307
  "epoch": 0.9721492380451918,
1308
- "grad_norm": 0.9190212136355437,
1309
  "learning_rate": 6.839827414016675e-09,
1310
  "loss": 1.0636,
1311
  "step": 925
1312
  },
1313
  {
1314
  "epoch": 0.977404098791382,
1315
- "grad_norm": 0.7895845548954091,
1316
  "learning_rate": 4.463256817252792e-09,
1317
  "loss": 1.0626,
1318
  "step": 930
1319
  },
1320
  {
1321
  "epoch": 0.9826589595375722,
1322
- "grad_norm": 0.7799515201158175,
1323
  "learning_rate": 2.5914548966596285e-09,
1324
- "loss": 1.0584,
1325
  "step": 935
1326
  },
1327
  {
1328
  "epoch": 0.9879138202837625,
1329
- "grad_norm": 0.7896292102395543,
1330
  "learning_rate": 1.2250534167067561e-09,
1331
- "loss": 1.0562,
1332
  "step": 940
1333
  },
1334
  {
1335
  "epoch": 0.9931686810299527,
1336
- "grad_norm": 0.8122355011762691,
1337
  "learning_rate": 3.6451356075817287e-10,
1338
- "loss": 1.051,
1339
  "step": 945
1340
  },
1341
  {
1342
  "epoch": 0.9984235417761429,
1343
- "grad_norm": 0.7870290469964676,
1344
  "learning_rate": 1.0125775414981941e-11,
1345
- "loss": 1.096,
1346
  "step": 950
1347
  },
1348
  {
1349
  "epoch": 0.999474513925381,
1350
- "eval_loss": 1.0638214349746704,
1351
- "eval_runtime": 593.8184,
1352
- "eval_samples_per_second": 22.692,
1353
- "eval_steps_per_second": 0.711,
1354
  "step": 951
1355
  },
1356
  {
1357
  "epoch": 0.999474513925381,
1358
  "step": 951,
1359
  "total_flos": 905758069751808.0,
1360
- "train_loss": 1.1023603343061092,
1361
- "train_runtime": 21224.38,
1362
- "train_samples_per_second": 5.738,
1363
- "train_steps_per_second": 0.045
1364
  }
1365
  ],
1366
  "logging_steps": 5,
 
150
  },
151
  {
152
  "epoch": 0.10509721492380451,
153
+ "grad_norm": 1.0117170886130586,
154
  "learning_rate": 2.9998379903275155e-06,
155
  "loss": 1.1108,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.11035207566999475,
160
+ "grad_norm": 0.906919049524167,
161
  "learning_rate": 2.9991798860113893e-06,
162
  "loss": 1.1162,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.11560693641618497,
167
+ "grad_norm": 0.9141362173846742,
168
  "learning_rate": 2.998015783397426e-06,
169
  "loss": 1.1091,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.1208617971623752,
174
+ "grad_norm": 0.9619396255470063,
175
  "learning_rate": 2.9963460753897363e-06,
176
+ "loss": 1.0962,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.1261166579085654,
181
+ "grad_norm": 1.0465936968149612,
182
  "learning_rate": 2.994171325542714e-06,
183
  "loss": 1.0911,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.13137151865475566,
188
+ "grad_norm": 0.9530685775511196,
189
  "learning_rate": 2.991492267870822e-06,
190
  "loss": 1.0917,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.13662637940094588,
195
+ "grad_norm": 0.9008971542587714,
196
  "learning_rate": 2.9883098066008556e-06,
197
+ "loss": 1.0879,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.1418812401471361,
202
+ "grad_norm": 1.010852514785442,
203
  "learning_rate": 2.984625015866745e-06,
204
+ "loss": 1.086,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.14713610089332632,
209
+ "grad_norm": 0.8992929518552587,
210
  "learning_rate": 2.9804391393470235e-06,
211
+ "loss": 1.1009,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.15239096163951654,
216
+ "grad_norm": 0.9062761866531701,
217
  "learning_rate": 2.975753589845059e-06,
218
+ "loss": 1.1019,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.1576458223857068,
223
+ "grad_norm": 0.8610097516319934,
224
  "learning_rate": 2.970569948812214e-06,
225
  "loss": 1.0691,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.162900683131897,
230
+ "grad_norm": 0.8872085969729788,
231
  "learning_rate": 2.9648899658140767e-06,
232
+ "loss": 1.0862,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.16815554387808723,
237
+ "grad_norm": 1.0208916358421982,
238
  "learning_rate": 2.9587155579399543e-06,
239
  "loss": 1.0823,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.17341040462427745,
244
+ "grad_norm": 1.000083341373154,
245
  "learning_rate": 2.9520488091558225e-06,
246
+ "loss": 1.0771,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.17866526537046767,
251
+ "grad_norm": 0.9057099612333195,
252
  "learning_rate": 2.944891969600953e-06,
253
  "loss": 1.0875,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.1839201261166579,
258
+ "grad_norm": 0.9155112430806412,
259
  "learning_rate": 2.9372474548284537e-06,
260
+ "loss": 1.095,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.18917498686284814,
265
+ "grad_norm": 0.8983015781319096,
266
  "learning_rate": 2.9291178449899786e-06,
267
  "loss": 1.0934,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.19442984760903836,
272
+ "grad_norm": 0.9013247242472799,
273
  "learning_rate": 2.920505883964884e-06,
274
+ "loss": 1.0917,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.19968470835522859,
279
+ "grad_norm": 0.892018911723705,
280
  "learning_rate": 2.9114144784341226e-06,
281
  "loss": 1.0854,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.2049395691014188,
286
+ "grad_norm": 0.9231553972162896,
287
  "learning_rate": 2.9018466968991914e-06,
288
  "loss": 1.0749,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.21019442984760903,
293
+ "grad_norm": 0.8456793004463601,
294
  "learning_rate": 2.8918057686464587e-06,
295
+ "loss": 1.0823,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.21544929059379928,
300
+ "grad_norm": 0.8761446392793291,
301
  "learning_rate": 2.881295082657229e-06,
302
  "loss": 1.0769,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.2207041513399895,
307
+ "grad_norm": 0.8860527749104554,
308
  "learning_rate": 2.8703181864639013e-06,
309
+ "loss": 1.0925,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.22595901208617972,
314
+ "grad_norm": 0.8708280345626856,
315
  "learning_rate": 2.8588787849526228e-06,
316
+ "loss": 1.0731,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.23121387283236994,
321
+ "grad_norm": 0.8688347790794807,
322
  "learning_rate": 2.846980739112822e-06,
323
  "loss": 1.0749,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.23646873357856016,
328
+ "grad_norm": 0.8729839981153089,
329
  "learning_rate": 2.834628064734065e-06,
330
+ "loss": 1.0697,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.2417235943247504,
335
+ "grad_norm": 0.8993877107760438,
336
  "learning_rate": 2.821824931050655e-06,
337
  "loss": 1.0751,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.24697845507094063,
342
+ "grad_norm": 0.8452099155218541,
343
  "learning_rate": 2.8085756593344505e-06,
344
  "loss": 1.07,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.2522333158171308,
349
+ "grad_norm": 0.8798731633285007,
350
  "learning_rate": 2.794884721436361e-06,
351
+ "loss": 1.0655,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.25748817656332107,
356
+ "grad_norm": 0.895358087891481,
357
  "learning_rate": 2.780756738277021e-06,
358
+ "loss": 1.0983,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.2627430373095113,
363
+ "grad_norm": 0.8609931821431536,
364
  "learning_rate": 2.766196478287156e-06,
365
  "loss": 1.0907,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.2679978980557015,
370
+ "grad_norm": 0.8553591777726404,
371
  "learning_rate": 2.751208855798155e-06,
372
  "loss": 1.0801,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.27325275880189176,
377
+ "grad_norm": 0.8654715123002859,
378
  "learning_rate": 2.7357989293834005e-06,
379
+ "loss": 1.082,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.27850761954808195,
384
+ "grad_norm": 0.8697051687346281,
385
  "learning_rate": 2.7199719001509175e-06,
386
+ "loss": 1.0748,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.2837624802942722,
391
+ "grad_norm": 0.880312866484559,
392
  "learning_rate": 2.7037331099879117e-06,
393
+ "loss": 1.0878,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.28901734104046245,
398
+ "grad_norm": 0.8452698137198734,
399
  "learning_rate": 2.687088039757792e-06,
400
  "loss": 1.0797,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.29427220178665264,
405
+ "grad_norm": 0.9791182527762319,
406
  "learning_rate": 2.6700423074502888e-06,
407
  "loss": 1.0717,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.2995270625328429,
412
+ "grad_norm": 0.8645394739775393,
413
  "learning_rate": 2.652601666285289e-06,
414
+ "loss": 1.114,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.3047819232790331,
419
+ "grad_norm": 0.9004251617653912,
420
  "learning_rate": 2.6347720027710253e-06,
421
  "loss": 1.067,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.31003678402522333,
426
+ "grad_norm": 0.872604926778432,
427
  "learning_rate": 2.6165593347172837e-06,
428
+ "loss": 1.0732,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.3152916447714136,
433
+ "grad_norm": 0.8929460807825601,
434
  "learning_rate": 2.5979698092042925e-06,
435
+ "loss": 1.0876,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.3205465055176038,
440
+ "grad_norm": 0.8588249540629153,
441
  "learning_rate": 2.5790097005079765e-06,
442
+ "loss": 1.0897,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.325801366263794,
447
+ "grad_norm": 0.8727624932776766,
448
  "learning_rate": 2.559685407982288e-06,
449
+ "loss": 1.0666,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.3310562270099842,
454
+ "grad_norm": 0.933192018274695,
455
  "learning_rate": 2.5400034538993135e-06,
456
+ "loss": 1.0868,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.33631108775617446,
461
+ "grad_norm": 0.9302912982362859,
462
  "learning_rate": 2.519970481247901e-06,
463
+ "loss": 1.0518,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.3415659485023647,
468
+ "grad_norm": 0.8713971326878898,
469
  "learning_rate": 2.4995932514915404e-06,
470
+ "loss": 1.0683,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.3468208092485549,
475
+ "grad_norm": 0.8822615502295114,
476
  "learning_rate": 2.478878642286253e-06,
477
  "loss": 1.0644,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.35207566999474516,
482
+ "grad_norm": 0.8368326296787831,
483
  "learning_rate": 2.4578336451592705e-06,
484
+ "loss": 1.0811,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.35733053074093535,
489
+ "grad_norm": 0.8544515279738967,
490
  "learning_rate": 2.4364653631492774e-06,
491
+ "loss": 1.0623,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.3625853914871256,
496
+ "grad_norm": 0.891511057734901,
497
  "learning_rate": 2.414781008409014e-06,
498
  "loss": 1.0737,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.3678402522333158,
503
+ "grad_norm": 0.9052547289541356,
504
  "learning_rate": 2.3927878997710575e-06,
505
  "loss": 1.0981,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.37309511297950604,
510
+ "grad_norm": 0.8951487470408807,
511
  "learning_rate": 2.3704934602775926e-06,
512
  "loss": 1.0827,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.3783499737256963,
517
+ "grad_norm": 0.9097434855008588,
518
  "learning_rate": 2.347905214675008e-06,
519
  "loss": 1.0713,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.3836048344718865,
524
+ "grad_norm": 0.8583189613291908,
525
  "learning_rate": 2.3250307868741717e-06,
526
+ "loss": 1.0706,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.38885969521807673,
531
+ "grad_norm": 0.8216502318444575,
532
  "learning_rate": 2.3018778973772334e-06,
533
  "loss": 1.0573,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.3941145559642669,
538
+ "grad_norm": 1.0166338065086127,
539
  "learning_rate": 2.278454360671823e-06,
540
+ "loss": 1.0865,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.39936941671045717,
545
+ "grad_norm": 0.8685946998953423,
546
  "learning_rate": 2.2547680825935325e-06,
547
+ "loss": 1.0852,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.4046242774566474,
552
+ "grad_norm": 0.8206385124004496,
553
  "learning_rate": 2.2308270576575657e-06,
554
  "loss": 1.0683,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.4098791382028376,
559
+ "grad_norm": 0.8627531067721118,
560
  "learning_rate": 2.206639366360451e-06,
561
+ "loss": 1.0885,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.41513399894902786,
566
+ "grad_norm": 0.8850142996528242,
567
  "learning_rate": 2.1822131724527425e-06,
568
+ "loss": 1.0588,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.42038885969521805,
573
+ "grad_norm": 0.8294172803163861,
574
  "learning_rate": 2.157556720183616e-06,
575
  "loss": 1.0542,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.4256437204414083,
580
+ "grad_norm": 0.8307870053309333,
581
  "learning_rate": 2.1326783315182984e-06,
582
  "loss": 1.0666,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.43089858118759855,
587
+ "grad_norm": 0.8178946165962943,
588
  "learning_rate": 2.1075864033292623e-06,
589
+ "loss": 1.0725,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.43615344193378874,
594
+ "grad_norm": 0.8236704022077929,
595
  "learning_rate": 2.082289404562144e-06,
596
+ "loss": 1.0769,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.441408302679979,
601
+ "grad_norm": 0.8321193181865734,
602
  "learning_rate": 2.0567958733773313e-06,
603
+ "loss": 1.0613,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.4466631634261692,
608
+ "grad_norm": 0.8122682546804041,
609
  "learning_rate": 2.0311144142681904e-06,
610
+ "loss": 1.0642,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.45191802417235943,
615
+ "grad_norm": 0.8236878035013919,
616
  "learning_rate": 2.005253695156909e-06,
617
  "loss": 1.0472,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.4571728849185497,
622
+ "grad_norm": 0.8493608509140299,
623
  "learning_rate": 1.9792224444689222e-06,
624
+ "loss": 1.0616,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.4624277456647399,
629
+ "grad_norm": 0.842399913675415,
630
  "learning_rate": 1.9530294481869286e-06,
631
+ "loss": 1.0803,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.4676826064109301,
636
+ "grad_norm": 0.8657084634018396,
637
  "learning_rate": 1.926683546885469e-06,
638
  "loss": 1.0588,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.4729374671571203,
643
+ "grad_norm": 0.8242451456404312,
644
  "learning_rate": 1.9001936327470894e-06,
645
+ "loss": 1.0709,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.47819232790331057,
650
+ "grad_norm": 0.8243645563523258,
651
  "learning_rate": 1.873568646561075e-06,
652
+ "loss": 1.0674,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.4834471886495008,
657
+ "grad_norm": 0.8343407072556835,
658
  "learning_rate": 1.8468175747057898e-06,
659
  "loss": 1.0748,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.488702049395691,
664
+ "grad_norm": 0.8704271811597876,
665
  "learning_rate": 1.8199494461156204e-06,
666
+ "loss": 1.0533,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.49395691014188126,
671
+ "grad_norm": 0.869601199009106,
672
  "learning_rate": 1.7929733292335591e-06,
673
  "loss": 1.0733,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.49921177088807145,
678
+ "grad_norm": 0.8251688330803062,
679
  "learning_rate": 1.765898328950455e-06,
680
+ "loss": 1.0651,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.5044666316342616,
685
+ "grad_norm": 0.8182570293593837,
686
  "learning_rate": 1.738733583531959e-06,
687
  "loss": 1.0802,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.509721492380452,
692
+ "grad_norm": 0.861721166977276,
693
  "learning_rate": 1.7114882615342073e-06,
694
  "loss": 1.056,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.5149763531266421,
699
+ "grad_norm": 0.8025950431569188,
700
  "learning_rate": 1.6841715587092798e-06,
701
+ "loss": 1.0785,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.5202312138728323,
706
+ "grad_norm": 0.8462319262578287,
707
  "learning_rate": 1.6567926949014804e-06,
708
  "loss": 1.0745,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.5254860746190226,
713
+ "grad_norm": 0.8532567164974908,
714
  "learning_rate": 1.6293609109354836e-06,
715
+ "loss": 1.0611,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.5307409353652128,
720
+ "grad_norm": 0.8238719502721198,
721
  "learning_rate": 1.601885465497404e-06,
722
+ "loss": 1.0651,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.535995796111403,
727
+ "grad_norm": 0.8290145975340624,
728
  "learning_rate": 1.5743756320098334e-06,
729
  "loss": 1.0643,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.5412506568575933,
734
+ "grad_norm": 0.814187446370962,
735
  "learning_rate": 1.5468406955019059e-06,
736
+ "loss": 1.0571,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.5465055176037835,
741
+ "grad_norm": 0.8640835720468265,
742
  "learning_rate": 1.5192899494754443e-06,
743
  "loss": 1.0731,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.5517603783499737,
748
+ "grad_norm": 0.8096973874507514,
749
  "learning_rate": 1.4917326927682494e-06,
750
+ "loss": 1.0705,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.5570152390961639,
755
+ "grad_norm": 0.8060488155757439,
756
  "learning_rate": 1.4641782264155852e-06,
757
+ "loss": 1.0738,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.5622700998423542,
762
+ "grad_norm": 0.8272465004554679,
763
  "learning_rate": 1.4366358505109237e-06,
764
  "loss": 1.0857,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.5675249605885444,
769
+ "grad_norm": 0.808405985194556,
770
  "learning_rate": 1.4091148610670098e-06,
771
+ "loss": 1.0699,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.5727798213347346,
776
+ "grad_norm": 0.8401268224107731,
777
  "learning_rate": 1.3816245468782988e-06,
778
+ "loss": 1.0601,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.5780346820809249,
783
+ "grad_norm": 0.8212169210412936,
784
  "learning_rate": 1.3541741863858352e-06,
785
+ "loss": 1.0534,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.5832895428271151,
790
+ "grad_norm": 0.909077361069008,
791
  "learning_rate": 1.326773044545621e-06,
792
  "loss": 1.0672,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.5885444035733053,
797
+ "grad_norm": 0.8144193123335469,
798
  "learning_rate": 1.299430369701541e-06,
799
+ "loss": 1.058,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.5937992643194955,
804
+ "grad_norm": 0.8024091331374241,
805
  "learning_rate": 1.272155390463889e-06,
806
  "loss": 1.084,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.5990541250656858,
811
+ "grad_norm": 0.8132457228764496,
812
  "learning_rate": 1.2449573125945607e-06,
813
+ "loss": 1.0677,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.604308985811876,
818
+ "grad_norm": 0.8142277094157844,
819
  "learning_rate": 1.2178453158999509e-06,
820
+ "loss": 1.0706,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.6095638465580662,
825
+ "grad_norm": 0.7999185759745931,
826
  "learning_rate": 1.1908285511326195e-06,
827
+ "loss": 1.056,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.6148187073042565,
832
+ "grad_norm": 0.8042827578821733,
833
  "learning_rate": 1.1639161369027564e-06,
834
+ "loss": 1.0544,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.6200735680504467,
839
+ "grad_norm": 0.7974533954875164,
840
  "learning_rate": 1.1371171566004986e-06,
841
+ "loss": 1.0508,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.6253284287966369,
846
+ "grad_norm": 0.7901099674485127,
847
  "learning_rate": 1.1104406553301357e-06,
848
+ "loss": 1.0636,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.6305832895428272,
853
+ "grad_norm": 0.8076939152475928,
854
  "learning_rate": 1.0838956368572335e-06,
855
  "loss": 1.0526,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.6358381502890174,
860
+ "grad_norm": 0.799554008565015,
861
  "learning_rate": 1.0574910605697135e-06,
862
  "loss": 1.0715,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.6410930110352075,
867
+ "grad_norm": 0.7979721444226595,
868
  "learning_rate": 1.03123583845391e-06,
869
  "loss": 1.0692,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.6463478717813977,
874
+ "grad_norm": 0.8069555984252168,
875
  "learning_rate": 1.0051388320866258e-06,
876
  "loss": 1.0583,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.651602732527588,
881
+ "grad_norm": 0.8547163693801487,
882
  "learning_rate": 9.792088496441992e-07,
883
+ "loss": 1.082,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.6568575932737782,
888
+ "grad_norm": 0.7889726381679567,
889
  "learning_rate": 9.53454642929601e-07,
890
  "loss": 1.0772,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.6621124540199684,
895
+ "grad_norm": 0.8667983764401236,
896
  "learning_rate": 9.278849044185509e-07,
897
+ "loss": 1.0636,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.6673673147661587,
902
+ "grad_norm": 0.8030729729500611,
903
  "learning_rate": 9.025082643256647e-07,
904
+ "loss": 1.0431,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.6726221755123489,
909
+ "grad_norm": 0.9156958348117665,
910
  "learning_rate": 8.77333287691609e-07,
911
+ "loss": 1.0585,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.6778770362585391,
916
+ "grad_norm": 0.8108123796700908,
917
  "learning_rate": 8.523684714922608e-07,
918
  "loss": 1.0742,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.6831318970047294,
923
+ "grad_norm": 0.8023252184856209,
924
  "learning_rate": 8.276222417708309e-07,
925
  "loss": 1.0557,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.6883867577509196,
930
+ "grad_norm": 0.8088733267219154,
931
  "learning_rate": 8.031029507939401e-07,
932
+ "loss": 1.0549,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.6936416184971098,
937
+ "grad_norm": 0.7972317042691273,
938
  "learning_rate": 7.788188742325803e-07,
939
+ "loss": 1.0615,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.6988964792433,
944
+ "grad_norm": 0.8305529275791657,
945
  "learning_rate": 7.547782083689479e-07,
946
  "loss": 1.0643,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.7041513399894903,
951
+ "grad_norm": 0.8071361500941268,
952
  "learning_rate": 7.309890673300506e-07,
953
+ "loss": 1.0452,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.7094062007356805,
958
+ "grad_norm": 0.8100694184976672,
959
  "learning_rate": 7.074594803490618e-07,
960
+ "loss": 1.0517,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.7146610614818707,
965
+ "grad_norm": 0.8505474154250443,
966
  "learning_rate": 6.841973890553168e-07,
967
+ "loss": 1.0612,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.719915922228061,
972
+ "grad_norm": 0.8165179858505918,
973
  "learning_rate": 6.6121064479388e-07,
974
  "loss": 1.0365,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.7251707829742512,
979
+ "grad_norm": 0.8100990471560885,
980
  "learning_rate": 6.385070059755846e-07,
981
+ "loss": 1.0739,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.7304256437204414,
986
+ "grad_norm": 0.8184474658493306,
987
  "learning_rate": 6.160941354584404e-07,
988
+ "loss": 1.0611,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.7356805044666316,
993
+ "grad_norm": 0.8252657802847051,
994
  "learning_rate": 5.93979597961289e-07,
995
  "loss": 1.0505,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.7409353652128219,
1000
+ "grad_norm": 0.7762171975020372,
1001
  "learning_rate": 5.721708575105861e-07,
1002
  "loss": 1.0589,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.7461902259590121,
1007
+ "grad_norm": 0.8192139022265624,
1008
  "learning_rate": 5.506752749211673e-07,
1009
  "loss": 1.0546,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.7514450867052023,
1014
+ "grad_norm": 0.8083286951981082,
1015
  "learning_rate": 5.295001053118499e-07,
1016
+ "loss": 1.0564,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.7566999474513926,
1021
+ "grad_norm": 0.792026256039253,
1022
  "learning_rate": 5.086524956567084e-07,
1023
+ "loss": 1.0714,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.7619548081975828,
1028
+ "grad_norm": 0.7949670797179523,
1029
  "learning_rate": 4.88139482372852e-07,
1030
  "loss": 1.0375,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.767209668943773,
1035
+ "grad_norm": 0.9772289752161802,
1036
  "learning_rate": 4.679679889455153e-07,
1037
+ "loss": 1.0526,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.7724645296899633,
1042
+ "grad_norm": 0.8027717879760866,
1043
  "learning_rate": 4.4814482359126713e-07,
1044
+ "loss": 1.0819,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.7777193904361535,
1049
+ "grad_norm": 0.8018900351097608,
1050
  "learning_rate": 4.2867667696012255e-07,
1051
+ "loss": 1.0627,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.7829742511823437,
1056
+ "grad_norm": 1.2040107178094928,
1057
  "learning_rate": 4.0957011987733655e-07,
1058
+ "loss": 1.0759,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.7882291119285338,
1063
+ "grad_norm": 0.8007696827925082,
1064
  "learning_rate": 3.908316011256419e-07,
1065
  "loss": 1.0744,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.7934839726747241,
1070
+ "grad_norm": 0.7873570653528454,
1071
  "learning_rate": 3.7246744526867525e-07,
1072
+ "loss": 1.0568,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.7987388334209143,
1077
+ "grad_norm": 0.7923946383936664,
1078
  "learning_rate": 3.5448385051633225e-07,
1079
+ "loss": 1.0464,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.8039936941671045,
1084
+ "grad_norm": 0.7863315246660161,
1085
  "learning_rate": 3.368868866327678e-07,
1086
+ "loss": 1.0612,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.8092485549132948,
1091
+ "grad_norm": 1.479921699105945,
1092
  "learning_rate": 3.1968249288774887e-07,
1093
+ "loss": 1.0605,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.814503415659485,
1098
+ "grad_norm": 0.8008655410629794,
1099
  "learning_rate": 3.0287647605205155e-07,
1100
+ "loss": 1.0724,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.8197582764056752,
1105
+ "grad_norm": 0.8062826939041355,
1106
  "learning_rate": 2.86474508437579e-07,
1107
  "loss": 1.0579,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.8250131371518655,
1112
+ "grad_norm": 0.8029021628515398,
1113
  "learning_rate": 2.704821259828608e-07,
1114
  "loss": 1.0684,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.8302679978980557,
1119
+ "grad_norm": 0.8105469057959347,
1120
  "learning_rate": 2.5490472638458195e-07,
1121
+ "loss": 1.0485,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.8355228586442459,
1126
+ "grad_norm": 0.7927228754247995,
1127
  "learning_rate": 2.3974756727576886e-07,
1128
  "loss": 1.0698,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.8407777193904361,
1133
+ "grad_norm": 0.7849174985906052,
1134
  "learning_rate": 2.2501576445125077e-07,
1135
+ "loss": 1.0595,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.8460325801366264,
1140
+ "grad_norm": 0.8186938863488736,
1141
  "learning_rate": 2.1071429014099365e-07,
1142
+ "loss": 1.0631,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.8512874408828166,
1147
+ "grad_norm": 0.7864326637111713,
1148
  "learning_rate": 1.9684797133188865e-07,
1149
+ "loss": 1.0397,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.8565423016290068,
1154
+ "grad_norm": 0.7954693830348463,
1155
  "learning_rate": 1.8342148813856414e-07,
1156
+ "loss": 1.0498,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 0.8617971623751971,
1161
+ "grad_norm": 0.8900900173262289,
1162
  "learning_rate": 1.7043937222376766e-07,
1163
  "loss": 1.0484,
1164
  "step": 820
1165
  },
1166
  {
1167
  "epoch": 0.8670520231213873,
1168
+ "grad_norm": 0.8002588089694325,
1169
  "learning_rate": 1.579060052688548e-07,
1170
+ "loss": 1.0676,
1171
  "step": 825
1172
  },
1173
  {
1174
  "epoch": 0.8723068838675775,
1175
+ "grad_norm": 0.7947929611802326,
1176
  "learning_rate": 1.4582561749489847e-07,
1177
  "loss": 1.0658,
1178
  "step": 830
1179
  },
1180
  {
1181
  "epoch": 0.8775617446137677,
1182
+ "grad_norm": 0.8288459681225425,
1183
  "learning_rate": 1.3420228623491742e-07,
1184
+ "loss": 1.034,
1185
  "step": 835
1186
  },
1187
  {
1188
  "epoch": 0.882816605359958,
1189
+ "grad_norm": 0.8346010909012226,
1190
  "learning_rate": 1.2303993455770946e-07,
1191
+ "loss": 1.0679,
1192
  "step": 840
1193
  },
1194
  {
1195
  "epoch": 0.8880714661061482,
1196
+ "grad_norm": 0.8436829522982748,
1197
  "learning_rate": 1.1234232994374916e-07,
1198
+ "loss": 1.0561,
1199
  "step": 845
1200
  },
1201
  {
1202
  "epoch": 0.8933263268523384,
1203
+ "grad_norm": 0.8025541491307114,
1204
  "learning_rate": 1.0211308301360039e-07,
1205
  "loss": 1.0635,
1206
  "step": 850
1207
  },
1208
  {
1209
  "epoch": 0.8985811875985287,
1210
+ "grad_norm": 0.8305527668792659,
1211
  "learning_rate": 9.235564630927196e-08,
1212
+ "loss": 1.0499,
1213
  "step": 855
1214
  },
1215
  {
1216
  "epoch": 0.9038360483447189,
1217
+ "grad_norm": 0.798582022149453,
1218
  "learning_rate": 8.307331312892601e-08,
1219
  "loss": 1.0573,
1220
  "step": 860
1221
  },
1222
  {
1223
  "epoch": 0.9090909090909091,
1224
+ "grad_norm": 0.7805401930107103,
1225
  "learning_rate": 7.426921641533562e-08,
1226
+ "loss": 1.0536,
1227
  "step": 865
1228
  },
1229
  {
1230
  "epoch": 0.9143457698370994,
1231
+ "grad_norm": 0.7779984836642342,
1232
  "learning_rate": 6.594632769846354e-08,
1233
  "loss": 1.0605,
1234
  "step": 870
1235
  },
1236
  {
1237
  "epoch": 0.9196006305832896,
1238
+ "grad_norm": 0.7849919719549066,
1239
  "learning_rate": 5.810745609252166e-08,
1240
+ "loss": 1.048,
1241
  "step": 875
1242
  },
1243
  {
1244
  "epoch": 0.9248554913294798,
1245
+ "grad_norm": 0.7974346377505517,
1246
  "learning_rate": 5.0755247347847814e-08,
1247
+ "loss": 1.0503,
1248
  "step": 880
1249
  },
1250
  {
1251
  "epoch": 0.9301103520756699,
1252
+ "grad_norm": 0.8239903131727321,
1253
  "learning_rate": 4.389218295792002e-08,
1254
+ "loss": 1.065,
1255
  "step": 885
1256
  },
1257
  {
1258
  "epoch": 0.9353652128218602,
1259
+ "grad_norm": 0.782976096757697,
1260
  "learning_rate": 3.7520579321812186e-08,
1261
+ "loss": 1.0731,
1262
  "step": 890
1263
  },
1264
  {
1265
  "epoch": 0.9406200735680504,
1266
+ "grad_norm": 0.7800809248031729,
1267
  "learning_rate": 3.1642586962369765e-08,
1268
  "loss": 1.0363,
1269
  "step": 895
1270
  },
1271
  {
1272
  "epoch": 0.9458749343142406,
1273
+ "grad_norm": 0.7848631462241675,
1274
  "learning_rate": 2.6260189800372757e-08,
1275
  "loss": 1.0517,
1276
  "step": 900
1277
  },
1278
  {
1279
  "epoch": 0.9511297950604309,
1280
+ "grad_norm": 0.7779665883276563,
1281
  "learning_rate": 2.13752044849288e-08,
1282
+ "loss": 1.0626,
1283
  "step": 905
1284
  },
1285
  {
1286
  "epoch": 0.9563846558066211,
1287
+ "grad_norm": 0.7949647408081179,
1288
  "learning_rate": 1.698927978032383e-08,
1289
+ "loss": 1.0485,
1290
  "step": 910
1291
  },
1292
  {
1293
  "epoch": 0.9616395165528113,
1294
+ "grad_norm": 0.7920892473556865,
1295
  "learning_rate": 1.3103896009537208e-08,
1296
  "loss": 1.0476,
1297
  "step": 915
1298
  },
1299
  {
1300
  "epoch": 0.9668943772990016,
1301
+ "grad_norm": 0.7894649006593766,
1302
  "learning_rate": 9.720364554606898e-09,
1303
+ "loss": 1.0554,
1304
  "step": 920
1305
  },
1306
  {
1307
  "epoch": 0.9721492380451918,
1308
+ "grad_norm": 0.7926655176731906,
1309
  "learning_rate": 6.839827414016675e-09,
1310
  "loss": 1.0636,
1311
  "step": 925
1312
  },
1313
  {
1314
  "epoch": 0.977404098791382,
1315
+ "grad_norm": 0.7738940217826289,
1316
  "learning_rate": 4.463256817252792e-09,
1317
  "loss": 1.0626,
1318
  "step": 930
1319
  },
1320
  {
1321
  "epoch": 0.9826589595375722,
1322
+ "grad_norm": 0.7796303093908586,
1323
  "learning_rate": 2.5914548966596285e-09,
1324
+ "loss": 1.0583,
1325
  "step": 935
1326
  },
1327
  {
1328
  "epoch": 0.9879138202837625,
1329
+ "grad_norm": 0.7880712824013714,
1330
  "learning_rate": 1.2250534167067561e-09,
1331
+ "loss": 1.0563,
1332
  "step": 940
1333
  },
1334
  {
1335
  "epoch": 0.9931686810299527,
1336
+ "grad_norm": 0.7995046619036974,
1337
  "learning_rate": 3.6451356075817287e-10,
1338
+ "loss": 1.0511,
1339
  "step": 945
1340
  },
1341
  {
1342
  "epoch": 0.9984235417761429,
1343
+ "grad_norm": 0.7975204845224678,
1344
  "learning_rate": 1.0125775414981941e-11,
1345
+ "loss": 1.0962,
1346
  "step": 950
1347
  },
1348
  {
1349
  "epoch": 0.999474513925381,
1350
+ "eval_loss": 1.0638784170150757,
1351
+ "eval_runtime": 594.284,
1352
+ "eval_samples_per_second": 22.674,
1353
+ "eval_steps_per_second": 0.71,
1354
  "step": 951
1355
  },
1356
  {
1357
  "epoch": 0.999474513925381,
1358
  "step": 951,
1359
  "total_flos": 905758069751808.0,
1360
+ "train_loss": 1.102385032064155,
1361
+ "train_runtime": 21864.5552,
1362
+ "train_samples_per_second": 5.57,
1363
+ "train_steps_per_second": 0.043
1364
  }
1365
  ],
1366
  "logging_steps": 5,