satvik-dixit commited on
Commit
c37a6ac
1 Parent(s): 7e9a33f

Uploaded checkpoint-30000

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1763 -5
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ad86e91f2924a189dc19b796deef58a4c1f44b9596040dfbed596e3a58a58a4
3
  size 119975656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b525f711ea01af14a1a4109db33040d95091e1f5f4dccf0c07f874c01bfc2878
3
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6dcec2fc7bddf9ccd947d61cfbb7ec0d09e233cfdb81b4d8f00e3043b60ec27
3
  size 240145026
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5e41106e4c714a3b3dad5809e17e3c5394f938215aaa89f9f5521f67be46e41
3
  size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:971f8c6b700d32d9d1711207ade77f4dca9cda1be000e561bca9b74000ac50f5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4922187d49de24e3ef28c2598a3510732f5f461011533915c69243f836690e1
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bae572518ab53ddc674f52a5ef01613875bea64a8d9c53d4b7d4a9aedc712f19
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b36093e06845c6146f3175c64f0e8bdb441d4f7fc67a6962ed0b80b6725daf1
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.2961933612823486,
3
- "best_model_checkpoint": "runs/deepseek_lora_20240422-141601/checkpoint-25000",
4
- "epoch": 0.6875,
5
  "eval_steps": 2500,
6
- "global_step": 27500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -19345,6 +19345,1764 @@
19345
  "eval_samples_per_second": 8.171,
19346
  "eval_steps_per_second": 8.171,
19347
  "step": 27500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19348
  }
19349
  ],
19350
  "logging_steps": 10,
@@ -19352,7 +21110,7 @@
19352
  "num_input_tokens_seen": 0,
19353
  "num_train_epochs": 1,
19354
  "save_steps": 2500,
19355
- "total_flos": 4.4280846483456e+17,
19356
  "train_batch_size": 1,
19357
  "trial_name": null,
19358
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.2914531230926514,
3
+ "best_model_checkpoint": "runs/deepseek_lora_20240422-141601/checkpoint-30000",
4
+ "epoch": 0.75,
5
  "eval_steps": 2500,
6
+ "global_step": 30000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
19345
  "eval_samples_per_second": 8.171,
19346
  "eval_steps_per_second": 8.171,
19347
  "step": 27500
19348
+ },
19349
+ {
19350
+ "epoch": 0.69,
19351
+ "grad_norm": 4.339414119720459,
19352
+ "learning_rate": 1.6881355932203391e-06,
19353
+ "loss": 1.3429,
19354
+ "step": 27510
19355
+ },
19356
+ {
19357
+ "epoch": 0.69,
19358
+ "grad_norm": 2.123643159866333,
19359
+ "learning_rate": 1.6813559322033901e-06,
19360
+ "loss": 1.2949,
19361
+ "step": 27520
19362
+ },
19363
+ {
19364
+ "epoch": 0.69,
19365
+ "grad_norm": 6.727291584014893,
19366
+ "learning_rate": 1.6745762711864409e-06,
19367
+ "loss": 1.4269,
19368
+ "step": 27530
19369
+ },
19370
+ {
19371
+ "epoch": 0.69,
19372
+ "grad_norm": 5.464277744293213,
19373
+ "learning_rate": 1.6677966101694916e-06,
19374
+ "loss": 1.3967,
19375
+ "step": 27540
19376
+ },
19377
+ {
19378
+ "epoch": 0.69,
19379
+ "grad_norm": 3.3927230834960938,
19380
+ "learning_rate": 1.6610169491525424e-06,
19381
+ "loss": 1.2701,
19382
+ "step": 27550
19383
+ },
19384
+ {
19385
+ "epoch": 0.69,
19386
+ "grad_norm": 6.092131614685059,
19387
+ "learning_rate": 1.6542372881355934e-06,
19388
+ "loss": 1.1989,
19389
+ "step": 27560
19390
+ },
19391
+ {
19392
+ "epoch": 0.69,
19393
+ "grad_norm": 4.336033821105957,
19394
+ "learning_rate": 1.6474576271186444e-06,
19395
+ "loss": 1.2679,
19396
+ "step": 27570
19397
+ },
19398
+ {
19399
+ "epoch": 0.69,
19400
+ "grad_norm": 3.4040894508361816,
19401
+ "learning_rate": 1.640677966101695e-06,
19402
+ "loss": 1.4791,
19403
+ "step": 27580
19404
+ },
19405
+ {
19406
+ "epoch": 0.69,
19407
+ "grad_norm": 5.1817626953125,
19408
+ "learning_rate": 1.6338983050847459e-06,
19409
+ "loss": 1.3513,
19410
+ "step": 27590
19411
+ },
19412
+ {
19413
+ "epoch": 0.69,
19414
+ "grad_norm": 14.48134708404541,
19415
+ "learning_rate": 1.6271186440677967e-06,
19416
+ "loss": 1.3058,
19417
+ "step": 27600
19418
+ },
19419
+ {
19420
+ "epoch": 0.69,
19421
+ "grad_norm": 6.538887977600098,
19422
+ "learning_rate": 1.6203389830508476e-06,
19423
+ "loss": 1.3431,
19424
+ "step": 27610
19425
+ },
19426
+ {
19427
+ "epoch": 0.69,
19428
+ "grad_norm": 7.871098518371582,
19429
+ "learning_rate": 1.6135593220338986e-06,
19430
+ "loss": 1.3213,
19431
+ "step": 27620
19432
+ },
19433
+ {
19434
+ "epoch": 0.69,
19435
+ "grad_norm": 6.496534824371338,
19436
+ "learning_rate": 1.6067796610169492e-06,
19437
+ "loss": 1.4389,
19438
+ "step": 27630
19439
+ },
19440
+ {
19441
+ "epoch": 0.69,
19442
+ "grad_norm": 6.736118793487549,
19443
+ "learning_rate": 1.6000000000000001e-06,
19444
+ "loss": 1.3323,
19445
+ "step": 27640
19446
+ },
19447
+ {
19448
+ "epoch": 0.69,
19449
+ "grad_norm": 13.042952537536621,
19450
+ "learning_rate": 1.593220338983051e-06,
19451
+ "loss": 1.1813,
19452
+ "step": 27650
19453
+ },
19454
+ {
19455
+ "epoch": 0.69,
19456
+ "grad_norm": 2.258842706680298,
19457
+ "learning_rate": 1.5864406779661019e-06,
19458
+ "loss": 1.3224,
19459
+ "step": 27660
19460
+ },
19461
+ {
19462
+ "epoch": 0.69,
19463
+ "grad_norm": 6.339553356170654,
19464
+ "learning_rate": 1.5796610169491526e-06,
19465
+ "loss": 1.3433,
19466
+ "step": 27670
19467
+ },
19468
+ {
19469
+ "epoch": 0.69,
19470
+ "grad_norm": 7.741724491119385,
19471
+ "learning_rate": 1.5728813559322034e-06,
19472
+ "loss": 1.3168,
19473
+ "step": 27680
19474
+ },
19475
+ {
19476
+ "epoch": 0.69,
19477
+ "grad_norm": 16.320072174072266,
19478
+ "learning_rate": 1.5661016949152544e-06,
19479
+ "loss": 1.2347,
19480
+ "step": 27690
19481
+ },
19482
+ {
19483
+ "epoch": 0.69,
19484
+ "grad_norm": 5.457180023193359,
19485
+ "learning_rate": 1.5593220338983054e-06,
19486
+ "loss": 1.2835,
19487
+ "step": 27700
19488
+ },
19489
+ {
19490
+ "epoch": 0.69,
19491
+ "grad_norm": 5.750472068786621,
19492
+ "learning_rate": 1.552542372881356e-06,
19493
+ "loss": 1.2346,
19494
+ "step": 27710
19495
+ },
19496
+ {
19497
+ "epoch": 0.69,
19498
+ "grad_norm": 1.067400574684143,
19499
+ "learning_rate": 1.545762711864407e-06,
19500
+ "loss": 1.26,
19501
+ "step": 27720
19502
+ },
19503
+ {
19504
+ "epoch": 0.69,
19505
+ "grad_norm": 14.453791618347168,
19506
+ "learning_rate": 1.5389830508474577e-06,
19507
+ "loss": 1.1674,
19508
+ "step": 27730
19509
+ },
19510
+ {
19511
+ "epoch": 0.69,
19512
+ "grad_norm": 9.273608207702637,
19513
+ "learning_rate": 1.5322033898305086e-06,
19514
+ "loss": 1.2715,
19515
+ "step": 27740
19516
+ },
19517
+ {
19518
+ "epoch": 0.69,
19519
+ "grad_norm": 2.4199931621551514,
19520
+ "learning_rate": 1.5254237288135596e-06,
19521
+ "loss": 1.3738,
19522
+ "step": 27750
19523
+ },
19524
+ {
19525
+ "epoch": 0.69,
19526
+ "grad_norm": 11.302939414978027,
19527
+ "learning_rate": 1.5186440677966102e-06,
19528
+ "loss": 1.2809,
19529
+ "step": 27760
19530
+ },
19531
+ {
19532
+ "epoch": 0.69,
19533
+ "grad_norm": 9.895012855529785,
19534
+ "learning_rate": 1.5118644067796611e-06,
19535
+ "loss": 1.2577,
19536
+ "step": 27770
19537
+ },
19538
+ {
19539
+ "epoch": 0.69,
19540
+ "grad_norm": 9.7351655960083,
19541
+ "learning_rate": 1.505084745762712e-06,
19542
+ "loss": 1.2743,
19543
+ "step": 27780
19544
+ },
19545
+ {
19546
+ "epoch": 0.69,
19547
+ "grad_norm": 3.2647464275360107,
19548
+ "learning_rate": 1.4983050847457629e-06,
19549
+ "loss": 1.32,
19550
+ "step": 27790
19551
+ },
19552
+ {
19553
+ "epoch": 0.69,
19554
+ "grad_norm": 9.726398468017578,
19555
+ "learning_rate": 1.4915254237288139e-06,
19556
+ "loss": 1.3836,
19557
+ "step": 27800
19558
+ },
19559
+ {
19560
+ "epoch": 0.7,
19561
+ "grad_norm": 7.257278919219971,
19562
+ "learning_rate": 1.4847457627118644e-06,
19563
+ "loss": 1.4125,
19564
+ "step": 27810
19565
+ },
19566
+ {
19567
+ "epoch": 0.7,
19568
+ "grad_norm": 10.737898826599121,
19569
+ "learning_rate": 1.4779661016949154e-06,
19570
+ "loss": 1.3774,
19571
+ "step": 27820
19572
+ },
19573
+ {
19574
+ "epoch": 0.7,
19575
+ "grad_norm": 10.912455558776855,
19576
+ "learning_rate": 1.4711864406779664e-06,
19577
+ "loss": 1.302,
19578
+ "step": 27830
19579
+ },
19580
+ {
19581
+ "epoch": 0.7,
19582
+ "grad_norm": 4.399783611297607,
19583
+ "learning_rate": 1.464406779661017e-06,
19584
+ "loss": 1.2622,
19585
+ "step": 27840
19586
+ },
19587
+ {
19588
+ "epoch": 0.7,
19589
+ "grad_norm": 14.4951171875,
19590
+ "learning_rate": 1.457627118644068e-06,
19591
+ "loss": 1.415,
19592
+ "step": 27850
19593
+ },
19594
+ {
19595
+ "epoch": 0.7,
19596
+ "grad_norm": 5.326625347137451,
19597
+ "learning_rate": 1.4508474576271187e-06,
19598
+ "loss": 1.2882,
19599
+ "step": 27860
19600
+ },
19601
+ {
19602
+ "epoch": 0.7,
19603
+ "grad_norm": 6.698885917663574,
19604
+ "learning_rate": 1.4440677966101696e-06,
19605
+ "loss": 1.3461,
19606
+ "step": 27870
19607
+ },
19608
+ {
19609
+ "epoch": 0.7,
19610
+ "grad_norm": 2.7243850231170654,
19611
+ "learning_rate": 1.4372881355932206e-06,
19612
+ "loss": 1.4233,
19613
+ "step": 27880
19614
+ },
19615
+ {
19616
+ "epoch": 0.7,
19617
+ "grad_norm": 2.273730516433716,
19618
+ "learning_rate": 1.4305084745762712e-06,
19619
+ "loss": 1.2245,
19620
+ "step": 27890
19621
+ },
19622
+ {
19623
+ "epoch": 0.7,
19624
+ "grad_norm": 3.1223866939544678,
19625
+ "learning_rate": 1.4237288135593222e-06,
19626
+ "loss": 1.3181,
19627
+ "step": 27900
19628
+ },
19629
+ {
19630
+ "epoch": 0.7,
19631
+ "grad_norm": 8.419058799743652,
19632
+ "learning_rate": 1.416949152542373e-06,
19633
+ "loss": 1.2782,
19634
+ "step": 27910
19635
+ },
19636
+ {
19637
+ "epoch": 0.7,
19638
+ "grad_norm": 4.810690879821777,
19639
+ "learning_rate": 1.410169491525424e-06,
19640
+ "loss": 1.2527,
19641
+ "step": 27920
19642
+ },
19643
+ {
19644
+ "epoch": 0.7,
19645
+ "grad_norm": 2.7679851055145264,
19646
+ "learning_rate": 1.4033898305084749e-06,
19647
+ "loss": 1.1995,
19648
+ "step": 27930
19649
+ },
19650
+ {
19651
+ "epoch": 0.7,
19652
+ "grad_norm": 1.7833703756332397,
19653
+ "learning_rate": 1.3966101694915254e-06,
19654
+ "loss": 1.3988,
19655
+ "step": 27940
19656
+ },
19657
+ {
19658
+ "epoch": 0.7,
19659
+ "grad_norm": 8.348339080810547,
19660
+ "learning_rate": 1.3898305084745764e-06,
19661
+ "loss": 1.2983,
19662
+ "step": 27950
19663
+ },
19664
+ {
19665
+ "epoch": 0.7,
19666
+ "grad_norm": 8.304106712341309,
19667
+ "learning_rate": 1.3830508474576274e-06,
19668
+ "loss": 1.2926,
19669
+ "step": 27960
19670
+ },
19671
+ {
19672
+ "epoch": 0.7,
19673
+ "grad_norm": 5.610420227050781,
19674
+ "learning_rate": 1.376271186440678e-06,
19675
+ "loss": 1.2064,
19676
+ "step": 27970
19677
+ },
19678
+ {
19679
+ "epoch": 0.7,
19680
+ "grad_norm": 13.0822114944458,
19681
+ "learning_rate": 1.369491525423729e-06,
19682
+ "loss": 1.3171,
19683
+ "step": 27980
19684
+ },
19685
+ {
19686
+ "epoch": 0.7,
19687
+ "grad_norm": 3.2502501010894775,
19688
+ "learning_rate": 1.3627118644067797e-06,
19689
+ "loss": 1.2354,
19690
+ "step": 27990
19691
+ },
19692
+ {
19693
+ "epoch": 0.7,
19694
+ "grad_norm": 19.23220443725586,
19695
+ "learning_rate": 1.3559322033898307e-06,
19696
+ "loss": 1.3304,
19697
+ "step": 28000
19698
+ },
19699
+ {
19700
+ "epoch": 0.7,
19701
+ "grad_norm": 5.642111778259277,
19702
+ "learning_rate": 1.3491525423728816e-06,
19703
+ "loss": 1.2085,
19704
+ "step": 28010
19705
+ },
19706
+ {
19707
+ "epoch": 0.7,
19708
+ "grad_norm": 5.464583396911621,
19709
+ "learning_rate": 1.3423728813559322e-06,
19710
+ "loss": 1.3588,
19711
+ "step": 28020
19712
+ },
19713
+ {
19714
+ "epoch": 0.7,
19715
+ "grad_norm": 2.51157808303833,
19716
+ "learning_rate": 1.3355932203389832e-06,
19717
+ "loss": 1.1803,
19718
+ "step": 28030
19719
+ },
19720
+ {
19721
+ "epoch": 0.7,
19722
+ "grad_norm": 10.629277229309082,
19723
+ "learning_rate": 1.328813559322034e-06,
19724
+ "loss": 1.3943,
19725
+ "step": 28040
19726
+ },
19727
+ {
19728
+ "epoch": 0.7,
19729
+ "grad_norm": 10.221532821655273,
19730
+ "learning_rate": 1.322033898305085e-06,
19731
+ "loss": 1.4424,
19732
+ "step": 28050
19733
+ },
19734
+ {
19735
+ "epoch": 0.7,
19736
+ "grad_norm": 1.1695493459701538,
19737
+ "learning_rate": 1.3152542372881359e-06,
19738
+ "loss": 1.3071,
19739
+ "step": 28060
19740
+ },
19741
+ {
19742
+ "epoch": 0.7,
19743
+ "grad_norm": 4.1398539543151855,
19744
+ "learning_rate": 1.3084745762711864e-06,
19745
+ "loss": 1.2239,
19746
+ "step": 28070
19747
+ },
19748
+ {
19749
+ "epoch": 0.7,
19750
+ "grad_norm": 20.721601486206055,
19751
+ "learning_rate": 1.3016949152542374e-06,
19752
+ "loss": 1.3761,
19753
+ "step": 28080
19754
+ },
19755
+ {
19756
+ "epoch": 0.7,
19757
+ "grad_norm": 8.006892204284668,
19758
+ "learning_rate": 1.2949152542372884e-06,
19759
+ "loss": 1.2867,
19760
+ "step": 28090
19761
+ },
19762
+ {
19763
+ "epoch": 0.7,
19764
+ "grad_norm": 4.007428169250488,
19765
+ "learning_rate": 1.288135593220339e-06,
19766
+ "loss": 1.2511,
19767
+ "step": 28100
19768
+ },
19769
+ {
19770
+ "epoch": 0.7,
19771
+ "grad_norm": 8.041426658630371,
19772
+ "learning_rate": 1.28135593220339e-06,
19773
+ "loss": 1.3597,
19774
+ "step": 28110
19775
+ },
19776
+ {
19777
+ "epoch": 0.7,
19778
+ "grad_norm": 4.553910732269287,
19779
+ "learning_rate": 1.2745762711864407e-06,
19780
+ "loss": 1.4066,
19781
+ "step": 28120
19782
+ },
19783
+ {
19784
+ "epoch": 0.7,
19785
+ "grad_norm": 8.02763843536377,
19786
+ "learning_rate": 1.2677966101694917e-06,
19787
+ "loss": 1.1639,
19788
+ "step": 28130
19789
+ },
19790
+ {
19791
+ "epoch": 0.7,
19792
+ "grad_norm": 9.010587692260742,
19793
+ "learning_rate": 1.2610169491525426e-06,
19794
+ "loss": 1.5038,
19795
+ "step": 28140
19796
+ },
19797
+ {
19798
+ "epoch": 0.7,
19799
+ "grad_norm": 4.593667984008789,
19800
+ "learning_rate": 1.2542372881355932e-06,
19801
+ "loss": 1.4871,
19802
+ "step": 28150
19803
+ },
19804
+ {
19805
+ "epoch": 0.7,
19806
+ "grad_norm": 6.7055158615112305,
19807
+ "learning_rate": 1.2474576271186442e-06,
19808
+ "loss": 1.2533,
19809
+ "step": 28160
19810
+ },
19811
+ {
19812
+ "epoch": 0.7,
19813
+ "grad_norm": 13.491644859313965,
19814
+ "learning_rate": 1.240677966101695e-06,
19815
+ "loss": 1.2957,
19816
+ "step": 28170
19817
+ },
19818
+ {
19819
+ "epoch": 0.7,
19820
+ "grad_norm": 5.513734340667725,
19821
+ "learning_rate": 1.233898305084746e-06,
19822
+ "loss": 1.2898,
19823
+ "step": 28180
19824
+ },
19825
+ {
19826
+ "epoch": 0.7,
19827
+ "grad_norm": 11.136821746826172,
19828
+ "learning_rate": 1.2271186440677967e-06,
19829
+ "loss": 1.4257,
19830
+ "step": 28190
19831
+ },
19832
+ {
19833
+ "epoch": 0.7,
19834
+ "grad_norm": 4.229417324066162,
19835
+ "learning_rate": 1.2203389830508477e-06,
19836
+ "loss": 1.322,
19837
+ "step": 28200
19838
+ },
19839
+ {
19840
+ "epoch": 0.71,
19841
+ "grad_norm": 4.852339267730713,
19842
+ "learning_rate": 1.2135593220338984e-06,
19843
+ "loss": 1.3371,
19844
+ "step": 28210
19845
+ },
19846
+ {
19847
+ "epoch": 0.71,
19848
+ "grad_norm": 9.924147605895996,
19849
+ "learning_rate": 1.2067796610169492e-06,
19850
+ "loss": 1.1373,
19851
+ "step": 28220
19852
+ },
19853
+ {
19854
+ "epoch": 0.71,
19855
+ "grad_norm": 3.660102128982544,
19856
+ "learning_rate": 1.2000000000000002e-06,
19857
+ "loss": 1.253,
19858
+ "step": 28230
19859
+ },
19860
+ {
19861
+ "epoch": 0.71,
19862
+ "grad_norm": 5.641490936279297,
19863
+ "learning_rate": 1.193220338983051e-06,
19864
+ "loss": 1.3156,
19865
+ "step": 28240
19866
+ },
19867
+ {
19868
+ "epoch": 0.71,
19869
+ "grad_norm": 7.371779441833496,
19870
+ "learning_rate": 1.186440677966102e-06,
19871
+ "loss": 1.3273,
19872
+ "step": 28250
19873
+ },
19874
+ {
19875
+ "epoch": 0.71,
19876
+ "grad_norm": 11.060622215270996,
19877
+ "learning_rate": 1.1796610169491527e-06,
19878
+ "loss": 1.2729,
19879
+ "step": 28260
19880
+ },
19881
+ {
19882
+ "epoch": 0.71,
19883
+ "grad_norm": 2.1023409366607666,
19884
+ "learning_rate": 1.1728813559322034e-06,
19885
+ "loss": 1.4814,
19886
+ "step": 28270
19887
+ },
19888
+ {
19889
+ "epoch": 0.71,
19890
+ "grad_norm": 2.1587941646575928,
19891
+ "learning_rate": 1.1661016949152542e-06,
19892
+ "loss": 1.4983,
19893
+ "step": 28280
19894
+ },
19895
+ {
19896
+ "epoch": 0.71,
19897
+ "grad_norm": 10.272698402404785,
19898
+ "learning_rate": 1.1593220338983052e-06,
19899
+ "loss": 1.2093,
19900
+ "step": 28290
19901
+ },
19902
+ {
19903
+ "epoch": 0.71,
19904
+ "grad_norm": 3.677959442138672,
19905
+ "learning_rate": 1.152542372881356e-06,
19906
+ "loss": 1.1745,
19907
+ "step": 28300
19908
+ },
19909
+ {
19910
+ "epoch": 0.71,
19911
+ "grad_norm": 2.187302350997925,
19912
+ "learning_rate": 1.145762711864407e-06,
19913
+ "loss": 1.3257,
19914
+ "step": 28310
19915
+ },
19916
+ {
19917
+ "epoch": 0.71,
19918
+ "grad_norm": 5.065845489501953,
19919
+ "learning_rate": 1.1389830508474577e-06,
19920
+ "loss": 1.3038,
19921
+ "step": 28320
19922
+ },
19923
+ {
19924
+ "epoch": 0.71,
19925
+ "grad_norm": 6.725832462310791,
19926
+ "learning_rate": 1.1322033898305087e-06,
19927
+ "loss": 1.2525,
19928
+ "step": 28330
19929
+ },
19930
+ {
19931
+ "epoch": 0.71,
19932
+ "grad_norm": 13.034323692321777,
19933
+ "learning_rate": 1.1254237288135594e-06,
19934
+ "loss": 1.3096,
19935
+ "step": 28340
19936
+ },
19937
+ {
19938
+ "epoch": 0.71,
19939
+ "grad_norm": 12.125313758850098,
19940
+ "learning_rate": 1.1186440677966102e-06,
19941
+ "loss": 1.1823,
19942
+ "step": 28350
19943
+ },
19944
+ {
19945
+ "epoch": 0.71,
19946
+ "grad_norm": 12.07535457611084,
19947
+ "learning_rate": 1.1118644067796612e-06,
19948
+ "loss": 1.2526,
19949
+ "step": 28360
19950
+ },
19951
+ {
19952
+ "epoch": 0.71,
19953
+ "grad_norm": 6.179876804351807,
19954
+ "learning_rate": 1.105084745762712e-06,
19955
+ "loss": 1.3062,
19956
+ "step": 28370
19957
+ },
19958
+ {
19959
+ "epoch": 0.71,
19960
+ "grad_norm": 2.814276933670044,
19961
+ "learning_rate": 1.098305084745763e-06,
19962
+ "loss": 1.2911,
19963
+ "step": 28380
19964
+ },
19965
+ {
19966
+ "epoch": 0.71,
19967
+ "grad_norm": 14.710918426513672,
19968
+ "learning_rate": 1.0915254237288137e-06,
19969
+ "loss": 1.1594,
19970
+ "step": 28390
19971
+ },
19972
+ {
19973
+ "epoch": 0.71,
19974
+ "grad_norm": 12.056600570678711,
19975
+ "learning_rate": 1.0847457627118644e-06,
19976
+ "loss": 1.2203,
19977
+ "step": 28400
19978
+ },
19979
+ {
19980
+ "epoch": 0.71,
19981
+ "grad_norm": 9.935985565185547,
19982
+ "learning_rate": 1.0779661016949152e-06,
19983
+ "loss": 1.4101,
19984
+ "step": 28410
19985
+ },
19986
+ {
19987
+ "epoch": 0.71,
19988
+ "grad_norm": 15.330471992492676,
19989
+ "learning_rate": 1.0711864406779662e-06,
19990
+ "loss": 1.1429,
19991
+ "step": 28420
19992
+ },
19993
+ {
19994
+ "epoch": 0.71,
19995
+ "grad_norm": 2.6031837463378906,
19996
+ "learning_rate": 1.064406779661017e-06,
19997
+ "loss": 1.336,
19998
+ "step": 28430
19999
+ },
20000
+ {
20001
+ "epoch": 0.71,
20002
+ "grad_norm": 10.921528816223145,
20003
+ "learning_rate": 1.057627118644068e-06,
20004
+ "loss": 1.3067,
20005
+ "step": 28440
20006
+ },
20007
+ {
20008
+ "epoch": 0.71,
20009
+ "grad_norm": 10.396923065185547,
20010
+ "learning_rate": 1.0508474576271187e-06,
20011
+ "loss": 1.3811,
20012
+ "step": 28450
20013
+ },
20014
+ {
20015
+ "epoch": 0.71,
20016
+ "grad_norm": 10.552454948425293,
20017
+ "learning_rate": 1.0440677966101697e-06,
20018
+ "loss": 1.3656,
20019
+ "step": 28460
20020
+ },
20021
+ {
20022
+ "epoch": 0.71,
20023
+ "grad_norm": 11.751737594604492,
20024
+ "learning_rate": 1.0372881355932204e-06,
20025
+ "loss": 1.2437,
20026
+ "step": 28470
20027
+ },
20028
+ {
20029
+ "epoch": 0.71,
20030
+ "grad_norm": 3.7091989517211914,
20031
+ "learning_rate": 1.0305084745762712e-06,
20032
+ "loss": 1.3765,
20033
+ "step": 28480
20034
+ },
20035
+ {
20036
+ "epoch": 0.71,
20037
+ "grad_norm": 5.946925163269043,
20038
+ "learning_rate": 1.0237288135593222e-06,
20039
+ "loss": 1.4501,
20040
+ "step": 28490
20041
+ },
20042
+ {
20043
+ "epoch": 0.71,
20044
+ "grad_norm": 10.043877601623535,
20045
+ "learning_rate": 1.016949152542373e-06,
20046
+ "loss": 1.3166,
20047
+ "step": 28500
20048
+ },
20049
+ {
20050
+ "epoch": 0.71,
20051
+ "grad_norm": 12.921855926513672,
20052
+ "learning_rate": 1.010169491525424e-06,
20053
+ "loss": 1.2181,
20054
+ "step": 28510
20055
+ },
20056
+ {
20057
+ "epoch": 0.71,
20058
+ "grad_norm": 8.31167984008789,
20059
+ "learning_rate": 1.0033898305084747e-06,
20060
+ "loss": 1.1729,
20061
+ "step": 28520
20062
+ },
20063
+ {
20064
+ "epoch": 0.71,
20065
+ "grad_norm": 13.391186714172363,
20066
+ "learning_rate": 9.966101694915254e-07,
20067
+ "loss": 1.4122,
20068
+ "step": 28530
20069
+ },
20070
+ {
20071
+ "epoch": 0.71,
20072
+ "grad_norm": 3.428011178970337,
20073
+ "learning_rate": 9.898305084745762e-07,
20074
+ "loss": 1.4873,
20075
+ "step": 28540
20076
+ },
20077
+ {
20078
+ "epoch": 0.71,
20079
+ "grad_norm": 5.871237754821777,
20080
+ "learning_rate": 9.830508474576272e-07,
20081
+ "loss": 1.5205,
20082
+ "step": 28550
20083
+ },
20084
+ {
20085
+ "epoch": 0.71,
20086
+ "grad_norm": 11.823165893554688,
20087
+ "learning_rate": 9.762711864406782e-07,
20088
+ "loss": 1.1785,
20089
+ "step": 28560
20090
+ },
20091
+ {
20092
+ "epoch": 0.71,
20093
+ "grad_norm": 13.24584674835205,
20094
+ "learning_rate": 9.69491525423729e-07,
20095
+ "loss": 1.3375,
20096
+ "step": 28570
20097
+ },
20098
+ {
20099
+ "epoch": 0.71,
20100
+ "grad_norm": 4.6865386962890625,
20101
+ "learning_rate": 9.627118644067797e-07,
20102
+ "loss": 1.3289,
20103
+ "step": 28580
20104
+ },
20105
+ {
20106
+ "epoch": 0.71,
20107
+ "grad_norm": 4.408345699310303,
20108
+ "learning_rate": 9.559322033898307e-07,
20109
+ "loss": 1.3748,
20110
+ "step": 28590
20111
+ },
20112
+ {
20113
+ "epoch": 0.71,
20114
+ "grad_norm": 6.063961982727051,
20115
+ "learning_rate": 9.491525423728814e-07,
20116
+ "loss": 1.2393,
20117
+ "step": 28600
20118
+ },
20119
+ {
20120
+ "epoch": 0.72,
20121
+ "grad_norm": 12.433189392089844,
20122
+ "learning_rate": 9.423728813559323e-07,
20123
+ "loss": 1.295,
20124
+ "step": 28610
20125
+ },
20126
+ {
20127
+ "epoch": 0.72,
20128
+ "grad_norm": 4.160829067230225,
20129
+ "learning_rate": 9.355932203389831e-07,
20130
+ "loss": 1.1346,
20131
+ "step": 28620
20132
+ },
20133
+ {
20134
+ "epoch": 0.72,
20135
+ "grad_norm": 4.698163032531738,
20136
+ "learning_rate": 9.28813559322034e-07,
20137
+ "loss": 1.2795,
20138
+ "step": 28630
20139
+ },
20140
+ {
20141
+ "epoch": 0.72,
20142
+ "grad_norm": 9.586724281311035,
20143
+ "learning_rate": 9.220338983050848e-07,
20144
+ "loss": 1.3546,
20145
+ "step": 28640
20146
+ },
20147
+ {
20148
+ "epoch": 0.72,
20149
+ "grad_norm": 5.39924955368042,
20150
+ "learning_rate": 9.152542372881357e-07,
20151
+ "loss": 1.3835,
20152
+ "step": 28650
20153
+ },
20154
+ {
20155
+ "epoch": 0.72,
20156
+ "grad_norm": 9.498052597045898,
20157
+ "learning_rate": 9.084745762711864e-07,
20158
+ "loss": 1.3516,
20159
+ "step": 28660
20160
+ },
20161
+ {
20162
+ "epoch": 0.72,
20163
+ "grad_norm": 7.17230749130249,
20164
+ "learning_rate": 9.016949152542373e-07,
20165
+ "loss": 1.2309,
20166
+ "step": 28670
20167
+ },
20168
+ {
20169
+ "epoch": 0.72,
20170
+ "grad_norm": 5.45039176940918,
20171
+ "learning_rate": 8.949152542372883e-07,
20172
+ "loss": 1.2632,
20173
+ "step": 28680
20174
+ },
20175
+ {
20176
+ "epoch": 0.72,
20177
+ "grad_norm": 5.3587260246276855,
20178
+ "learning_rate": 8.881355932203391e-07,
20179
+ "loss": 1.4843,
20180
+ "step": 28690
20181
+ },
20182
+ {
20183
+ "epoch": 0.72,
20184
+ "grad_norm": 1.8675847053527832,
20185
+ "learning_rate": 8.813559322033899e-07,
20186
+ "loss": 1.4973,
20187
+ "step": 28700
20188
+ },
20189
+ {
20190
+ "epoch": 0.72,
20191
+ "grad_norm": 3.713160753250122,
20192
+ "learning_rate": 8.745762711864407e-07,
20193
+ "loss": 1.309,
20194
+ "step": 28710
20195
+ },
20196
+ {
20197
+ "epoch": 0.72,
20198
+ "grad_norm": 10.668930053710938,
20199
+ "learning_rate": 8.677966101694917e-07,
20200
+ "loss": 1.3676,
20201
+ "step": 28720
20202
+ },
20203
+ {
20204
+ "epoch": 0.72,
20205
+ "grad_norm": 2.1564583778381348,
20206
+ "learning_rate": 8.610169491525424e-07,
20207
+ "loss": 1.2518,
20208
+ "step": 28730
20209
+ },
20210
+ {
20211
+ "epoch": 0.72,
20212
+ "grad_norm": 4.37905216217041,
20213
+ "learning_rate": 8.542372881355933e-07,
20214
+ "loss": 1.3369,
20215
+ "step": 28740
20216
+ },
20217
+ {
20218
+ "epoch": 0.72,
20219
+ "grad_norm": 15.10600471496582,
20220
+ "learning_rate": 8.474576271186441e-07,
20221
+ "loss": 1.1867,
20222
+ "step": 28750
20223
+ },
20224
+ {
20225
+ "epoch": 0.72,
20226
+ "grad_norm": 6.618597030639648,
20227
+ "learning_rate": 8.406779661016951e-07,
20228
+ "loss": 1.2391,
20229
+ "step": 28760
20230
+ },
20231
+ {
20232
+ "epoch": 0.72,
20233
+ "grad_norm": 16.292184829711914,
20234
+ "learning_rate": 8.338983050847458e-07,
20235
+ "loss": 1.4165,
20236
+ "step": 28770
20237
+ },
20238
+ {
20239
+ "epoch": 0.72,
20240
+ "grad_norm": 8.334046363830566,
20241
+ "learning_rate": 8.271186440677967e-07,
20242
+ "loss": 1.344,
20243
+ "step": 28780
20244
+ },
20245
+ {
20246
+ "epoch": 0.72,
20247
+ "grad_norm": 10.543550491333008,
20248
+ "learning_rate": 8.203389830508475e-07,
20249
+ "loss": 1.3328,
20250
+ "step": 28790
20251
+ },
20252
+ {
20253
+ "epoch": 0.72,
20254
+ "grad_norm": 2.6791608333587646,
20255
+ "learning_rate": 8.135593220338983e-07,
20256
+ "loss": 1.4221,
20257
+ "step": 28800
20258
+ },
20259
+ {
20260
+ "epoch": 0.72,
20261
+ "grad_norm": 3.6508536338806152,
20262
+ "learning_rate": 8.067796610169493e-07,
20263
+ "loss": 1.3269,
20264
+ "step": 28810
20265
+ },
20266
+ {
20267
+ "epoch": 0.72,
20268
+ "grad_norm": 7.089112758636475,
20269
+ "learning_rate": 8.000000000000001e-07,
20270
+ "loss": 1.3038,
20271
+ "step": 28820
20272
+ },
20273
+ {
20274
+ "epoch": 0.72,
20275
+ "grad_norm": 6.554429531097412,
20276
+ "learning_rate": 7.932203389830509e-07,
20277
+ "loss": 1.2598,
20278
+ "step": 28830
20279
+ },
20280
+ {
20281
+ "epoch": 0.72,
20282
+ "grad_norm": 5.9939799308776855,
20283
+ "learning_rate": 7.864406779661017e-07,
20284
+ "loss": 1.419,
20285
+ "step": 28840
20286
+ },
20287
+ {
20288
+ "epoch": 0.72,
20289
+ "grad_norm": 10.088254928588867,
20290
+ "learning_rate": 7.796610169491527e-07,
20291
+ "loss": 1.2858,
20292
+ "step": 28850
20293
+ },
20294
+ {
20295
+ "epoch": 0.72,
20296
+ "grad_norm": 16.476806640625,
20297
+ "learning_rate": 7.728813559322034e-07,
20298
+ "loss": 1.3498,
20299
+ "step": 28860
20300
+ },
20301
+ {
20302
+ "epoch": 0.72,
20303
+ "grad_norm": 11.453723907470703,
20304
+ "learning_rate": 7.661016949152543e-07,
20305
+ "loss": 1.2346,
20306
+ "step": 28870
20307
+ },
20308
+ {
20309
+ "epoch": 0.72,
20310
+ "grad_norm": 4.832727909088135,
20311
+ "learning_rate": 7.593220338983051e-07,
20312
+ "loss": 1.252,
20313
+ "step": 28880
20314
+ },
20315
+ {
20316
+ "epoch": 0.72,
20317
+ "grad_norm": 8.281596183776855,
20318
+ "learning_rate": 7.52542372881356e-07,
20319
+ "loss": 1.3162,
20320
+ "step": 28890
20321
+ },
20322
+ {
20323
+ "epoch": 0.72,
20324
+ "grad_norm": 8.322393417358398,
20325
+ "learning_rate": 7.457627118644069e-07,
20326
+ "loss": 1.1593,
20327
+ "step": 28900
20328
+ },
20329
+ {
20330
+ "epoch": 0.72,
20331
+ "grad_norm": 10.499345779418945,
20332
+ "learning_rate": 7.389830508474577e-07,
20333
+ "loss": 1.243,
20334
+ "step": 28910
20335
+ },
20336
+ {
20337
+ "epoch": 0.72,
20338
+ "grad_norm": 14.577847480773926,
20339
+ "learning_rate": 7.322033898305085e-07,
20340
+ "loss": 1.3396,
20341
+ "step": 28920
20342
+ },
20343
+ {
20344
+ "epoch": 0.72,
20345
+ "grad_norm": 3.6276347637176514,
20346
+ "learning_rate": 7.254237288135593e-07,
20347
+ "loss": 1.4325,
20348
+ "step": 28930
20349
+ },
20350
+ {
20351
+ "epoch": 0.72,
20352
+ "grad_norm": 10.75643539428711,
20353
+ "learning_rate": 7.186440677966103e-07,
20354
+ "loss": 1.3319,
20355
+ "step": 28940
20356
+ },
20357
+ {
20358
+ "epoch": 0.72,
20359
+ "grad_norm": 8.437240600585938,
20360
+ "learning_rate": 7.118644067796611e-07,
20361
+ "loss": 1.2917,
20362
+ "step": 28950
20363
+ },
20364
+ {
20365
+ "epoch": 0.72,
20366
+ "grad_norm": 7.0036139488220215,
20367
+ "learning_rate": 7.05084745762712e-07,
20368
+ "loss": 1.306,
20369
+ "step": 28960
20370
+ },
20371
+ {
20372
+ "epoch": 0.72,
20373
+ "grad_norm": 6.939202308654785,
20374
+ "learning_rate": 6.983050847457627e-07,
20375
+ "loss": 1.1919,
20376
+ "step": 28970
20377
+ },
20378
+ {
20379
+ "epoch": 0.72,
20380
+ "grad_norm": 5.897158145904541,
20381
+ "learning_rate": 6.915254237288137e-07,
20382
+ "loss": 1.3183,
20383
+ "step": 28980
20384
+ },
20385
+ {
20386
+ "epoch": 0.72,
20387
+ "grad_norm": 8.489838600158691,
20388
+ "learning_rate": 6.847457627118645e-07,
20389
+ "loss": 1.3525,
20390
+ "step": 28990
20391
+ },
20392
+ {
20393
+ "epoch": 0.72,
20394
+ "grad_norm": 11.85141658782959,
20395
+ "learning_rate": 6.779661016949153e-07,
20396
+ "loss": 1.3708,
20397
+ "step": 29000
20398
+ },
20399
+ {
20400
+ "epoch": 0.73,
20401
+ "grad_norm": 12.649805068969727,
20402
+ "learning_rate": 6.711864406779661e-07,
20403
+ "loss": 1.2521,
20404
+ "step": 29010
20405
+ },
20406
+ {
20407
+ "epoch": 0.73,
20408
+ "grad_norm": 4.55898380279541,
20409
+ "learning_rate": 6.64406779661017e-07,
20410
+ "loss": 1.3484,
20411
+ "step": 29020
20412
+ },
20413
+ {
20414
+ "epoch": 0.73,
20415
+ "grad_norm": 7.323060989379883,
20416
+ "learning_rate": 6.576271186440679e-07,
20417
+ "loss": 1.4211,
20418
+ "step": 29030
20419
+ },
20420
+ {
20421
+ "epoch": 0.73,
20422
+ "grad_norm": 4.497279167175293,
20423
+ "learning_rate": 6.508474576271187e-07,
20424
+ "loss": 1.3893,
20425
+ "step": 29040
20426
+ },
20427
+ {
20428
+ "epoch": 0.73,
20429
+ "grad_norm": 10.126681327819824,
20430
+ "learning_rate": 6.440677966101695e-07,
20431
+ "loss": 1.2969,
20432
+ "step": 29050
20433
+ },
20434
+ {
20435
+ "epoch": 0.73,
20436
+ "grad_norm": 4.058706760406494,
20437
+ "learning_rate": 6.372881355932203e-07,
20438
+ "loss": 1.297,
20439
+ "step": 29060
20440
+ },
20441
+ {
20442
+ "epoch": 0.73,
20443
+ "grad_norm": 3.2129762172698975,
20444
+ "learning_rate": 6.305084745762713e-07,
20445
+ "loss": 1.336,
20446
+ "step": 29070
20447
+ },
20448
+ {
20449
+ "epoch": 0.73,
20450
+ "grad_norm": 2.8219425678253174,
20451
+ "learning_rate": 6.237288135593221e-07,
20452
+ "loss": 1.2462,
20453
+ "step": 29080
20454
+ },
20455
+ {
20456
+ "epoch": 0.73,
20457
+ "grad_norm": 3.939793109893799,
20458
+ "learning_rate": 6.16949152542373e-07,
20459
+ "loss": 1.417,
20460
+ "step": 29090
20461
+ },
20462
+ {
20463
+ "epoch": 0.73,
20464
+ "grad_norm": 5.102091312408447,
20465
+ "learning_rate": 6.101694915254238e-07,
20466
+ "loss": 1.467,
20467
+ "step": 29100
20468
+ },
20469
+ {
20470
+ "epoch": 0.73,
20471
+ "grad_norm": 24.394020080566406,
20472
+ "learning_rate": 6.033898305084746e-07,
20473
+ "loss": 1.0779,
20474
+ "step": 29110
20475
+ },
20476
+ {
20477
+ "epoch": 0.73,
20478
+ "grad_norm": 9.672082901000977,
20479
+ "learning_rate": 5.966101694915255e-07,
20480
+ "loss": 1.2263,
20481
+ "step": 29120
20482
+ },
20483
+ {
20484
+ "epoch": 0.73,
20485
+ "grad_norm": 3.5495758056640625,
20486
+ "learning_rate": 5.898305084745763e-07,
20487
+ "loss": 1.3195,
20488
+ "step": 29130
20489
+ },
20490
+ {
20491
+ "epoch": 0.73,
20492
+ "grad_norm": 3.160689115524292,
20493
+ "learning_rate": 5.830508474576271e-07,
20494
+ "loss": 1.4946,
20495
+ "step": 29140
20496
+ },
20497
+ {
20498
+ "epoch": 0.73,
20499
+ "grad_norm": 7.806612014770508,
20500
+ "learning_rate": 5.76271186440678e-07,
20501
+ "loss": 1.2732,
20502
+ "step": 29150
20503
+ },
20504
+ {
20505
+ "epoch": 0.73,
20506
+ "grad_norm": 16.695268630981445,
20507
+ "learning_rate": 5.694915254237288e-07,
20508
+ "loss": 1.4132,
20509
+ "step": 29160
20510
+ },
20511
+ {
20512
+ "epoch": 0.73,
20513
+ "grad_norm": 9.606965065002441,
20514
+ "learning_rate": 5.627118644067797e-07,
20515
+ "loss": 1.2185,
20516
+ "step": 29170
20517
+ },
20518
+ {
20519
+ "epoch": 0.73,
20520
+ "grad_norm": 3.9440929889678955,
20521
+ "learning_rate": 5.559322033898306e-07,
20522
+ "loss": 1.3204,
20523
+ "step": 29180
20524
+ },
20525
+ {
20526
+ "epoch": 0.73,
20527
+ "grad_norm": 11.45638370513916,
20528
+ "learning_rate": 5.491525423728815e-07,
20529
+ "loss": 1.3867,
20530
+ "step": 29190
20531
+ },
20532
+ {
20533
+ "epoch": 0.73,
20534
+ "grad_norm": 5.335674285888672,
20535
+ "learning_rate": 5.423728813559322e-07,
20536
+ "loss": 1.2608,
20537
+ "step": 29200
20538
+ },
20539
+ {
20540
+ "epoch": 0.73,
20541
+ "grad_norm": 5.219457149505615,
20542
+ "learning_rate": 5.355932203389831e-07,
20543
+ "loss": 1.3998,
20544
+ "step": 29210
20545
+ },
20546
+ {
20547
+ "epoch": 0.73,
20548
+ "grad_norm": 8.143280029296875,
20549
+ "learning_rate": 5.28813559322034e-07,
20550
+ "loss": 1.2506,
20551
+ "step": 29220
20552
+ },
20553
+ {
20554
+ "epoch": 0.73,
20555
+ "grad_norm": 21.4418888092041,
20556
+ "learning_rate": 5.220338983050848e-07,
20557
+ "loss": 1.3894,
20558
+ "step": 29230
20559
+ },
20560
+ {
20561
+ "epoch": 0.73,
20562
+ "grad_norm": 9.489405632019043,
20563
+ "learning_rate": 5.152542372881356e-07,
20564
+ "loss": 1.3329,
20565
+ "step": 29240
20566
+ },
20567
+ {
20568
+ "epoch": 0.73,
20569
+ "grad_norm": 4.9300031661987305,
20570
+ "learning_rate": 5.084745762711865e-07,
20571
+ "loss": 1.2334,
20572
+ "step": 29250
20573
+ },
20574
+ {
20575
+ "epoch": 0.73,
20576
+ "grad_norm": 4.382762908935547,
20577
+ "learning_rate": 5.016949152542373e-07,
20578
+ "loss": 1.2921,
20579
+ "step": 29260
20580
+ },
20581
+ {
20582
+ "epoch": 0.73,
20583
+ "grad_norm": 9.448204040527344,
20584
+ "learning_rate": 4.949152542372881e-07,
20585
+ "loss": 1.2535,
20586
+ "step": 29270
20587
+ },
20588
+ {
20589
+ "epoch": 0.73,
20590
+ "grad_norm": 12.2984619140625,
20591
+ "learning_rate": 4.881355932203391e-07,
20592
+ "loss": 1.1073,
20593
+ "step": 29280
20594
+ },
20595
+ {
20596
+ "epoch": 0.73,
20597
+ "grad_norm": 8.206221580505371,
20598
+ "learning_rate": 4.813559322033898e-07,
20599
+ "loss": 1.0739,
20600
+ "step": 29290
20601
+ },
20602
+ {
20603
+ "epoch": 0.73,
20604
+ "grad_norm": 16.299449920654297,
20605
+ "learning_rate": 4.745762711864407e-07,
20606
+ "loss": 1.2505,
20607
+ "step": 29300
20608
+ },
20609
+ {
20610
+ "epoch": 0.73,
20611
+ "grad_norm": 5.297379493713379,
20612
+ "learning_rate": 4.6779661016949154e-07,
20613
+ "loss": 1.2957,
20614
+ "step": 29310
20615
+ },
20616
+ {
20617
+ "epoch": 0.73,
20618
+ "grad_norm": 11.336212158203125,
20619
+ "learning_rate": 4.610169491525424e-07,
20620
+ "loss": 1.2837,
20621
+ "step": 29320
20622
+ },
20623
+ {
20624
+ "epoch": 0.73,
20625
+ "grad_norm": 8.407357215881348,
20626
+ "learning_rate": 4.542372881355932e-07,
20627
+ "loss": 1.2719,
20628
+ "step": 29330
20629
+ },
20630
+ {
20631
+ "epoch": 0.73,
20632
+ "grad_norm": 4.997983932495117,
20633
+ "learning_rate": 4.4745762711864415e-07,
20634
+ "loss": 1.3578,
20635
+ "step": 29340
20636
+ },
20637
+ {
20638
+ "epoch": 0.73,
20639
+ "grad_norm": 1.8061422109603882,
20640
+ "learning_rate": 4.4067796610169497e-07,
20641
+ "loss": 1.2524,
20642
+ "step": 29350
20643
+ },
20644
+ {
20645
+ "epoch": 0.73,
20646
+ "grad_norm": 7.534559726715088,
20647
+ "learning_rate": 4.3389830508474584e-07,
20648
+ "loss": 1.2727,
20649
+ "step": 29360
20650
+ },
20651
+ {
20652
+ "epoch": 0.73,
20653
+ "grad_norm": 2.897261619567871,
20654
+ "learning_rate": 4.2711864406779666e-07,
20655
+ "loss": 1.35,
20656
+ "step": 29370
20657
+ },
20658
+ {
20659
+ "epoch": 0.73,
20660
+ "grad_norm": 11.500360488891602,
20661
+ "learning_rate": 4.2033898305084753e-07,
20662
+ "loss": 1.3056,
20663
+ "step": 29380
20664
+ },
20665
+ {
20666
+ "epoch": 0.73,
20667
+ "grad_norm": 11.723714828491211,
20668
+ "learning_rate": 4.1355932203389835e-07,
20669
+ "loss": 1.3156,
20670
+ "step": 29390
20671
+ },
20672
+ {
20673
+ "epoch": 0.73,
20674
+ "grad_norm": 3.963747978210449,
20675
+ "learning_rate": 4.0677966101694916e-07,
20676
+ "loss": 1.3509,
20677
+ "step": 29400
20678
+ },
20679
+ {
20680
+ "epoch": 0.74,
20681
+ "grad_norm": 1.9269698858261108,
20682
+ "learning_rate": 4.0000000000000003e-07,
20683
+ "loss": 1.3019,
20684
+ "step": 29410
20685
+ },
20686
+ {
20687
+ "epoch": 0.74,
20688
+ "grad_norm": 6.010022163391113,
20689
+ "learning_rate": 3.9322033898305085e-07,
20690
+ "loss": 1.2579,
20691
+ "step": 29420
20692
+ },
20693
+ {
20694
+ "epoch": 0.74,
20695
+ "grad_norm": 11.123760223388672,
20696
+ "learning_rate": 3.864406779661017e-07,
20697
+ "loss": 1.3221,
20698
+ "step": 29430
20699
+ },
20700
+ {
20701
+ "epoch": 0.74,
20702
+ "grad_norm": 11.927657127380371,
20703
+ "learning_rate": 3.7966101694915254e-07,
20704
+ "loss": 1.2219,
20705
+ "step": 29440
20706
+ },
20707
+ {
20708
+ "epoch": 0.74,
20709
+ "grad_norm": 8.796635627746582,
20710
+ "learning_rate": 3.7288135593220347e-07,
20711
+ "loss": 1.2437,
20712
+ "step": 29450
20713
+ },
20714
+ {
20715
+ "epoch": 0.74,
20716
+ "grad_norm": 15.227128028869629,
20717
+ "learning_rate": 3.6610169491525423e-07,
20718
+ "loss": 1.2653,
20719
+ "step": 29460
20720
+ },
20721
+ {
20722
+ "epoch": 0.74,
20723
+ "grad_norm": 3.172653913497925,
20724
+ "learning_rate": 3.5932203389830516e-07,
20725
+ "loss": 1.4635,
20726
+ "step": 29470
20727
+ },
20728
+ {
20729
+ "epoch": 0.74,
20730
+ "grad_norm": 5.841287612915039,
20731
+ "learning_rate": 3.52542372881356e-07,
20732
+ "loss": 1.2766,
20733
+ "step": 29480
20734
+ },
20735
+ {
20736
+ "epoch": 0.74,
20737
+ "grad_norm": 9.996382713317871,
20738
+ "learning_rate": 3.4576271186440684e-07,
20739
+ "loss": 1.3705,
20740
+ "step": 29490
20741
+ },
20742
+ {
20743
+ "epoch": 0.74,
20744
+ "grad_norm": 9.059891700744629,
20745
+ "learning_rate": 3.3898305084745766e-07,
20746
+ "loss": 1.282,
20747
+ "step": 29500
20748
+ },
20749
+ {
20750
+ "epoch": 0.74,
20751
+ "grad_norm": 6.326866626739502,
20752
+ "learning_rate": 3.322033898305085e-07,
20753
+ "loss": 1.1478,
20754
+ "step": 29510
20755
+ },
20756
+ {
20757
+ "epoch": 0.74,
20758
+ "grad_norm": 4.546393394470215,
20759
+ "learning_rate": 3.2542372881355935e-07,
20760
+ "loss": 1.3345,
20761
+ "step": 29520
20762
+ },
20763
+ {
20764
+ "epoch": 0.74,
20765
+ "grad_norm": 3.654453992843628,
20766
+ "learning_rate": 3.1864406779661017e-07,
20767
+ "loss": 1.5465,
20768
+ "step": 29530
20769
+ },
20770
+ {
20771
+ "epoch": 0.74,
20772
+ "grad_norm": 8.529802322387695,
20773
+ "learning_rate": 3.1186440677966104e-07,
20774
+ "loss": 1.5145,
20775
+ "step": 29540
20776
+ },
20777
+ {
20778
+ "epoch": 0.74,
20779
+ "grad_norm": 4.871727466583252,
20780
+ "learning_rate": 3.050847457627119e-07,
20781
+ "loss": 1.3838,
20782
+ "step": 29550
20783
+ },
20784
+ {
20785
+ "epoch": 0.74,
20786
+ "grad_norm": 6.172389507293701,
20787
+ "learning_rate": 2.9830508474576273e-07,
20788
+ "loss": 1.3202,
20789
+ "step": 29560
20790
+ },
20791
+ {
20792
+ "epoch": 0.74,
20793
+ "grad_norm": 9.824962615966797,
20794
+ "learning_rate": 2.9152542372881355e-07,
20795
+ "loss": 1.2167,
20796
+ "step": 29570
20797
+ },
20798
+ {
20799
+ "epoch": 0.74,
20800
+ "grad_norm": 10.725250244140625,
20801
+ "learning_rate": 2.847457627118644e-07,
20802
+ "loss": 1.2796,
20803
+ "step": 29580
20804
+ },
20805
+ {
20806
+ "epoch": 0.74,
20807
+ "grad_norm": 14.172257423400879,
20808
+ "learning_rate": 2.779661016949153e-07,
20809
+ "loss": 1.1175,
20810
+ "step": 29590
20811
+ },
20812
+ {
20813
+ "epoch": 0.74,
20814
+ "grad_norm": 6.030600547790527,
20815
+ "learning_rate": 2.711864406779661e-07,
20816
+ "loss": 1.0479,
20817
+ "step": 29600
20818
+ },
20819
+ {
20820
+ "epoch": 0.74,
20821
+ "grad_norm": 3.860551357269287,
20822
+ "learning_rate": 2.64406779661017e-07,
20823
+ "loss": 1.1551,
20824
+ "step": 29610
20825
+ },
20826
+ {
20827
+ "epoch": 0.74,
20828
+ "grad_norm": 8.990274429321289,
20829
+ "learning_rate": 2.576271186440678e-07,
20830
+ "loss": 1.2577,
20831
+ "step": 29620
20832
+ },
20833
+ {
20834
+ "epoch": 0.74,
20835
+ "grad_norm": 9.873722076416016,
20836
+ "learning_rate": 2.5084745762711867e-07,
20837
+ "loss": 1.333,
20838
+ "step": 29630
20839
+ },
20840
+ {
20841
+ "epoch": 0.74,
20842
+ "grad_norm": 10.069340705871582,
20843
+ "learning_rate": 2.4406779661016954e-07,
20844
+ "loss": 1.4405,
20845
+ "step": 29640
20846
+ },
20847
+ {
20848
+ "epoch": 0.74,
20849
+ "grad_norm": 4.778994083404541,
20850
+ "learning_rate": 2.3728813559322036e-07,
20851
+ "loss": 1.5492,
20852
+ "step": 29650
20853
+ },
20854
+ {
20855
+ "epoch": 0.74,
20856
+ "grad_norm": 14.848403930664062,
20857
+ "learning_rate": 2.305084745762712e-07,
20858
+ "loss": 1.136,
20859
+ "step": 29660
20860
+ },
20861
+ {
20862
+ "epoch": 0.74,
20863
+ "grad_norm": 17.621976852416992,
20864
+ "learning_rate": 2.2372881355932207e-07,
20865
+ "loss": 1.3884,
20866
+ "step": 29670
20867
+ },
20868
+ {
20869
+ "epoch": 0.74,
20870
+ "grad_norm": 5.0682597160339355,
20871
+ "learning_rate": 2.1694915254237292e-07,
20872
+ "loss": 1.2395,
20873
+ "step": 29680
20874
+ },
20875
+ {
20876
+ "epoch": 0.74,
20877
+ "grad_norm": 10.465557098388672,
20878
+ "learning_rate": 2.1016949152542376e-07,
20879
+ "loss": 1.3657,
20880
+ "step": 29690
20881
+ },
20882
+ {
20883
+ "epoch": 0.74,
20884
+ "grad_norm": 16.993274688720703,
20885
+ "learning_rate": 2.0338983050847458e-07,
20886
+ "loss": 1.2805,
20887
+ "step": 29700
20888
+ },
20889
+ {
20890
+ "epoch": 0.74,
20891
+ "grad_norm": 8.59652042388916,
20892
+ "learning_rate": 1.9661016949152543e-07,
20893
+ "loss": 1.0588,
20894
+ "step": 29710
20895
+ },
20896
+ {
20897
+ "epoch": 0.74,
20898
+ "grad_norm": 3.580928087234497,
20899
+ "learning_rate": 1.8983050847457627e-07,
20900
+ "loss": 1.3986,
20901
+ "step": 29720
20902
+ },
20903
+ {
20904
+ "epoch": 0.74,
20905
+ "grad_norm": 5.2861552238464355,
20906
+ "learning_rate": 1.8305084745762712e-07,
20907
+ "loss": 1.1782,
20908
+ "step": 29730
20909
+ },
20910
+ {
20911
+ "epoch": 0.74,
20912
+ "grad_norm": 16.308183670043945,
20913
+ "learning_rate": 1.76271186440678e-07,
20914
+ "loss": 1.153,
20915
+ "step": 29740
20916
+ },
20917
+ {
20918
+ "epoch": 0.74,
20919
+ "grad_norm": 4.2665863037109375,
20920
+ "learning_rate": 1.6949152542372883e-07,
20921
+ "loss": 1.3667,
20922
+ "step": 29750
20923
+ },
20924
+ {
20925
+ "epoch": 0.74,
20926
+ "grad_norm": 7.868191719055176,
20927
+ "learning_rate": 1.6271186440677968e-07,
20928
+ "loss": 1.2511,
20929
+ "step": 29760
20930
+ },
20931
+ {
20932
+ "epoch": 0.74,
20933
+ "grad_norm": 6.3579936027526855,
20934
+ "learning_rate": 1.5593220338983052e-07,
20935
+ "loss": 1.4062,
20936
+ "step": 29770
20937
+ },
20938
+ {
20939
+ "epoch": 0.74,
20940
+ "grad_norm": 9.477784156799316,
20941
+ "learning_rate": 1.4915254237288137e-07,
20942
+ "loss": 1.2801,
20943
+ "step": 29780
20944
+ },
20945
+ {
20946
+ "epoch": 0.74,
20947
+ "grad_norm": 5.881575584411621,
20948
+ "learning_rate": 1.423728813559322e-07,
20949
+ "loss": 1.343,
20950
+ "step": 29790
20951
+ },
20952
+ {
20953
+ "epoch": 0.74,
20954
+ "grad_norm": 6.489737510681152,
20955
+ "learning_rate": 1.3559322033898305e-07,
20956
+ "loss": 1.302,
20957
+ "step": 29800
20958
+ },
20959
+ {
20960
+ "epoch": 0.75,
20961
+ "grad_norm": 14.81570816040039,
20962
+ "learning_rate": 1.288135593220339e-07,
20963
+ "loss": 1.251,
20964
+ "step": 29810
20965
+ },
20966
+ {
20967
+ "epoch": 0.75,
20968
+ "grad_norm": 15.79887866973877,
20969
+ "learning_rate": 1.2203389830508477e-07,
20970
+ "loss": 1.3992,
20971
+ "step": 29820
20972
+ },
20973
+ {
20974
+ "epoch": 0.75,
20975
+ "grad_norm": 5.4701828956604,
20976
+ "learning_rate": 1.152542372881356e-07,
20977
+ "loss": 1.3728,
20978
+ "step": 29830
20979
+ },
20980
+ {
20981
+ "epoch": 0.75,
20982
+ "grad_norm": 5.326042175292969,
20983
+ "learning_rate": 1.0847457627118646e-07,
20984
+ "loss": 1.3645,
20985
+ "step": 29840
20986
+ },
20987
+ {
20988
+ "epoch": 0.75,
20989
+ "grad_norm": 3.9920434951782227,
20990
+ "learning_rate": 1.0169491525423729e-07,
20991
+ "loss": 1.4768,
20992
+ "step": 29850
20993
+ },
20994
+ {
20995
+ "epoch": 0.75,
20996
+ "grad_norm": 10.122292518615723,
20997
+ "learning_rate": 9.491525423728814e-08,
20998
+ "loss": 1.4596,
20999
+ "step": 29860
21000
+ },
21001
+ {
21002
+ "epoch": 0.75,
21003
+ "grad_norm": 6.636433124542236,
21004
+ "learning_rate": 8.8135593220339e-08,
21005
+ "loss": 1.3858,
21006
+ "step": 29870
21007
+ },
21008
+ {
21009
+ "epoch": 0.75,
21010
+ "grad_norm": 6.18862247467041,
21011
+ "learning_rate": 8.135593220338984e-08,
21012
+ "loss": 1.3912,
21013
+ "step": 29880
21014
+ },
21015
+ {
21016
+ "epoch": 0.75,
21017
+ "grad_norm": 10.676321983337402,
21018
+ "learning_rate": 7.457627118644068e-08,
21019
+ "loss": 1.3445,
21020
+ "step": 29890
21021
+ },
21022
+ {
21023
+ "epoch": 0.75,
21024
+ "grad_norm": 14.703264236450195,
21025
+ "learning_rate": 6.779661016949153e-08,
21026
+ "loss": 1.3391,
21027
+ "step": 29900
21028
+ },
21029
+ {
21030
+ "epoch": 0.75,
21031
+ "grad_norm": 11.101775169372559,
21032
+ "learning_rate": 6.101694915254239e-08,
21033
+ "loss": 1.2044,
21034
+ "step": 29910
21035
+ },
21036
+ {
21037
+ "epoch": 0.75,
21038
+ "grad_norm": 5.793562889099121,
21039
+ "learning_rate": 5.423728813559323e-08,
21040
+ "loss": 1.3205,
21041
+ "step": 29920
21042
+ },
21043
+ {
21044
+ "epoch": 0.75,
21045
+ "grad_norm": 4.186032295227051,
21046
+ "learning_rate": 4.745762711864407e-08,
21047
+ "loss": 1.3155,
21048
+ "step": 29930
21049
+ },
21050
+ {
21051
+ "epoch": 0.75,
21052
+ "grad_norm": 5.155704021453857,
21053
+ "learning_rate": 4.067796610169492e-08,
21054
+ "loss": 1.3661,
21055
+ "step": 29940
21056
+ },
21057
+ {
21058
+ "epoch": 0.75,
21059
+ "grad_norm": 10.625030517578125,
21060
+ "learning_rate": 3.3898305084745764e-08,
21061
+ "loss": 1.2094,
21062
+ "step": 29950
21063
+ },
21064
+ {
21065
+ "epoch": 0.75,
21066
+ "grad_norm": 9.991485595703125,
21067
+ "learning_rate": 2.7118644067796615e-08,
21068
+ "loss": 1.2198,
21069
+ "step": 29960
21070
+ },
21071
+ {
21072
+ "epoch": 0.75,
21073
+ "grad_norm": 3.559561014175415,
21074
+ "learning_rate": 2.033898305084746e-08,
21075
+ "loss": 1.33,
21076
+ "step": 29970
21077
+ },
21078
+ {
21079
+ "epoch": 0.75,
21080
+ "grad_norm": 1.8645702600479126,
21081
+ "learning_rate": 1.3559322033898307e-08,
21082
+ "loss": 1.3138,
21083
+ "step": 29980
21084
+ },
21085
+ {
21086
+ "epoch": 0.75,
21087
+ "grad_norm": 4.156149387359619,
21088
+ "learning_rate": 6.779661016949154e-09,
21089
+ "loss": 1.2674,
21090
+ "step": 29990
21091
+ },
21092
+ {
21093
+ "epoch": 0.75,
21094
+ "grad_norm": 4.104331016540527,
21095
+ "learning_rate": 0.0,
21096
+ "loss": 1.2401,
21097
+ "step": 30000
21098
+ },
21099
+ {
21100
+ "epoch": 0.75,
21101
+ "eval_loss": 1.2914531230926514,
21102
+ "eval_runtime": 122.439,
21103
+ "eval_samples_per_second": 8.167,
21104
+ "eval_steps_per_second": 8.167,
21105
+ "step": 30000
21106
  }
21107
  ],
21108
  "logging_steps": 10,
 
21110
  "num_input_tokens_seen": 0,
21111
  "num_train_epochs": 1,
21112
  "save_steps": 2500,
21113
+ "total_flos": 4.8306377981952e+17,
21114
  "train_batch_size": 1,
21115
  "trial_name": null,
21116
  "trial_params": null