jssky commited on
Commit
8bb560d
·
verified ·
1 Parent(s): cece44c

Training in progress, step 800, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa28f59d93f04e04a1e45440094a3b936698a0d6a1ee4d566dbe8f5be90460af
3
  size 335604696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1e1a151a5741a3af49bc404c3435bb2c87de0f0975a6c1caef52d07bc4df9bc
3
  size 335604696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ed5d84e0088ac7a62079f42f353570d1ed6bd83812327741d71bba4506a6649
3
  size 170920532
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5ad5b608c3afe09851aad5be2f15a0fabc7e6f8e8b50eeb1b0b66f63b50b862
3
  size 170920532
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c09c2af13ae0132a5012fca2b977f46dea262205d7733c4e2a8df3a3af5574f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f984006bb3837e6a89eec0a2e65a93c4eb3466bdfa3832a69232239f8b636824
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c050dea93eec6d92c9f584b053974771b61864e11a4a0406e69ef4b26a324c78
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cf13def61641dc944e9edbf416e7eb9f3aaa5a8dc09cbff5b8590b9b4880ce1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.2474220246076584,
3
- "best_model_checkpoint": "miner_id_24_1/checkpoint-600",
4
- "epoch": 0.305537873965627,
5
  "eval_steps": 200,
6
- "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4231,6 +4231,1414 @@
4231
  "eval_samples_per_second": 8.764,
4232
  "eval_steps_per_second": 2.191,
4233
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4234
  }
4235
  ],
4236
  "logging_steps": 1,
@@ -4259,7 +5667,7 @@
4259
  "attributes": {}
4260
  }
4261
  },
4262
- "total_flos": 1.2490082104262e+18,
4263
  "train_batch_size": 8,
4264
  "trial_name": null,
4265
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.23512445390224457,
3
+ "best_model_checkpoint": "miner_id_24_1/checkpoint-800",
4
+ "epoch": 0.40738383195416933,
5
  "eval_steps": 200,
6
+ "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4231
  "eval_samples_per_second": 8.764,
4232
  "eval_steps_per_second": 2.191,
4233
  "step": 600
4234
+ },
4235
+ {
4236
+ "epoch": 0.3060471037555697,
4237
+ "grad_norm": 5.079921722412109,
4238
+ "learning_rate": 3.5002545436149474e-05,
4239
+ "loss": 0.8877,
4240
+ "step": 601
4241
+ },
4242
+ {
4243
+ "epoch": 0.3065563335455124,
4244
+ "grad_norm": 5.308548927307129,
4245
+ "learning_rate": 3.485126066291364e-05,
4246
+ "loss": 0.8411,
4247
+ "step": 602
4248
+ },
4249
+ {
4250
+ "epoch": 0.30706556333545515,
4251
+ "grad_norm": 7.06653356552124,
4252
+ "learning_rate": 3.470012843731476e-05,
4253
+ "loss": 0.8479,
4254
+ "step": 603
4255
+ },
4256
+ {
4257
+ "epoch": 0.3075747931253978,
4258
+ "grad_norm": 2.4465208053588867,
4259
+ "learning_rate": 3.4549150281252636e-05,
4260
+ "loss": 0.0402,
4261
+ "step": 604
4262
+ },
4263
+ {
4264
+ "epoch": 0.30808402291534054,
4265
+ "grad_norm": 7.19598913192749,
4266
+ "learning_rate": 3.439832771507565e-05,
4267
+ "loss": 0.5183,
4268
+ "step": 605
4269
+ },
4270
+ {
4271
+ "epoch": 0.30859325270528326,
4272
+ "grad_norm": 10.159460067749023,
4273
+ "learning_rate": 3.424766225756537e-05,
4274
+ "loss": 0.6912,
4275
+ "step": 606
4276
+ },
4277
+ {
4278
+ "epoch": 0.309102482495226,
4279
+ "grad_norm": 6.36456298828125,
4280
+ "learning_rate": 3.4097155425921254e-05,
4281
+ "loss": 0.4891,
4282
+ "step": 607
4283
+ },
4284
+ {
4285
+ "epoch": 0.3096117122851687,
4286
+ "grad_norm": 28.50994873046875,
4287
+ "learning_rate": 3.394680873574546e-05,
4288
+ "loss": 1.2461,
4289
+ "step": 608
4290
+ },
4291
+ {
4292
+ "epoch": 0.31012094207511137,
4293
+ "grad_norm": 11.220614433288574,
4294
+ "learning_rate": 3.3796623701027476e-05,
4295
+ "loss": 0.8775,
4296
+ "step": 609
4297
+ },
4298
+ {
4299
+ "epoch": 0.3106301718650541,
4300
+ "grad_norm": 9.175284385681152,
4301
+ "learning_rate": 3.364660183412892e-05,
4302
+ "loss": 0.8295,
4303
+ "step": 610
4304
+ },
4305
+ {
4306
+ "epoch": 0.3111394016549968,
4307
+ "grad_norm": 13.259340286254883,
4308
+ "learning_rate": 3.349674464576834e-05,
4309
+ "loss": 0.8886,
4310
+ "step": 611
4311
+ },
4312
+ {
4313
+ "epoch": 0.31164863144493954,
4314
+ "grad_norm": 10.061495780944824,
4315
+ "learning_rate": 3.334705364500596e-05,
4316
+ "loss": 0.7599,
4317
+ "step": 612
4318
+ },
4319
+ {
4320
+ "epoch": 0.31215786123488226,
4321
+ "grad_norm": 4.028329849243164,
4322
+ "learning_rate": 3.3197530339228487e-05,
4323
+ "loss": 0.1656,
4324
+ "step": 613
4325
+ },
4326
+ {
4327
+ "epoch": 0.31266709102482493,
4328
+ "grad_norm": 2.669538736343384,
4329
+ "learning_rate": 3.304817623413397e-05,
4330
+ "loss": 0.124,
4331
+ "step": 614
4332
+ },
4333
+ {
4334
+ "epoch": 0.31317632081476765,
4335
+ "grad_norm": 3.3735191822052,
4336
+ "learning_rate": 3.289899283371657e-05,
4337
+ "loss": 0.1331,
4338
+ "step": 615
4339
+ },
4340
+ {
4341
+ "epoch": 0.3136855506047104,
4342
+ "grad_norm": 1.7686545848846436,
4343
+ "learning_rate": 3.274998164025148e-05,
4344
+ "loss": 0.1367,
4345
+ "step": 616
4346
+ },
4347
+ {
4348
+ "epoch": 0.3141947803946531,
4349
+ "grad_norm": 2.966285467147827,
4350
+ "learning_rate": 3.260114415427975e-05,
4351
+ "loss": 0.1389,
4352
+ "step": 617
4353
+ },
4354
+ {
4355
+ "epoch": 0.3147040101845958,
4356
+ "grad_norm": 6.92048454284668,
4357
+ "learning_rate": 3.2452481874593234e-05,
4358
+ "loss": 0.1157,
4359
+ "step": 618
4360
+ },
4361
+ {
4362
+ "epoch": 0.3152132399745385,
4363
+ "grad_norm": 8.68355655670166,
4364
+ "learning_rate": 3.230399629821942e-05,
4365
+ "loss": 0.1763,
4366
+ "step": 619
4367
+ },
4368
+ {
4369
+ "epoch": 0.3157224697644812,
4370
+ "grad_norm": 3.1240475177764893,
4371
+ "learning_rate": 3.215568892040641e-05,
4372
+ "loss": 0.1004,
4373
+ "step": 620
4374
+ },
4375
+ {
4376
+ "epoch": 0.31623169955442393,
4377
+ "grad_norm": 1.7762776613235474,
4378
+ "learning_rate": 3.200756123460788e-05,
4379
+ "loss": 0.0993,
4380
+ "step": 621
4381
+ },
4382
+ {
4383
+ "epoch": 0.31674092934436665,
4384
+ "grad_norm": 2.550874710083008,
4385
+ "learning_rate": 3.1859614732467954e-05,
4386
+ "loss": 0.0514,
4387
+ "step": 622
4388
+ },
4389
+ {
4390
+ "epoch": 0.3172501591343094,
4391
+ "grad_norm": 3.7702434062957764,
4392
+ "learning_rate": 3.171185090380628e-05,
4393
+ "loss": 0.0691,
4394
+ "step": 623
4395
+ },
4396
+ {
4397
+ "epoch": 0.3177593889242521,
4398
+ "grad_norm": 4.985774517059326,
4399
+ "learning_rate": 3.156427123660297e-05,
4400
+ "loss": 0.0982,
4401
+ "step": 624
4402
+ },
4403
+ {
4404
+ "epoch": 0.31826861871419476,
4405
+ "grad_norm": 9.22007942199707,
4406
+ "learning_rate": 3.141687721698363e-05,
4407
+ "loss": 0.123,
4408
+ "step": 625
4409
+ },
4410
+ {
4411
+ "epoch": 0.3187778485041375,
4412
+ "grad_norm": 3.031754970550537,
4413
+ "learning_rate": 3.12696703292044e-05,
4414
+ "loss": 0.0507,
4415
+ "step": 626
4416
+ },
4417
+ {
4418
+ "epoch": 0.3192870782940802,
4419
+ "grad_norm": 6.516983509063721,
4420
+ "learning_rate": 3.1122652055637015e-05,
4421
+ "loss": 0.0944,
4422
+ "step": 627
4423
+ },
4424
+ {
4425
+ "epoch": 0.31979630808402293,
4426
+ "grad_norm": 4.626400470733643,
4427
+ "learning_rate": 3.097582387675385e-05,
4428
+ "loss": 0.1333,
4429
+ "step": 628
4430
+ },
4431
+ {
4432
+ "epoch": 0.32030553787396565,
4433
+ "grad_norm": 5.953018665313721,
4434
+ "learning_rate": 3.082918727111304e-05,
4435
+ "loss": 0.1317,
4436
+ "step": 629
4437
+ },
4438
+ {
4439
+ "epoch": 0.3208147676639083,
4440
+ "grad_norm": 4.307436466217041,
4441
+ "learning_rate": 3.0682743715343564e-05,
4442
+ "loss": 0.0859,
4443
+ "step": 630
4444
+ },
4445
+ {
4446
+ "epoch": 0.32132399745385104,
4447
+ "grad_norm": 5.189202308654785,
4448
+ "learning_rate": 3.053649468413043e-05,
4449
+ "loss": 0.1205,
4450
+ "step": 631
4451
+ },
4452
+ {
4453
+ "epoch": 0.32183322724379376,
4454
+ "grad_norm": 7.541786193847656,
4455
+ "learning_rate": 3.0390441650199724e-05,
4456
+ "loss": 0.1567,
4457
+ "step": 632
4458
+ },
4459
+ {
4460
+ "epoch": 0.3223424570337365,
4461
+ "grad_norm": 5.107945442199707,
4462
+ "learning_rate": 3.0244586084303905e-05,
4463
+ "loss": 0.0836,
4464
+ "step": 633
4465
+ },
4466
+ {
4467
+ "epoch": 0.3228516868236792,
4468
+ "grad_norm": 5.2220563888549805,
4469
+ "learning_rate": 3.0098929455206904e-05,
4470
+ "loss": 0.0487,
4471
+ "step": 634
4472
+ },
4473
+ {
4474
+ "epoch": 0.3233609166136219,
4475
+ "grad_norm": 3.8281970024108887,
4476
+ "learning_rate": 2.9953473229669328e-05,
4477
+ "loss": 0.0783,
4478
+ "step": 635
4479
+ },
4480
+ {
4481
+ "epoch": 0.3238701464035646,
4482
+ "grad_norm": 4.100976943969727,
4483
+ "learning_rate": 2.9808218872433767e-05,
4484
+ "loss": 0.1336,
4485
+ "step": 636
4486
+ },
4487
+ {
4488
+ "epoch": 0.3243793761935073,
4489
+ "grad_norm": 4.79497766494751,
4490
+ "learning_rate": 2.9663167846209998e-05,
4491
+ "loss": 0.0657,
4492
+ "step": 637
4493
+ },
4494
+ {
4495
+ "epoch": 0.32488860598345004,
4496
+ "grad_norm": 5.689431190490723,
4497
+ "learning_rate": 2.9518321611660237e-05,
4498
+ "loss": 0.033,
4499
+ "step": 638
4500
+ },
4501
+ {
4502
+ "epoch": 0.32539783577339276,
4503
+ "grad_norm": 7.910229206085205,
4504
+ "learning_rate": 2.9373681627384447e-05,
4505
+ "loss": 0.0893,
4506
+ "step": 639
4507
+ },
4508
+ {
4509
+ "epoch": 0.32590706556333543,
4510
+ "grad_norm": 3.019529104232788,
4511
+ "learning_rate": 2.9229249349905684e-05,
4512
+ "loss": 0.0205,
4513
+ "step": 640
4514
+ },
4515
+ {
4516
+ "epoch": 0.32641629535327815,
4517
+ "grad_norm": 2.8111181259155273,
4518
+ "learning_rate": 2.9085026233655365e-05,
4519
+ "loss": 0.0133,
4520
+ "step": 641
4521
+ },
4522
+ {
4523
+ "epoch": 0.3269255251432209,
4524
+ "grad_norm": 4.1778974533081055,
4525
+ "learning_rate": 2.894101373095867e-05,
4526
+ "loss": 0.0162,
4527
+ "step": 642
4528
+ },
4529
+ {
4530
+ "epoch": 0.3274347549331636,
4531
+ "grad_norm": 40.197940826416016,
4532
+ "learning_rate": 2.8797213292019926e-05,
4533
+ "loss": 0.0561,
4534
+ "step": 643
4535
+ },
4536
+ {
4537
+ "epoch": 0.3279439847231063,
4538
+ "grad_norm": 5.798105239868164,
4539
+ "learning_rate": 2.8653626364907917e-05,
4540
+ "loss": 0.0468,
4541
+ "step": 644
4542
+ },
4543
+ {
4544
+ "epoch": 0.32845321451304904,
4545
+ "grad_norm": 0.09037820249795914,
4546
+ "learning_rate": 2.851025439554142e-05,
4547
+ "loss": 0.0004,
4548
+ "step": 645
4549
+ },
4550
+ {
4551
+ "epoch": 0.3289624443029917,
4552
+ "grad_norm": 6.750631809234619,
4553
+ "learning_rate": 2.8367098827674578e-05,
4554
+ "loss": 0.12,
4555
+ "step": 646
4556
+ },
4557
+ {
4558
+ "epoch": 0.32947167409293443,
4559
+ "grad_norm": 8.365553855895996,
4560
+ "learning_rate": 2.8224161102882397e-05,
4561
+ "loss": 0.0613,
4562
+ "step": 647
4563
+ },
4564
+ {
4565
+ "epoch": 0.32998090388287715,
4566
+ "grad_norm": 8.15538501739502,
4567
+ "learning_rate": 2.8081442660546125e-05,
4568
+ "loss": 0.1204,
4569
+ "step": 648
4570
+ },
4571
+ {
4572
+ "epoch": 0.3304901336728199,
4573
+ "grad_norm": 8.706940650939941,
4574
+ "learning_rate": 2.7938944937838923e-05,
4575
+ "loss": 0.0566,
4576
+ "step": 649
4577
+ },
4578
+ {
4579
+ "epoch": 0.3309993634627626,
4580
+ "grad_norm": 4.927478790283203,
4581
+ "learning_rate": 2.7796669369711294e-05,
4582
+ "loss": 0.1079,
4583
+ "step": 650
4584
+ },
4585
+ {
4586
+ "epoch": 0.33150859325270526,
4587
+ "grad_norm": 4.741428852081299,
4588
+ "learning_rate": 2.7654617388876615e-05,
4589
+ "loss": 0.7197,
4590
+ "step": 651
4591
+ },
4592
+ {
4593
+ "epoch": 0.332017823042648,
4594
+ "grad_norm": 4.9175944328308105,
4595
+ "learning_rate": 2.7512790425796718e-05,
4596
+ "loss": 0.5017,
4597
+ "step": 652
4598
+ },
4599
+ {
4600
+ "epoch": 0.3325270528325907,
4601
+ "grad_norm": 3.583587884902954,
4602
+ "learning_rate": 2.7371189908667604e-05,
4603
+ "loss": 0.2062,
4604
+ "step": 653
4605
+ },
4606
+ {
4607
+ "epoch": 0.33303628262253343,
4608
+ "grad_norm": 9.022587776184082,
4609
+ "learning_rate": 2.7229817263404866e-05,
4610
+ "loss": 0.7837,
4611
+ "step": 654
4612
+ },
4613
+ {
4614
+ "epoch": 0.33354551241247615,
4615
+ "grad_norm": 9.906291007995605,
4616
+ "learning_rate": 2.708867391362948e-05,
4617
+ "loss": 0.6524,
4618
+ "step": 655
4619
+ },
4620
+ {
4621
+ "epoch": 0.3340547422024188,
4622
+ "grad_norm": 5.258704662322998,
4623
+ "learning_rate": 2.694776128065345e-05,
4624
+ "loss": 0.3687,
4625
+ "step": 656
4626
+ },
4627
+ {
4628
+ "epoch": 0.33456397199236154,
4629
+ "grad_norm": 7.302427768707275,
4630
+ "learning_rate": 2.6807080783465376e-05,
4631
+ "loss": 0.4876,
4632
+ "step": 657
4633
+ },
4634
+ {
4635
+ "epoch": 0.33507320178230426,
4636
+ "grad_norm": 13.841546058654785,
4637
+ "learning_rate": 2.6666633838716314e-05,
4638
+ "loss": 1.4033,
4639
+ "step": 658
4640
+ },
4641
+ {
4642
+ "epoch": 0.335582431572247,
4643
+ "grad_norm": 11.859368324279785,
4644
+ "learning_rate": 2.6526421860705473e-05,
4645
+ "loss": 0.9077,
4646
+ "step": 659
4647
+ },
4648
+ {
4649
+ "epoch": 0.3360916613621897,
4650
+ "grad_norm": 13.534501075744629,
4651
+ "learning_rate": 2.638644626136587e-05,
4652
+ "loss": 0.7607,
4653
+ "step": 660
4654
+ },
4655
+ {
4656
+ "epoch": 0.3366008911521324,
4657
+ "grad_norm": 7.9164958000183105,
4658
+ "learning_rate": 2.6246708450250256e-05,
4659
+ "loss": 0.599,
4660
+ "step": 661
4661
+ },
4662
+ {
4663
+ "epoch": 0.3371101209420751,
4664
+ "grad_norm": 8.299764633178711,
4665
+ "learning_rate": 2.6107209834516854e-05,
4666
+ "loss": 1.0367,
4667
+ "step": 662
4668
+ },
4669
+ {
4670
+ "epoch": 0.3376193507320178,
4671
+ "grad_norm": 4.36458683013916,
4672
+ "learning_rate": 2.596795181891514e-05,
4673
+ "loss": 0.4012,
4674
+ "step": 663
4675
+ },
4676
+ {
4677
+ "epoch": 0.33812858052196054,
4678
+ "grad_norm": 2.5791258811950684,
4679
+ "learning_rate": 2.5828935805771802e-05,
4680
+ "loss": 0.1412,
4681
+ "step": 664
4682
+ },
4683
+ {
4684
+ "epoch": 0.33863781031190326,
4685
+ "grad_norm": 4.219427585601807,
4686
+ "learning_rate": 2.5690163194976575e-05,
4687
+ "loss": 0.1824,
4688
+ "step": 665
4689
+ },
4690
+ {
4691
+ "epoch": 0.33914704010184593,
4692
+ "grad_norm": 2.9232358932495117,
4693
+ "learning_rate": 2.5551635383968065e-05,
4694
+ "loss": 0.17,
4695
+ "step": 666
4696
+ },
4697
+ {
4698
+ "epoch": 0.33965626989178865,
4699
+ "grad_norm": 2.1133458614349365,
4700
+ "learning_rate": 2.5413353767719805e-05,
4701
+ "loss": 0.1109,
4702
+ "step": 667
4703
+ },
4704
+ {
4705
+ "epoch": 0.3401654996817314,
4706
+ "grad_norm": 0.8237698078155518,
4707
+ "learning_rate": 2.5275319738726165e-05,
4708
+ "loss": 0.0692,
4709
+ "step": 668
4710
+ },
4711
+ {
4712
+ "epoch": 0.3406747294716741,
4713
+ "grad_norm": 2.2106404304504395,
4714
+ "learning_rate": 2.513753468698826e-05,
4715
+ "loss": 0.1241,
4716
+ "step": 669
4717
+ },
4718
+ {
4719
+ "epoch": 0.3411839592616168,
4720
+ "grad_norm": 2.5900228023529053,
4721
+ "learning_rate": 2.500000000000001e-05,
4722
+ "loss": 0.1617,
4723
+ "step": 670
4724
+ },
4725
+ {
4726
+ "epoch": 0.34169318905155954,
4727
+ "grad_norm": 0.9092320799827576,
4728
+ "learning_rate": 2.486271706273421e-05,
4729
+ "loss": 0.1002,
4730
+ "step": 671
4731
+ },
4732
+ {
4733
+ "epoch": 0.3422024188415022,
4734
+ "grad_norm": 3.5675482749938965,
4735
+ "learning_rate": 2.4725687257628534e-05,
4736
+ "loss": 0.0889,
4737
+ "step": 672
4738
+ },
4739
+ {
4740
+ "epoch": 0.34271164863144493,
4741
+ "grad_norm": 4.1621479988098145,
4742
+ "learning_rate": 2.4588911964571553e-05,
4743
+ "loss": 0.119,
4744
+ "step": 673
4745
+ },
4746
+ {
4747
+ "epoch": 0.34322087842138765,
4748
+ "grad_norm": 1.428825855255127,
4749
+ "learning_rate": 2.4452392560888976e-05,
4750
+ "loss": 0.0871,
4751
+ "step": 674
4752
+ },
4753
+ {
4754
+ "epoch": 0.3437301082113304,
4755
+ "grad_norm": 4.961071968078613,
4756
+ "learning_rate": 2.4316130421329697e-05,
4757
+ "loss": 0.0718,
4758
+ "step": 675
4759
+ },
4760
+ {
4761
+ "epoch": 0.3442393380012731,
4762
+ "grad_norm": 5.432969570159912,
4763
+ "learning_rate": 2.418012691805191e-05,
4764
+ "loss": 0.0782,
4765
+ "step": 676
4766
+ },
4767
+ {
4768
+ "epoch": 0.34474856779121577,
4769
+ "grad_norm": 4.57743501663208,
4770
+ "learning_rate": 2.4044383420609406e-05,
4771
+ "loss": 0.1596,
4772
+ "step": 677
4773
+ },
4774
+ {
4775
+ "epoch": 0.3452577975811585,
4776
+ "grad_norm": 7.462673187255859,
4777
+ "learning_rate": 2.3908901295937713e-05,
4778
+ "loss": 0.1173,
4779
+ "step": 678
4780
+ },
4781
+ {
4782
+ "epoch": 0.3457670273711012,
4783
+ "grad_norm": 4.732002258300781,
4784
+ "learning_rate": 2.3773681908340284e-05,
4785
+ "loss": 0.111,
4786
+ "step": 679
4787
+ },
4788
+ {
4789
+ "epoch": 0.34627625716104393,
4790
+ "grad_norm": 5.835433483123779,
4791
+ "learning_rate": 2.363872661947488e-05,
4792
+ "loss": 0.1395,
4793
+ "step": 680
4794
+ },
4795
+ {
4796
+ "epoch": 0.34678548695098665,
4797
+ "grad_norm": 4.240106105804443,
4798
+ "learning_rate": 2.350403678833976e-05,
4799
+ "loss": 0.0979,
4800
+ "step": 681
4801
+ },
4802
+ {
4803
+ "epoch": 0.3472947167409293,
4804
+ "grad_norm": 6.862554550170898,
4805
+ "learning_rate": 2.336961377126001e-05,
4806
+ "loss": 0.1356,
4807
+ "step": 682
4808
+ },
4809
+ {
4810
+ "epoch": 0.34780394653087204,
4811
+ "grad_norm": 5.05124568939209,
4812
+ "learning_rate": 2.3235458921873925e-05,
4813
+ "loss": 0.1112,
4814
+ "step": 683
4815
+ },
4816
+ {
4817
+ "epoch": 0.34831317632081477,
4818
+ "grad_norm": 5.341157913208008,
4819
+ "learning_rate": 2.310157359111938e-05,
4820
+ "loss": 0.0908,
4821
+ "step": 684
4822
+ },
4823
+ {
4824
+ "epoch": 0.3488224061107575,
4825
+ "grad_norm": 3.359665870666504,
4826
+ "learning_rate": 2.296795912722014e-05,
4827
+ "loss": 0.081,
4828
+ "step": 685
4829
+ },
4830
+ {
4831
+ "epoch": 0.3493316359007002,
4832
+ "grad_norm": 4.865165710449219,
4833
+ "learning_rate": 2.283461687567236e-05,
4834
+ "loss": 0.1172,
4835
+ "step": 686
4836
+ },
4837
+ {
4838
+ "epoch": 0.3498408656906429,
4839
+ "grad_norm": 8.436277389526367,
4840
+ "learning_rate": 2.2701548179231048e-05,
4841
+ "loss": 0.1962,
4842
+ "step": 687
4843
+ },
4844
+ {
4845
+ "epoch": 0.3503500954805856,
4846
+ "grad_norm": 6.217014312744141,
4847
+ "learning_rate": 2.2568754377896516e-05,
4848
+ "loss": 0.0362,
4849
+ "step": 688
4850
+ },
4851
+ {
4852
+ "epoch": 0.3508593252705283,
4853
+ "grad_norm": 19.41240119934082,
4854
+ "learning_rate": 2.2436236808900844e-05,
4855
+ "loss": 0.275,
4856
+ "step": 689
4857
+ },
4858
+ {
4859
+ "epoch": 0.35136855506047104,
4860
+ "grad_norm": 6.9711456298828125,
4861
+ "learning_rate": 2.2303996806694488e-05,
4862
+ "loss": 0.0658,
4863
+ "step": 690
4864
+ },
4865
+ {
4866
+ "epoch": 0.35187778485041377,
4867
+ "grad_norm": 6.803055763244629,
4868
+ "learning_rate": 2.2172035702932825e-05,
4869
+ "loss": 0.0384,
4870
+ "step": 691
4871
+ },
4872
+ {
4873
+ "epoch": 0.3523870146403565,
4874
+ "grad_norm": 3.9196035861968994,
4875
+ "learning_rate": 2.2040354826462668e-05,
4876
+ "loss": 0.0957,
4877
+ "step": 692
4878
+ },
4879
+ {
4880
+ "epoch": 0.35289624443029916,
4881
+ "grad_norm": 3.145606756210327,
4882
+ "learning_rate": 2.1908955503308993e-05,
4883
+ "loss": 0.0623,
4884
+ "step": 693
4885
+ },
4886
+ {
4887
+ "epoch": 0.3534054742202419,
4888
+ "grad_norm": 16.734477996826172,
4889
+ "learning_rate": 2.1777839056661554e-05,
4890
+ "loss": 0.0769,
4891
+ "step": 694
4892
+ },
4893
+ {
4894
+ "epoch": 0.3539147040101846,
4895
+ "grad_norm": 9.658271789550781,
4896
+ "learning_rate": 2.164700680686147e-05,
4897
+ "loss": 0.0265,
4898
+ "step": 695
4899
+ },
4900
+ {
4901
+ "epoch": 0.3544239338001273,
4902
+ "grad_norm": 1.6071585416793823,
4903
+ "learning_rate": 2.1516460071388062e-05,
4904
+ "loss": 0.0115,
4905
+ "step": 696
4906
+ },
4907
+ {
4908
+ "epoch": 0.35493316359007004,
4909
+ "grad_norm": 4.15781307220459,
4910
+ "learning_rate": 2.1386200164845526e-05,
4911
+ "loss": 0.0719,
4912
+ "step": 697
4913
+ },
4914
+ {
4915
+ "epoch": 0.3554423933800127,
4916
+ "grad_norm": 1.0766063928604126,
4917
+ "learning_rate": 2.125622839894964e-05,
4918
+ "loss": 0.0062,
4919
+ "step": 698
4920
+ },
4921
+ {
4922
+ "epoch": 0.35595162316995543,
4923
+ "grad_norm": 3.8494861125946045,
4924
+ "learning_rate": 2.1126546082514664e-05,
4925
+ "loss": 0.0104,
4926
+ "step": 699
4927
+ },
4928
+ {
4929
+ "epoch": 0.35646085295989816,
4930
+ "grad_norm": 4.333930015563965,
4931
+ "learning_rate": 2.09971545214401e-05,
4932
+ "loss": 0.069,
4933
+ "step": 700
4934
+ },
4935
+ {
4936
+ "epoch": 0.3569700827498409,
4937
+ "grad_norm": 5.104018211364746,
4938
+ "learning_rate": 2.086805501869749e-05,
4939
+ "loss": 0.735,
4940
+ "step": 701
4941
+ },
4942
+ {
4943
+ "epoch": 0.3574793125397836,
4944
+ "grad_norm": 4.622631072998047,
4945
+ "learning_rate": 2.073924887431744e-05,
4946
+ "loss": 0.6267,
4947
+ "step": 702
4948
+ },
4949
+ {
4950
+ "epoch": 0.35798854232972627,
4951
+ "grad_norm": 2.708951234817505,
4952
+ "learning_rate": 2.061073738537635e-05,
4953
+ "loss": 0.1031,
4954
+ "step": 703
4955
+ },
4956
+ {
4957
+ "epoch": 0.358497772119669,
4958
+ "grad_norm": 6.911785125732422,
4959
+ "learning_rate": 2.048252184598352e-05,
4960
+ "loss": 0.5889,
4961
+ "step": 704
4962
+ },
4963
+ {
4964
+ "epoch": 0.3590070019096117,
4965
+ "grad_norm": 8.187259674072266,
4966
+ "learning_rate": 2.0354603547267985e-05,
4967
+ "loss": 0.5466,
4968
+ "step": 705
4969
+ },
4970
+ {
4971
+ "epoch": 0.35951623169955443,
4972
+ "grad_norm": 6.681227207183838,
4973
+ "learning_rate": 2.0226983777365604e-05,
4974
+ "loss": 0.5335,
4975
+ "step": 706
4976
+ },
4977
+ {
4978
+ "epoch": 0.36002546148949716,
4979
+ "grad_norm": 13.172379493713379,
4980
+ "learning_rate": 2.0099663821406056e-05,
4981
+ "loss": 0.6291,
4982
+ "step": 707
4983
+ },
4984
+ {
4985
+ "epoch": 0.3605346912794398,
4986
+ "grad_norm": 10.422917366027832,
4987
+ "learning_rate": 1.9972644961499854e-05,
4988
+ "loss": 0.8471,
4989
+ "step": 708
4990
+ },
4991
+ {
4992
+ "epoch": 0.36104392106938255,
4993
+ "grad_norm": 13.669676780700684,
4994
+ "learning_rate": 1.9845928476725524e-05,
4995
+ "loss": 1.0402,
4996
+ "step": 709
4997
+ },
4998
+ {
4999
+ "epoch": 0.36155315085932527,
5000
+ "grad_norm": 6.9141716957092285,
5001
+ "learning_rate": 1.9719515643116674e-05,
5002
+ "loss": 0.6035,
5003
+ "step": 710
5004
+ },
5005
+ {
5006
+ "epoch": 0.362062380649268,
5007
+ "grad_norm": 8.188703536987305,
5008
+ "learning_rate": 1.959340773364911e-05,
5009
+ "loss": 0.6699,
5010
+ "step": 711
5011
+ },
5012
+ {
5013
+ "epoch": 0.3625716104392107,
5014
+ "grad_norm": 7.684673309326172,
5015
+ "learning_rate": 1.946760601822809e-05,
5016
+ "loss": 0.7657,
5017
+ "step": 712
5018
+ },
5019
+ {
5020
+ "epoch": 0.3630808402291534,
5021
+ "grad_norm": 2.5300745964050293,
5022
+ "learning_rate": 1.9342111763675512e-05,
5023
+ "loss": 0.1569,
5024
+ "step": 713
5025
+ },
5026
+ {
5027
+ "epoch": 0.3635900700190961,
5028
+ "grad_norm": 1.3801597356796265,
5029
+ "learning_rate": 1.9216926233717085e-05,
5030
+ "loss": 0.081,
5031
+ "step": 714
5032
+ },
5033
+ {
5034
+ "epoch": 0.3640992998090388,
5035
+ "grad_norm": 1.2587306499481201,
5036
+ "learning_rate": 1.9092050688969738e-05,
5037
+ "loss": 0.1421,
5038
+ "step": 715
5039
+ },
5040
+ {
5041
+ "epoch": 0.36460852959898155,
5042
+ "grad_norm": 1.1372148990631104,
5043
+ "learning_rate": 1.8967486386928817e-05,
5044
+ "loss": 0.0961,
5045
+ "step": 716
5046
+ },
5047
+ {
5048
+ "epoch": 0.36511775938892427,
5049
+ "grad_norm": 0.7794922590255737,
5050
+ "learning_rate": 1.8843234581955442e-05,
5051
+ "loss": 0.1008,
5052
+ "step": 717
5053
+ },
5054
+ {
5055
+ "epoch": 0.365626989178867,
5056
+ "grad_norm": 1.1420189142227173,
5057
+ "learning_rate": 1.8719296525263922e-05,
5058
+ "loss": 0.1071,
5059
+ "step": 718
5060
+ },
5061
+ {
5062
+ "epoch": 0.36613621896880966,
5063
+ "grad_norm": 1.6180529594421387,
5064
+ "learning_rate": 1.859567346490913e-05,
5065
+ "loss": 0.0875,
5066
+ "step": 719
5067
+ },
5068
+ {
5069
+ "epoch": 0.3666454487587524,
5070
+ "grad_norm": 1.0748051404953003,
5071
+ "learning_rate": 1.847236664577389e-05,
5072
+ "loss": 0.1288,
5073
+ "step": 720
5074
+ },
5075
+ {
5076
+ "epoch": 0.3671546785486951,
5077
+ "grad_norm": 1.0483026504516602,
5078
+ "learning_rate": 1.8349377309556486e-05,
5079
+ "loss": 0.0774,
5080
+ "step": 721
5081
+ },
5082
+ {
5083
+ "epoch": 0.3676639083386378,
5084
+ "grad_norm": 1.506415843963623,
5085
+ "learning_rate": 1.8226706694758195e-05,
5086
+ "loss": 0.1362,
5087
+ "step": 722
5088
+ },
5089
+ {
5090
+ "epoch": 0.36817313812858055,
5091
+ "grad_norm": 1.0291399955749512,
5092
+ "learning_rate": 1.810435603667075e-05,
5093
+ "loss": 0.1071,
5094
+ "step": 723
5095
+ },
5096
+ {
5097
+ "epoch": 0.3686823679185232,
5098
+ "grad_norm": 5.942626953125,
5099
+ "learning_rate": 1.7982326567363888e-05,
5100
+ "loss": 0.1614,
5101
+ "step": 724
5102
+ },
5103
+ {
5104
+ "epoch": 0.36919159770846594,
5105
+ "grad_norm": 12.163902282714844,
5106
+ "learning_rate": 1.7860619515673033e-05,
5107
+ "loss": 0.1458,
5108
+ "step": 725
5109
+ },
5110
+ {
5111
+ "epoch": 0.36970082749840866,
5112
+ "grad_norm": 3.687087059020996,
5113
+ "learning_rate": 1.773923610718686e-05,
5114
+ "loss": 0.0919,
5115
+ "step": 726
5116
+ },
5117
+ {
5118
+ "epoch": 0.3702100572883514,
5119
+ "grad_norm": 4.903958320617676,
5120
+ "learning_rate": 1.7618177564234905e-05,
5121
+ "loss": 0.1254,
5122
+ "step": 727
5123
+ },
5124
+ {
5125
+ "epoch": 0.3707192870782941,
5126
+ "grad_norm": 4.808300495147705,
5127
+ "learning_rate": 1.7497445105875377e-05,
5128
+ "loss": 0.1473,
5129
+ "step": 728
5130
+ },
5131
+ {
5132
+ "epoch": 0.37122851686823677,
5133
+ "grad_norm": 5.668433666229248,
5134
+ "learning_rate": 1.73770399478828e-05,
5135
+ "loss": 0.1524,
5136
+ "step": 729
5137
+ },
5138
+ {
5139
+ "epoch": 0.3717377466581795,
5140
+ "grad_norm": 8.04811954498291,
5141
+ "learning_rate": 1.725696330273575e-05,
5142
+ "loss": 0.0919,
5143
+ "step": 730
5144
+ },
5145
+ {
5146
+ "epoch": 0.3722469764481222,
5147
+ "grad_norm": 4.406056880950928,
5148
+ "learning_rate": 1.7137216379604727e-05,
5149
+ "loss": 0.1046,
5150
+ "step": 731
5151
+ },
5152
+ {
5153
+ "epoch": 0.37275620623806494,
5154
+ "grad_norm": 7.855915546417236,
5155
+ "learning_rate": 1.7017800384339928e-05,
5156
+ "loss": 0.0709,
5157
+ "step": 732
5158
+ },
5159
+ {
5160
+ "epoch": 0.37326543602800766,
5161
+ "grad_norm": 3.947338342666626,
5162
+ "learning_rate": 1.6898716519459074e-05,
5163
+ "loss": 0.072,
5164
+ "step": 733
5165
+ },
5166
+ {
5167
+ "epoch": 0.3737746658179503,
5168
+ "grad_norm": 7.404916763305664,
5169
+ "learning_rate": 1.6779965984135377e-05,
5170
+ "loss": 0.1313,
5171
+ "step": 734
5172
+ },
5173
+ {
5174
+ "epoch": 0.37428389560789305,
5175
+ "grad_norm": 4.375075817108154,
5176
+ "learning_rate": 1.6661549974185424e-05,
5177
+ "loss": 0.0829,
5178
+ "step": 735
5179
+ },
5180
+ {
5181
+ "epoch": 0.37479312539783577,
5182
+ "grad_norm": 4.642003536224365,
5183
+ "learning_rate": 1.6543469682057106e-05,
5184
+ "loss": 0.0602,
5185
+ "step": 736
5186
+ },
5187
+ {
5188
+ "epoch": 0.3753023551877785,
5189
+ "grad_norm": 5.987782001495361,
5190
+ "learning_rate": 1.6425726296817633e-05,
5191
+ "loss": 0.0868,
5192
+ "step": 737
5193
+ },
5194
+ {
5195
+ "epoch": 0.3758115849777212,
5196
+ "grad_norm": 10.39226245880127,
5197
+ "learning_rate": 1.6308321004141607e-05,
5198
+ "loss": 0.1273,
5199
+ "step": 738
5200
+ },
5201
+ {
5202
+ "epoch": 0.37632081476766394,
5203
+ "grad_norm": 6.080224514007568,
5204
+ "learning_rate": 1.619125498629904e-05,
5205
+ "loss": 0.0465,
5206
+ "step": 739
5207
+ },
5208
+ {
5209
+ "epoch": 0.3768300445576066,
5210
+ "grad_norm": 4.397584915161133,
5211
+ "learning_rate": 1.60745294221434e-05,
5212
+ "loss": 0.0625,
5213
+ "step": 740
5214
+ },
5215
+ {
5216
+ "epoch": 0.3773392743475493,
5217
+ "grad_norm": 8.622246742248535,
5218
+ "learning_rate": 1.595814548709983e-05,
5219
+ "loss": 0.0664,
5220
+ "step": 741
5221
+ },
5222
+ {
5223
+ "epoch": 0.37784850413749205,
5224
+ "grad_norm": 8.313824653625488,
5225
+ "learning_rate": 1.5842104353153287e-05,
5226
+ "loss": 0.0271,
5227
+ "step": 742
5228
+ },
5229
+ {
5230
+ "epoch": 0.37835773392743477,
5231
+ "grad_norm": 2.949397325515747,
5232
+ "learning_rate": 1.5726407188836673e-05,
5233
+ "loss": 0.0345,
5234
+ "step": 743
5235
+ },
5236
+ {
5237
+ "epoch": 0.3788669637173775,
5238
+ "grad_norm": 3.27532696723938,
5239
+ "learning_rate": 1.5611055159219152e-05,
5240
+ "loss": 0.0561,
5241
+ "step": 744
5242
+ },
5243
+ {
5244
+ "epoch": 0.37937619350732016,
5245
+ "grad_norm": 0.19333244860172272,
5246
+ "learning_rate": 1.549604942589441e-05,
5247
+ "loss": 0.0013,
5248
+ "step": 745
5249
+ },
5250
+ {
5251
+ "epoch": 0.3798854232972629,
5252
+ "grad_norm": 4.911413192749023,
5253
+ "learning_rate": 1.5381391146968866e-05,
5254
+ "loss": 0.0187,
5255
+ "step": 746
5256
+ },
5257
+ {
5258
+ "epoch": 0.3803946530872056,
5259
+ "grad_norm": 6.5160627365112305,
5260
+ "learning_rate": 1.526708147705013e-05,
5261
+ "loss": 0.0928,
5262
+ "step": 747
5263
+ },
5264
+ {
5265
+ "epoch": 0.3809038828771483,
5266
+ "grad_norm": 5.048818111419678,
5267
+ "learning_rate": 1.5153121567235335e-05,
5268
+ "loss": 0.013,
5269
+ "step": 748
5270
+ },
5271
+ {
5272
+ "epoch": 0.38141311266709105,
5273
+ "grad_norm": 4.852001667022705,
5274
+ "learning_rate": 1.5039512565099467e-05,
5275
+ "loss": 0.079,
5276
+ "step": 749
5277
+ },
5278
+ {
5279
+ "epoch": 0.3819223424570337,
5280
+ "grad_norm": 3.08077335357666,
5281
+ "learning_rate": 1.4926255614683932e-05,
5282
+ "loss": 0.0403,
5283
+ "step": 750
5284
+ },
5285
+ {
5286
+ "epoch": 0.38243157224697644,
5287
+ "grad_norm": 4.824317932128906,
5288
+ "learning_rate": 1.481335185648498e-05,
5289
+ "loss": 0.7268,
5290
+ "step": 751
5291
+ },
5292
+ {
5293
+ "epoch": 0.38294080203691916,
5294
+ "grad_norm": 4.761933326721191,
5295
+ "learning_rate": 1.4700802427442179e-05,
5296
+ "loss": 0.5991,
5297
+ "step": 752
5298
+ },
5299
+ {
5300
+ "epoch": 0.3834500318268619,
5301
+ "grad_norm": 4.185091495513916,
5302
+ "learning_rate": 1.458860846092705e-05,
5303
+ "loss": 0.2459,
5304
+ "step": 753
5305
+ },
5306
+ {
5307
+ "epoch": 0.3839592616168046,
5308
+ "grad_norm": 10.620789527893066,
5309
+ "learning_rate": 1.4476771086731567e-05,
5310
+ "loss": 0.79,
5311
+ "step": 754
5312
+ },
5313
+ {
5314
+ "epoch": 0.38446849140674727,
5315
+ "grad_norm": 11.214977264404297,
5316
+ "learning_rate": 1.4365291431056871e-05,
5317
+ "loss": 0.7926,
5318
+ "step": 755
5319
+ },
5320
+ {
5321
+ "epoch": 0.38497772119669,
5322
+ "grad_norm": 5.3124895095825195,
5323
+ "learning_rate": 1.4254170616501827e-05,
5324
+ "loss": 0.4243,
5325
+ "step": 756
5326
+ },
5327
+ {
5328
+ "epoch": 0.3854869509866327,
5329
+ "grad_norm": 11.870231628417969,
5330
+ "learning_rate": 1.414340976205183e-05,
5331
+ "loss": 0.6992,
5332
+ "step": 757
5333
+ },
5334
+ {
5335
+ "epoch": 0.38599618077657544,
5336
+ "grad_norm": 9.170869827270508,
5337
+ "learning_rate": 1.4033009983067452e-05,
5338
+ "loss": 0.8608,
5339
+ "step": 758
5340
+ },
5341
+ {
5342
+ "epoch": 0.38650541056651816,
5343
+ "grad_norm": 10.806239128112793,
5344
+ "learning_rate": 1.3922972391273226e-05,
5345
+ "loss": 0.8629,
5346
+ "step": 759
5347
+ },
5348
+ {
5349
+ "epoch": 0.3870146403564608,
5350
+ "grad_norm": 9.26331901550293,
5351
+ "learning_rate": 1.3813298094746491e-05,
5352
+ "loss": 0.5738,
5353
+ "step": 760
5354
+ },
5355
+ {
5356
+ "epoch": 0.38752387014640355,
5357
+ "grad_norm": 10.236473083496094,
5358
+ "learning_rate": 1.3703988197906209e-05,
5359
+ "loss": 0.6671,
5360
+ "step": 761
5361
+ },
5362
+ {
5363
+ "epoch": 0.38803309993634627,
5364
+ "grad_norm": 6.966268539428711,
5365
+ "learning_rate": 1.3595043801501794e-05,
5366
+ "loss": 0.7432,
5367
+ "step": 762
5368
+ },
5369
+ {
5370
+ "epoch": 0.388542329726289,
5371
+ "grad_norm": 5.45960807800293,
5372
+ "learning_rate": 1.3486466002602133e-05,
5373
+ "loss": 0.5434,
5374
+ "step": 763
5375
+ },
5376
+ {
5377
+ "epoch": 0.3890515595162317,
5378
+ "grad_norm": 1.5403192043304443,
5379
+ "learning_rate": 1.3378255894584463e-05,
5380
+ "loss": 0.1034,
5381
+ "step": 764
5382
+ },
5383
+ {
5384
+ "epoch": 0.38956078930617444,
5385
+ "grad_norm": 1.8921170234680176,
5386
+ "learning_rate": 1.327041456712334e-05,
5387
+ "loss": 0.084,
5388
+ "step": 765
5389
+ },
5390
+ {
5391
+ "epoch": 0.3900700190961171,
5392
+ "grad_norm": 4.200987815856934,
5393
+ "learning_rate": 1.3162943106179749e-05,
5394
+ "loss": 0.1189,
5395
+ "step": 766
5396
+ },
5397
+ {
5398
+ "epoch": 0.3905792488860598,
5399
+ "grad_norm": 2.609253406524658,
5400
+ "learning_rate": 1.3055842593990131e-05,
5401
+ "loss": 0.2064,
5402
+ "step": 767
5403
+ },
5404
+ {
5405
+ "epoch": 0.39108847867600255,
5406
+ "grad_norm": 2.868013858795166,
5407
+ "learning_rate": 1.2949114109055415e-05,
5408
+ "loss": 0.1238,
5409
+ "step": 768
5410
+ },
5411
+ {
5412
+ "epoch": 0.39159770846594527,
5413
+ "grad_norm": 2.5417683124542236,
5414
+ "learning_rate": 1.2842758726130283e-05,
5415
+ "loss": 0.0929,
5416
+ "step": 769
5417
+ },
5418
+ {
5419
+ "epoch": 0.392106938255888,
5420
+ "grad_norm": 1.0600427389144897,
5421
+ "learning_rate": 1.2736777516212266e-05,
5422
+ "loss": 0.1323,
5423
+ "step": 770
5424
+ },
5425
+ {
5426
+ "epoch": 0.39261616804583066,
5427
+ "grad_norm": 1.9886938333511353,
5428
+ "learning_rate": 1.2631171546530968e-05,
5429
+ "loss": 0.0904,
5430
+ "step": 771
5431
+ },
5432
+ {
5433
+ "epoch": 0.3931253978357734,
5434
+ "grad_norm": 0.7915336489677429,
5435
+ "learning_rate": 1.2525941880537307e-05,
5436
+ "loss": 0.0964,
5437
+ "step": 772
5438
+ },
5439
+ {
5440
+ "epoch": 0.3936346276257161,
5441
+ "grad_norm": 2.3213069438934326,
5442
+ "learning_rate": 1.2421089577892869e-05,
5443
+ "loss": 0.1248,
5444
+ "step": 773
5445
+ },
5446
+ {
5447
+ "epoch": 0.3941438574156588,
5448
+ "grad_norm": 3.6811985969543457,
5449
+ "learning_rate": 1.2316615694459189e-05,
5450
+ "loss": 0.0861,
5451
+ "step": 774
5452
+ },
5453
+ {
5454
+ "epoch": 0.39465308720560155,
5455
+ "grad_norm": 4.516244411468506,
5456
+ "learning_rate": 1.2212521282287092e-05,
5457
+ "loss": 0.1168,
5458
+ "step": 775
5459
+ },
5460
+ {
5461
+ "epoch": 0.3951623169955442,
5462
+ "grad_norm": 4.109873294830322,
5463
+ "learning_rate": 1.2108807389606158e-05,
5464
+ "loss": 0.1002,
5465
+ "step": 776
5466
+ },
5467
+ {
5468
+ "epoch": 0.39567154678548694,
5469
+ "grad_norm": 4.258279323577881,
5470
+ "learning_rate": 1.2005475060814159e-05,
5471
+ "loss": 0.1388,
5472
+ "step": 777
5473
+ },
5474
+ {
5475
+ "epoch": 0.39618077657542966,
5476
+ "grad_norm": 3.9544007778167725,
5477
+ "learning_rate": 1.1902525336466464e-05,
5478
+ "loss": 0.1318,
5479
+ "step": 778
5480
+ },
5481
+ {
5482
+ "epoch": 0.3966900063653724,
5483
+ "grad_norm": 5.926074981689453,
5484
+ "learning_rate": 1.1799959253265668e-05,
5485
+ "loss": 0.1125,
5486
+ "step": 779
5487
+ },
5488
+ {
5489
+ "epoch": 0.3971992361553151,
5490
+ "grad_norm": 8.683218955993652,
5491
+ "learning_rate": 1.1697777844051105e-05,
5492
+ "loss": 0.0995,
5493
+ "step": 780
5494
+ },
5495
+ {
5496
+ "epoch": 0.39770846594525777,
5497
+ "grad_norm": 3.9073028564453125,
5498
+ "learning_rate": 1.1595982137788403e-05,
5499
+ "loss": 0.0903,
5500
+ "step": 781
5501
+ },
5502
+ {
5503
+ "epoch": 0.3982176957352005,
5504
+ "grad_norm": 5.088838577270508,
5505
+ "learning_rate": 1.1494573159559213e-05,
5506
+ "loss": 0.1193,
5507
+ "step": 782
5508
+ },
5509
+ {
5510
+ "epoch": 0.3987269255251432,
5511
+ "grad_norm": 5.782503128051758,
5512
+ "learning_rate": 1.1393551930550828e-05,
5513
+ "loss": 0.0823,
5514
+ "step": 783
5515
+ },
5516
+ {
5517
+ "epoch": 0.39923615531508594,
5518
+ "grad_norm": 5.548112869262695,
5519
+ "learning_rate": 1.1292919468045877e-05,
5520
+ "loss": 0.0797,
5521
+ "step": 784
5522
+ },
5523
+ {
5524
+ "epoch": 0.39974538510502866,
5525
+ "grad_norm": 6.514894962310791,
5526
+ "learning_rate": 1.1192676785412154e-05,
5527
+ "loss": 0.1448,
5528
+ "step": 785
5529
+ },
5530
+ {
5531
+ "epoch": 0.4002546148949714,
5532
+ "grad_norm": 4.215356826782227,
5533
+ "learning_rate": 1.1092824892092373e-05,
5534
+ "loss": 0.0659,
5535
+ "step": 786
5536
+ },
5537
+ {
5538
+ "epoch": 0.40076384468491405,
5539
+ "grad_norm": 7.318501949310303,
5540
+ "learning_rate": 1.099336479359398e-05,
5541
+ "loss": 0.1557,
5542
+ "step": 787
5543
+ },
5544
+ {
5545
+ "epoch": 0.4012730744748568,
5546
+ "grad_norm": 4.508605003356934,
5547
+ "learning_rate": 1.0894297491479045e-05,
5548
+ "loss": 0.0383,
5549
+ "step": 788
5550
+ },
5551
+ {
5552
+ "epoch": 0.4017823042647995,
5553
+ "grad_norm": 2.6559200286865234,
5554
+ "learning_rate": 1.0795623983354215e-05,
5555
+ "loss": 0.0212,
5556
+ "step": 789
5557
+ },
5558
+ {
5559
+ "epoch": 0.4022915340547422,
5560
+ "grad_norm": 1.4912053346633911,
5561
+ "learning_rate": 1.0697345262860636e-05,
5562
+ "loss": 0.0104,
5563
+ "step": 790
5564
+ },
5565
+ {
5566
+ "epoch": 0.40280076384468494,
5567
+ "grad_norm": 7.236735820770264,
5568
+ "learning_rate": 1.0599462319663905e-05,
5569
+ "loss": 0.0706,
5570
+ "step": 791
5571
+ },
5572
+ {
5573
+ "epoch": 0.4033099936346276,
5574
+ "grad_norm": 4.715954303741455,
5575
+ "learning_rate": 1.0501976139444191e-05,
5576
+ "loss": 0.048,
5577
+ "step": 792
5578
+ },
5579
+ {
5580
+ "epoch": 0.40381922342457033,
5581
+ "grad_norm": 2.1985514163970947,
5582
+ "learning_rate": 1.0404887703886251e-05,
5583
+ "loss": 0.0125,
5584
+ "step": 793
5585
+ },
5586
+ {
5587
+ "epoch": 0.40432845321451305,
5588
+ "grad_norm": 7.244698524475098,
5589
+ "learning_rate": 1.0308197990669538e-05,
5590
+ "loss": 0.0468,
5591
+ "step": 794
5592
+ },
5593
+ {
5594
+ "epoch": 0.4048376830044558,
5595
+ "grad_norm": 1.7899662256240845,
5596
+ "learning_rate": 1.021190797345839e-05,
5597
+ "loss": 0.0033,
5598
+ "step": 795
5599
+ },
5600
+ {
5601
+ "epoch": 0.4053469127943985,
5602
+ "grad_norm": 7.417436122894287,
5603
+ "learning_rate": 1.0116018621892237e-05,
5604
+ "loss": 0.1219,
5605
+ "step": 796
5606
+ },
5607
+ {
5608
+ "epoch": 0.40585614258434116,
5609
+ "grad_norm": 2.332493782043457,
5610
+ "learning_rate": 1.0020530901575754e-05,
5611
+ "loss": 0.0177,
5612
+ "step": 797
5613
+ },
5614
+ {
5615
+ "epoch": 0.4063653723742839,
5616
+ "grad_norm": 6.933743000030518,
5617
+ "learning_rate": 9.92544577406923e-06,
5618
+ "loss": 0.0993,
5619
+ "step": 798
5620
+ },
5621
+ {
5622
+ "epoch": 0.4068746021642266,
5623
+ "grad_norm": 6.604279518127441,
5624
+ "learning_rate": 9.830764196878872e-06,
5625
+ "loss": 0.0523,
5626
+ "step": 799
5627
+ },
5628
+ {
5629
+ "epoch": 0.40738383195416933,
5630
+ "grad_norm": 1.6020522117614746,
5631
+ "learning_rate": 9.73648712344707e-06,
5632
+ "loss": 0.0222,
5633
+ "step": 800
5634
+ },
5635
+ {
5636
+ "epoch": 0.40738383195416933,
5637
+ "eval_loss": 0.23512445390224457,
5638
+ "eval_runtime": 376.2064,
5639
+ "eval_samples_per_second": 8.793,
5640
+ "eval_steps_per_second": 2.198,
5641
+ "step": 800
5642
  }
5643
  ],
5644
  "logging_steps": 1,
 
5667
  "attributes": {}
5668
  }
5669
  },
5670
+ "total_flos": 1.6645191524564337e+18,
5671
  "train_batch_size": 8,
5672
  "trial_name": null,
5673
  "trial_params": null