mtzig commited on
Commit
1f15dd5
·
verified ·
1 Parent(s): 9256123

Training in progress, step 1200, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd4094f5600490ddf6d9dc86706a89c258972627eabb482c09db8601aaa408b6
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e285698db4156337898b7507bc447cf892df1b2e2b1f627fbfa7fcf49ead7fe
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a5f19e172755f2ce57b999b6bf91cc0cd71f655dfb983069bc0cf1f20c1a06d
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9347512a71b948ad7d0474b073744a28f38ea1b0f4808b47eaeee3bb038ee2a
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c2c51b674c05b19e09a6f9dc112d8aed01c92bba25c7ff3c02cc7e583e58316
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61fe8222129691fd1c629440ebc055a5e22b32348d82bc6fb97d18d537ba38e6
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87060ce519920d72bc5688fe4b87ba053fe5674703e0cfb88414391c60a767ad
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb287ee7b4f22bfca83b3038b7765964ff726a01edfa1c77cefcecc5baaede6f
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:223e910b7b1616737b5bc86e1ebfb716e2a5b926a5993ed4e39db2e7651a4478
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8135e2cfc3f870ad4d1b9488a555f6cbbcb61951312e0f574806197a3d04752
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:43b45ba64474b3b8076d79d336bf19cbfbf47a3077e59ccccda7247f6abf0ebd
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd0182149b3046646213abcc88b729a39d44a31db12d71321dcf1672762dc92
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02d133caa8a7c8c9f20ecb1f747d463913eb8e7adea3e916057db45aee893c68
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ecceaf4d23428de4f6eaf8a4db08e58b3b9e512e0fc350f3d39b90547824dde
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81cb5bfebcce1a4979fbe0cfe517ce8ed3410829b5d415fc3687e8cd5e5c8a63
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93904d1910182fe133491da7a6c8bc9c6713b5f0c66d57fd0a846b185647198d
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:435d97755e037c527a64ed888b5fccf61252600460ddb0a957d40f8cf8984322
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f18ad258e576a1beb656290ab7d2a2eb5c1c200ce0d83645abdc17af01ce6b3
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e4a7ce1edd4170f6d4eb155e5e19998fd066ff3bfebd60f589551b3e6deedd6
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4328b792cfa04ae062613c520f6291678aade826256d6a52acb864dcba8e97aa
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7b6f523269f600825123123e93e374bafdb4065da7c3500423ba2da40982a17
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8758a5d59dbad9a4b9628b626e50cf69861f409943163aab71d6b7d54040e68
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36347d57510ac50d1215fd7cfb5a25f5354d812e876333fa5409094c79836493
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37d46b3ff156d0196e9a5d0a8efb49f4baca17f2c23d7f5843e853b9795049d4
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30bfb6fa4fd21ac286df0550c82cdbf8a597994647e5b3f5b958394e3a125a12
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:477a17a25cc7623279d8aa8946f887744ea0510845075294476c6dcaa37cf69c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8419441255262151,
5
  "eval_steps": 20,
6
- "global_step": 1100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -8379,6 +8379,766 @@
8379
  "eval_samples_per_second": 7.001,
8380
  "eval_steps_per_second": 0.233,
8381
  "step": 1100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8382
  }
8383
  ],
8384
  "logging_steps": 1,
@@ -8398,7 +9158,7 @@
8398
  "attributes": {}
8399
  }
8400
  },
8401
- "total_flos": 1.6907983391188582e+17,
8402
  "train_batch_size": 8,
8403
  "trial_name": null,
8404
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9184845005740528,
5
  "eval_steps": 20,
6
+ "global_step": 1200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
8379
  "eval_samples_per_second": 7.001,
8380
  "eval_steps_per_second": 0.233,
8381
  "step": 1100
8382
+ },
8383
+ {
8384
+ "epoch": 0.8427095292766934,
8385
+ "grad_norm": 5.925868034362793,
8386
+ "learning_rate": 1.4648790011491544e-06,
8387
+ "loss": 0.3003,
8388
+ "step": 1101
8389
+ },
8390
+ {
8391
+ "epoch": 0.8434749330271718,
8392
+ "grad_norm": 6.9676995277404785,
8393
+ "learning_rate": 1.4509776170997625e-06,
8394
+ "loss": 0.3329,
8395
+ "step": 1102
8396
+ },
8397
+ {
8398
+ "epoch": 0.8442403367776502,
8399
+ "grad_norm": 6.943129539489746,
8400
+ "learning_rate": 1.4371373469796956e-06,
8401
+ "loss": 0.2854,
8402
+ "step": 1103
8403
+ },
8404
+ {
8405
+ "epoch": 0.8450057405281286,
8406
+ "grad_norm": 7.517777919769287,
8407
+ "learning_rate": 1.4233582897281328e-06,
8408
+ "loss": 0.2719,
8409
+ "step": 1104
8410
+ },
8411
+ {
8412
+ "epoch": 0.845771144278607,
8413
+ "grad_norm": 7.715554237365723,
8414
+ "learning_rate": 1.4096405438466687e-06,
8415
+ "loss": 0.3078,
8416
+ "step": 1105
8417
+ },
8418
+ {
8419
+ "epoch": 0.8465365480290853,
8420
+ "grad_norm": 7.961045742034912,
8421
+ "learning_rate": 1.3959842073986085e-06,
8422
+ "loss": 0.2775,
8423
+ "step": 1106
8424
+ },
8425
+ {
8426
+ "epoch": 0.8473019517795637,
8427
+ "grad_norm": 5.7302751541137695,
8428
+ "learning_rate": 1.3823893780082508e-06,
8429
+ "loss": 0.2433,
8430
+ "step": 1107
8431
+ },
8432
+ {
8433
+ "epoch": 0.8480673555300421,
8434
+ "grad_norm": 7.472646236419678,
8435
+ "learning_rate": 1.368856152860215e-06,
8436
+ "loss": 0.3264,
8437
+ "step": 1108
8438
+ },
8439
+ {
8440
+ "epoch": 0.8488327592805205,
8441
+ "grad_norm": 8.17573070526123,
8442
+ "learning_rate": 1.3553846286987271e-06,
8443
+ "loss": 0.2075,
8444
+ "step": 1109
8445
+ },
8446
+ {
8447
+ "epoch": 0.8495981630309989,
8448
+ "grad_norm": 8.035270690917969,
8449
+ "learning_rate": 1.3419749018269368e-06,
8450
+ "loss": 0.3511,
8451
+ "step": 1110
8452
+ },
8453
+ {
8454
+ "epoch": 0.8503635667814772,
8455
+ "grad_norm": 7.398448467254639,
8456
+ "learning_rate": 1.3286270681062275e-06,
8457
+ "loss": 0.2243,
8458
+ "step": 1111
8459
+ },
8460
+ {
8461
+ "epoch": 0.8511289705319556,
8462
+ "grad_norm": 5.270333290100098,
8463
+ "learning_rate": 1.3153412229555251e-06,
8464
+ "loss": 0.2921,
8465
+ "step": 1112
8466
+ },
8467
+ {
8468
+ "epoch": 0.851894374282434,
8469
+ "grad_norm": 5.221624374389648,
8470
+ "learning_rate": 1.302117461350627e-06,
8471
+ "loss": 0.3181,
8472
+ "step": 1113
8473
+ },
8474
+ {
8475
+ "epoch": 0.8526597780329124,
8476
+ "grad_norm": 8.733942985534668,
8477
+ "learning_rate": 1.2889558778235157e-06,
8478
+ "loss": 0.2652,
8479
+ "step": 1114
8480
+ },
8481
+ {
8482
+ "epoch": 0.8534251817833908,
8483
+ "grad_norm": 5.429276466369629,
8484
+ "learning_rate": 1.2758565664616829e-06,
8485
+ "loss": 0.2734,
8486
+ "step": 1115
8487
+ },
8488
+ {
8489
+ "epoch": 0.8541905855338691,
8490
+ "grad_norm": 10.059110641479492,
8491
+ "learning_rate": 1.262819620907465e-06,
8492
+ "loss": 0.3404,
8493
+ "step": 1116
8494
+ },
8495
+ {
8496
+ "epoch": 0.8549559892843475,
8497
+ "grad_norm": 6.145954608917236,
8498
+ "learning_rate": 1.249845134357357e-06,
8499
+ "loss": 0.3076,
8500
+ "step": 1117
8501
+ },
8502
+ {
8503
+ "epoch": 0.8557213930348259,
8504
+ "grad_norm": 5.079444885253906,
8505
+ "learning_rate": 1.2369331995613664e-06,
8506
+ "loss": 0.185,
8507
+ "step": 1118
8508
+ },
8509
+ {
8510
+ "epoch": 0.8564867967853043,
8511
+ "grad_norm": 15.14505672454834,
8512
+ "learning_rate": 1.224083908822331e-06,
8513
+ "loss": 0.3866,
8514
+ "step": 1119
8515
+ },
8516
+ {
8517
+ "epoch": 0.8572522005357827,
8518
+ "grad_norm": 6.147080421447754,
8519
+ "learning_rate": 1.2112973539952777e-06,
8520
+ "loss": 0.324,
8521
+ "step": 1120
8522
+ },
8523
+ {
8524
+ "epoch": 0.8572522005357827,
8525
+ "eval_accuracy": 0.8844765342960289,
8526
+ "eval_f1": 0.8333333333333334,
8527
+ "eval_loss": 0.29969924688339233,
8528
+ "eval_precision": 0.8743169398907104,
8529
+ "eval_recall": 0.7960199004975125,
8530
+ "eval_runtime": 43.0138,
8531
+ "eval_samples_per_second": 6.998,
8532
+ "eval_steps_per_second": 0.232,
8533
+ "step": 1120
8534
+ },
8535
+ {
8536
+ "epoch": 0.858017604286261,
8537
+ "grad_norm": 7.136957168579102,
8538
+ "learning_rate": 1.198573626486751e-06,
8539
+ "loss": 0.396,
8540
+ "step": 1121
8541
+ },
8542
+ {
8543
+ "epoch": 0.8587830080367393,
8544
+ "grad_norm": 5.081778049468994,
8545
+ "learning_rate": 1.1859128172541668e-06,
8546
+ "loss": 0.2741,
8547
+ "step": 1122
8548
+ },
8549
+ {
8550
+ "epoch": 0.8595484117872177,
8551
+ "grad_norm": 5.848927974700928,
8552
+ "learning_rate": 1.1733150168051632e-06,
8553
+ "loss": 0.301,
8554
+ "step": 1123
8555
+ },
8556
+ {
8557
+ "epoch": 0.8603138155376961,
8558
+ "grad_norm": 8.139251708984375,
8559
+ "learning_rate": 1.1607803151969443e-06,
8560
+ "loss": 0.3968,
8561
+ "step": 1124
8562
+ },
8563
+ {
8564
+ "epoch": 0.8610792192881745,
8565
+ "grad_norm": 11.221075057983398,
8566
+ "learning_rate": 1.148308802035648e-06,
8567
+ "loss": 0.3192,
8568
+ "step": 1125
8569
+ },
8570
+ {
8571
+ "epoch": 0.8618446230386528,
8572
+ "grad_norm": 12.196139335632324,
8573
+ "learning_rate": 1.1359005664756994e-06,
8574
+ "loss": 0.3429,
8575
+ "step": 1126
8576
+ },
8577
+ {
8578
+ "epoch": 0.8626100267891312,
8579
+ "grad_norm": 7.772244453430176,
8580
+ "learning_rate": 1.123555697219174e-06,
8581
+ "loss": 0.3333,
8582
+ "step": 1127
8583
+ },
8584
+ {
8585
+ "epoch": 0.8633754305396096,
8586
+ "grad_norm": 6.083074569702148,
8587
+ "learning_rate": 1.1112742825151669e-06,
8588
+ "loss": 0.2641,
8589
+ "step": 1128
8590
+ },
8591
+ {
8592
+ "epoch": 0.864140834290088,
8593
+ "grad_norm": 7.137205123901367,
8594
+ "learning_rate": 1.0990564101591527e-06,
8595
+ "loss": 0.3597,
8596
+ "step": 1129
8597
+ },
8598
+ {
8599
+ "epoch": 0.8649062380405664,
8600
+ "grad_norm": 8.642711639404297,
8601
+ "learning_rate": 1.0869021674923708e-06,
8602
+ "loss": 0.2681,
8603
+ "step": 1130
8604
+ },
8605
+ {
8606
+ "epoch": 0.8656716417910447,
8607
+ "grad_norm": 6.8455305099487305,
8608
+ "learning_rate": 1.074811641401189e-06,
8609
+ "loss": 0.3558,
8610
+ "step": 1131
8611
+ },
8612
+ {
8613
+ "epoch": 0.8664370455415231,
8614
+ "grad_norm": 6.04085636138916,
8615
+ "learning_rate": 1.0627849183164906e-06,
8616
+ "loss": 0.3488,
8617
+ "step": 1132
8618
+ },
8619
+ {
8620
+ "epoch": 0.8672024492920015,
8621
+ "grad_norm": 6.025130271911621,
8622
+ "learning_rate": 1.0508220842130602e-06,
8623
+ "loss": 0.3252,
8624
+ "step": 1133
8625
+ },
8626
+ {
8627
+ "epoch": 0.8679678530424799,
8628
+ "grad_norm": 5.714728355407715,
8629
+ "learning_rate": 1.0389232246089499e-06,
8630
+ "loss": 0.2859,
8631
+ "step": 1134
8632
+ },
8633
+ {
8634
+ "epoch": 0.8687332567929583,
8635
+ "grad_norm": 8.63733959197998,
8636
+ "learning_rate": 1.0270884245648905e-06,
8637
+ "loss": 0.3019,
8638
+ "step": 1135
8639
+ },
8640
+ {
8641
+ "epoch": 0.8694986605434366,
8642
+ "grad_norm": 5.155510902404785,
8643
+ "learning_rate": 1.015317768683669e-06,
8644
+ "loss": 0.2339,
8645
+ "step": 1136
8646
+ },
8647
+ {
8648
+ "epoch": 0.870264064293915,
8649
+ "grad_norm": 7.10530948638916,
8650
+ "learning_rate": 1.0036113411095304e-06,
8651
+ "loss": 0.2472,
8652
+ "step": 1137
8653
+ },
8654
+ {
8655
+ "epoch": 0.8710294680443934,
8656
+ "grad_norm": 7.122653007507324,
8657
+ "learning_rate": 9.919692255275747e-07,
8658
+ "loss": 0.3575,
8659
+ "step": 1138
8660
+ },
8661
+ {
8662
+ "epoch": 0.8717948717948718,
8663
+ "grad_norm": 7.5401530265808105,
8664
+ "learning_rate": 9.803915051631574e-07,
8665
+ "loss": 0.2504,
8666
+ "step": 1139
8667
+ },
8668
+ {
8669
+ "epoch": 0.8725602755453502,
8670
+ "grad_norm": 6.913841247558594,
8671
+ "learning_rate": 9.688782627812965e-07,
8672
+ "loss": 0.3607,
8673
+ "step": 1140
8674
+ },
8675
+ {
8676
+ "epoch": 0.8725602755453502,
8677
+ "eval_accuracy": 0.8826714801444043,
8678
+ "eval_f1": 0.8302872062663186,
8679
+ "eval_loss": 0.2986834943294525,
8680
+ "eval_precision": 0.8736263736263736,
8681
+ "eval_recall": 0.7910447761194029,
8682
+ "eval_runtime": 43.2421,
8683
+ "eval_samples_per_second": 6.961,
8684
+ "eval_steps_per_second": 0.231,
8685
+ "step": 1140
8686
+ },
8687
+ {
8688
+ "epoch": 0.8733256792958285,
8689
+ "grad_norm": 4.701968193054199,
8690
+ "learning_rate": 9.574295806860767e-07,
8691
+ "loss": 0.222,
8692
+ "step": 1141
8693
+ },
8694
+ {
8695
+ "epoch": 0.8740910830463069,
8696
+ "grad_norm": 6.480667591094971,
8697
+ "learning_rate": 9.460455407200708e-07,
8698
+ "loss": 0.3484,
8699
+ "step": 1142
8700
+ },
8701
+ {
8702
+ "epoch": 0.8748564867967853,
8703
+ "grad_norm": 6.2637481689453125,
8704
+ "learning_rate": 9.347262242637345e-07,
8705
+ "loss": 0.3202,
8706
+ "step": 1143
8707
+ },
8708
+ {
8709
+ "epoch": 0.8756218905472637,
8710
+ "grad_norm": 7.320901393890381,
8711
+ "learning_rate": 9.234717122348558e-07,
8712
+ "loss": 0.3725,
8713
+ "step": 1144
8714
+ },
8715
+ {
8716
+ "epoch": 0.8763872942977421,
8717
+ "grad_norm": 6.01532506942749,
8718
+ "learning_rate": 9.122820850879488e-07,
8719
+ "loss": 0.2905,
8720
+ "step": 1145
8721
+ },
8722
+ {
8723
+ "epoch": 0.8771526980482205,
8724
+ "grad_norm": 8.483268737792969,
8725
+ "learning_rate": 9.011574228136866e-07,
8726
+ "loss": 0.4175,
8727
+ "step": 1146
8728
+ },
8729
+ {
8730
+ "epoch": 0.8779181017986988,
8731
+ "grad_norm": 7.978078365325928,
8732
+ "learning_rate": 8.90097804938338e-07,
8733
+ "loss": 0.3556,
8734
+ "step": 1147
8735
+ },
8736
+ {
8737
+ "epoch": 0.8786835055491772,
8738
+ "grad_norm": 4.914220333099365,
8739
+ "learning_rate": 8.791033105231861e-07,
8740
+ "loss": 0.3226,
8741
+ "step": 1148
8742
+ },
8743
+ {
8744
+ "epoch": 0.8794489092996556,
8745
+ "grad_norm": 10.738564491271973,
8746
+ "learning_rate": 8.681740181639731e-07,
8747
+ "loss": 0.325,
8748
+ "step": 1149
8749
+ },
8750
+ {
8751
+ "epoch": 0.880214313050134,
8752
+ "grad_norm": 5.5547990798950195,
8753
+ "learning_rate": 8.573100059903349e-07,
8754
+ "loss": 0.2508,
8755
+ "step": 1150
8756
+ },
8757
+ {
8758
+ "epoch": 0.8809797168006124,
8759
+ "grad_norm": 9.317310333251953,
8760
+ "learning_rate": 8.465113516652424e-07,
8761
+ "loss": 0.2961,
8762
+ "step": 1151
8763
+ },
8764
+ {
8765
+ "epoch": 0.8817451205510907,
8766
+ "grad_norm": 4.832771301269531,
8767
+ "learning_rate": 8.357781323844482e-07,
8768
+ "loss": 0.1824,
8769
+ "step": 1152
8770
+ },
8771
+ {
8772
+ "epoch": 0.8825105243015691,
8773
+ "grad_norm": 7.913245677947998,
8774
+ "learning_rate": 8.251104248759256e-07,
8775
+ "loss": 0.3127,
8776
+ "step": 1153
8777
+ },
8778
+ {
8779
+ "epoch": 0.8832759280520475,
8780
+ "grad_norm": 6.320401668548584,
8781
+ "learning_rate": 8.145083053993364e-07,
8782
+ "loss": 0.2046,
8783
+ "step": 1154
8784
+ },
8785
+ {
8786
+ "epoch": 0.8840413318025259,
8787
+ "grad_norm": 6.381113052368164,
8788
+ "learning_rate": 8.039718497454685e-07,
8789
+ "loss": 0.3374,
8790
+ "step": 1155
8791
+ },
8792
+ {
8793
+ "epoch": 0.8848067355530043,
8794
+ "grad_norm": 5.279355049133301,
8795
+ "learning_rate": 7.935011332357113e-07,
8796
+ "loss": 0.2347,
8797
+ "step": 1156
8798
+ },
8799
+ {
8800
+ "epoch": 0.8855721393034826,
8801
+ "grad_norm": 5.956709861755371,
8802
+ "learning_rate": 7.83096230721505e-07,
8803
+ "loss": 0.2561,
8804
+ "step": 1157
8805
+ },
8806
+ {
8807
+ "epoch": 0.886337543053961,
8808
+ "grad_norm": 8.500905990600586,
8809
+ "learning_rate": 7.727572165838038e-07,
8810
+ "loss": 0.3429,
8811
+ "step": 1158
8812
+ },
8813
+ {
8814
+ "epoch": 0.8871029468044394,
8815
+ "grad_norm": 11.31344223022461,
8816
+ "learning_rate": 7.624841647325565e-07,
8817
+ "loss": 0.3175,
8818
+ "step": 1159
8819
+ },
8820
+ {
8821
+ "epoch": 0.8878683505549178,
8822
+ "grad_norm": 4.852387428283691,
8823
+ "learning_rate": 7.522771486061642e-07,
8824
+ "loss": 0.2201,
8825
+ "step": 1160
8826
+ },
8827
+ {
8828
+ "epoch": 0.8878683505549178,
8829
+ "eval_accuracy": 0.8880866425992779,
8830
+ "eval_f1": 0.8368421052631579,
8831
+ "eval_loss": 0.29599303007125854,
8832
+ "eval_precision": 0.888268156424581,
8833
+ "eval_recall": 0.7910447761194029,
8834
+ "eval_runtime": 42.5979,
8835
+ "eval_samples_per_second": 7.066,
8836
+ "eval_steps_per_second": 0.235,
8837
+ "step": 1160
8838
+ },
8839
+ {
8840
+ "epoch": 0.8886337543053962,
8841
+ "grad_norm": 5.681868076324463,
8842
+ "learning_rate": 7.421362411709676e-07,
8843
+ "loss": 0.2797,
8844
+ "step": 1161
8845
+ },
8846
+ {
8847
+ "epoch": 0.8893991580558744,
8848
+ "grad_norm": 7.943777561187744,
8849
+ "learning_rate": 7.320615149207177e-07,
8850
+ "loss": 0.2878,
8851
+ "step": 1162
8852
+ },
8853
+ {
8854
+ "epoch": 0.8901645618063528,
8855
+ "grad_norm": 7.188109397888184,
8856
+ "learning_rate": 7.220530418760597e-07,
8857
+ "loss": 0.2972,
8858
+ "step": 1163
8859
+ },
8860
+ {
8861
+ "epoch": 0.8909299655568312,
8862
+ "grad_norm": 5.419342994689941,
8863
+ "learning_rate": 7.121108935840193e-07,
8864
+ "loss": 0.2502,
8865
+ "step": 1164
8866
+ },
8867
+ {
8868
+ "epoch": 0.8916953693073096,
8869
+ "grad_norm": 10.313029289245605,
8870
+ "learning_rate": 7.022351411174866e-07,
8871
+ "loss": 0.4279,
8872
+ "step": 1165
8873
+ },
8874
+ {
8875
+ "epoch": 0.892460773057788,
8876
+ "grad_norm": 9.825774192810059,
8877
+ "learning_rate": 6.924258550747154e-07,
8878
+ "loss": 0.3422,
8879
+ "step": 1166
8880
+ },
8881
+ {
8882
+ "epoch": 0.8932261768082663,
8883
+ "grad_norm": 7.466933250427246,
8884
+ "learning_rate": 6.826831055788119e-07,
8885
+ "loss": 0.2288,
8886
+ "step": 1167
8887
+ },
8888
+ {
8889
+ "epoch": 0.8939915805587447,
8890
+ "grad_norm": 4.190829277038574,
8891
+ "learning_rate": 6.730069622772373e-07,
8892
+ "loss": 0.2315,
8893
+ "step": 1168
8894
+ },
8895
+ {
8896
+ "epoch": 0.8947569843092231,
8897
+ "grad_norm": 4.927202224731445,
8898
+ "learning_rate": 6.633974943413113e-07,
8899
+ "loss": 0.2855,
8900
+ "step": 1169
8901
+ },
8902
+ {
8903
+ "epoch": 0.8955223880597015,
8904
+ "grad_norm": 5.255453109741211,
8905
+ "learning_rate": 6.538547704657094e-07,
8906
+ "loss": 0.2338,
8907
+ "step": 1170
8908
+ },
8909
+ {
8910
+ "epoch": 0.8962877918101799,
8911
+ "grad_norm": 6.458939075469971,
8912
+ "learning_rate": 6.443788588679823e-07,
8913
+ "loss": 0.2398,
8914
+ "step": 1171
8915
+ },
8916
+ {
8917
+ "epoch": 0.8970531955606582,
8918
+ "grad_norm": 6.905317783355713,
8919
+ "learning_rate": 6.349698272880588e-07,
8920
+ "loss": 0.2978,
8921
+ "step": 1172
8922
+ },
8923
+ {
8924
+ "epoch": 0.8978185993111366,
8925
+ "grad_norm": 7.470308780670166,
8926
+ "learning_rate": 6.256277429877711e-07,
8927
+ "loss": 0.2552,
8928
+ "step": 1173
8929
+ },
8930
+ {
8931
+ "epoch": 0.898584003061615,
8932
+ "grad_norm": 9.028374671936035,
8933
+ "learning_rate": 6.163526727503688e-07,
8934
+ "loss": 0.2822,
8935
+ "step": 1174
8936
+ },
8937
+ {
8938
+ "epoch": 0.8993494068120934,
8939
+ "grad_norm": 4.99279260635376,
8940
+ "learning_rate": 6.071446828800353e-07,
8941
+ "loss": 0.1629,
8942
+ "step": 1175
8943
+ },
8944
+ {
8945
+ "epoch": 0.9001148105625718,
8946
+ "grad_norm": 5.656613349914551,
8947
+ "learning_rate": 5.980038392014309e-07,
8948
+ "loss": 0.2495,
8949
+ "step": 1176
8950
+ },
8951
+ {
8952
+ "epoch": 0.9008802143130501,
8953
+ "grad_norm": 4.793300628662109,
8954
+ "learning_rate": 5.889302070591985e-07,
8955
+ "loss": 0.1765,
8956
+ "step": 1177
8957
+ },
8958
+ {
8959
+ "epoch": 0.9016456180635285,
8960
+ "grad_norm": 5.23650598526001,
8961
+ "learning_rate": 5.79923851317521e-07,
8962
+ "loss": 0.1807,
8963
+ "step": 1178
8964
+ },
8965
+ {
8966
+ "epoch": 0.9024110218140069,
8967
+ "grad_norm": 4.662338733673096,
8968
+ "learning_rate": 5.709848363596404e-07,
8969
+ "loss": 0.2996,
8970
+ "step": 1179
8971
+ },
8972
+ {
8973
+ "epoch": 0.9031764255644853,
8974
+ "grad_norm": 6.364925384521484,
8975
+ "learning_rate": 5.621132260874051e-07,
8976
+ "loss": 0.2767,
8977
+ "step": 1180
8978
+ },
8979
+ {
8980
+ "epoch": 0.9031764255644853,
8981
+ "eval_accuracy": 0.8898916967509025,
8982
+ "eval_f1": 0.8390501319261213,
8983
+ "eval_loss": 0.2949095368385315,
8984
+ "eval_precision": 0.8932584269662921,
8985
+ "eval_recall": 0.7910447761194029,
8986
+ "eval_runtime": 42.8147,
8987
+ "eval_samples_per_second": 7.03,
8988
+ "eval_steps_per_second": 0.234,
8989
+ "step": 1180
8990
+ },
8991
+ {
8992
+ "epoch": 0.9039418293149637,
8993
+ "grad_norm": 8.286806106567383,
8994
+ "learning_rate": 5.533090839208133e-07,
8995
+ "loss": 0.3283,
8996
+ "step": 1181
8997
+ },
8998
+ {
8999
+ "epoch": 0.904707233065442,
9000
+ "grad_norm": 5.3382720947265625,
9001
+ "learning_rate": 5.445724727975498e-07,
9002
+ "loss": 0.2489,
9003
+ "step": 1182
9004
+ },
9005
+ {
9006
+ "epoch": 0.9054726368159204,
9007
+ "grad_norm": 7.994104862213135,
9008
+ "learning_rate": 5.359034551725517e-07,
9009
+ "loss": 0.3883,
9010
+ "step": 1183
9011
+ },
9012
+ {
9013
+ "epoch": 0.9062380405663988,
9014
+ "grad_norm": 10.035967826843262,
9015
+ "learning_rate": 5.273020930175543e-07,
9016
+ "loss": 0.325,
9017
+ "step": 1184
9018
+ },
9019
+ {
9020
+ "epoch": 0.9070034443168772,
9021
+ "grad_norm": 5.84358549118042,
9022
+ "learning_rate": 5.187684478206412e-07,
9023
+ "loss": 0.2696,
9024
+ "step": 1185
9025
+ },
9026
+ {
9027
+ "epoch": 0.9077688480673556,
9028
+ "grad_norm": 5.898288249969482,
9029
+ "learning_rate": 5.103025805858197e-07,
9030
+ "loss": 0.2285,
9031
+ "step": 1186
9032
+ },
9033
+ {
9034
+ "epoch": 0.9085342518178339,
9035
+ "grad_norm": 4.795246601104736,
9036
+ "learning_rate": 5.019045518325693e-07,
9037
+ "loss": 0.2324,
9038
+ "step": 1187
9039
+ },
9040
+ {
9041
+ "epoch": 0.9092996555683123,
9042
+ "grad_norm": 13.465359687805176,
9043
+ "learning_rate": 4.935744215954197e-07,
9044
+ "loss": 0.3142,
9045
+ "step": 1188
9046
+ },
9047
+ {
9048
+ "epoch": 0.9100650593187907,
9049
+ "grad_norm": 7.159090995788574,
9050
+ "learning_rate": 4.853122494235207e-07,
9051
+ "loss": 0.2966,
9052
+ "step": 1189
9053
+ },
9054
+ {
9055
+ "epoch": 0.9108304630692691,
9056
+ "grad_norm": 5.706002235412598,
9057
+ "learning_rate": 4.77118094380209e-07,
9058
+ "loss": 0.2718,
9059
+ "step": 1190
9060
+ },
9061
+ {
9062
+ "epoch": 0.9115958668197475,
9063
+ "grad_norm": 5.979389190673828,
9064
+ "learning_rate": 4.6899201504259196e-07,
9065
+ "loss": 0.2746,
9066
+ "step": 1191
9067
+ },
9068
+ {
9069
+ "epoch": 0.9123612705702258,
9070
+ "grad_norm": 8.064590454101562,
9071
+ "learning_rate": 4.609340695011311e-07,
9072
+ "loss": 0.2624,
9073
+ "step": 1192
9074
+ },
9075
+ {
9076
+ "epoch": 0.9131266743207042,
9077
+ "grad_norm": 4.81801176071167,
9078
+ "learning_rate": 4.5294431535922166e-07,
9079
+ "loss": 0.1888,
9080
+ "step": 1193
9081
+ },
9082
+ {
9083
+ "epoch": 0.9138920780711826,
9084
+ "grad_norm": 5.313014030456543,
9085
+ "learning_rate": 4.4502280973278135e-07,
9086
+ "loss": 0.3078,
9087
+ "step": 1194
9088
+ },
9089
+ {
9090
+ "epoch": 0.914657481821661,
9091
+ "grad_norm": 6.325895309448242,
9092
+ "learning_rate": 4.3716960924984566e-07,
9093
+ "loss": 0.3188,
9094
+ "step": 1195
9095
+ },
9096
+ {
9097
+ "epoch": 0.9154228855721394,
9098
+ "grad_norm": 5.998826026916504,
9099
+ "learning_rate": 4.2938477005015853e-07,
9100
+ "loss": 0.3208,
9101
+ "step": 1196
9102
+ },
9103
+ {
9104
+ "epoch": 0.9161882893226176,
9105
+ "grad_norm": 6.817664623260498,
9106
+ "learning_rate": 4.2166834778477717e-07,
9107
+ "loss": 0.2706,
9108
+ "step": 1197
9109
+ },
9110
+ {
9111
+ "epoch": 0.916953693073096,
9112
+ "grad_norm": 9.771849632263184,
9113
+ "learning_rate": 4.140203976156665e-07,
9114
+ "loss": 0.3291,
9115
+ "step": 1198
9116
+ },
9117
+ {
9118
+ "epoch": 0.9177190968235744,
9119
+ "grad_norm": 7.858504772186279,
9120
+ "learning_rate": 4.064409742153097e-07,
9121
+ "loss": 0.3371,
9122
+ "step": 1199
9123
+ },
9124
+ {
9125
+ "epoch": 0.9184845005740528,
9126
+ "grad_norm": 4.8687591552734375,
9127
+ "learning_rate": 3.9893013176631636e-07,
9128
+ "loss": 0.2563,
9129
+ "step": 1200
9130
+ },
9131
+ {
9132
+ "epoch": 0.9184845005740528,
9133
+ "eval_accuracy": 0.8898916967509025,
9134
+ "eval_f1": 0.8390501319261213,
9135
+ "eval_loss": 0.293884813785553,
9136
+ "eval_precision": 0.8932584269662921,
9137
+ "eval_recall": 0.7910447761194029,
9138
+ "eval_runtime": 43.9202,
9139
+ "eval_samples_per_second": 6.853,
9140
+ "eval_steps_per_second": 0.228,
9141
+ "step": 1200
9142
  }
9143
  ],
9144
  "logging_steps": 1,
 
9158
  "attributes": {}
9159
  }
9160
  },
9161
+ "total_flos": 1.8435297220388454e+17,
9162
  "train_batch_size": 8,
9163
  "trial_name": null,
9164
  "trial_params": null