bobox commited on
Commit
2a8c918
·
verified ·
1 Parent(s): ee3d70e

Training in progress, epoch 4, checkpoint

Browse files
last-checkpoint/README.md CHANGED
@@ -971,6 +971,46 @@ You can finetune this model on your own dataset.
971
  | 2.9266 | 16380 | 1.0227 | - | - | - |
972
  | 2.9516 | 16520 | 0.8159 | - | - | - |
973
  | 2.9766 | 16660 | 0.8426 | - | - | - |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
974
 
975
  </details>
976
 
 
971
  | 2.9266 | 16380 | 1.0227 | - | - | - |
972
  | 2.9516 | 16520 | 0.8159 | - | - | - |
973
  | 2.9766 | 16660 | 0.8426 | - | - | - |
974
+ | 3.0016 | 16800 | 0.7955 | 1.1895 | 0.9681 | 0.4912 |
975
+ | 3.0266 | 16940 | 0.856 | - | - | - |
976
+ | 3.0516 | 17080 | 1.0754 | - | - | - |
977
+ | 3.0766 | 17220 | 0.9151 | - | - | - |
978
+ | 3.1017 | 17360 | 1.0051 | - | - | - |
979
+ | 3.1267 | 17500 | 1.0075 | 1.0658 | 0.9583 | 0.4853 |
980
+ | 3.1517 | 17640 | 0.9909 | - | - | - |
981
+ | 3.1767 | 17780 | 1.029 | - | - | - |
982
+ | 3.2017 | 17920 | 1.0292 | - | - | - |
983
+ | 3.2267 | 18060 | 0.8334 | - | - | - |
984
+ | 3.2517 | 18200 | 1.0119 | 1.1172 | 0.9485 | 0.4674 |
985
+ | 3.2768 | 18340 | 0.8582 | - | - | - |
986
+ | 3.3018 | 18480 | 1.0397 | - | - | - |
987
+ | 3.3268 | 18620 | 1.1988 | - | - | - |
988
+ | 3.3518 | 18760 | 0.9432 | - | - | - |
989
+ | 3.3768 | 18900 | 1.0573 | 1.0815 | 0.9437 | 0.4679 |
990
+ | 3.4018 | 19040 | 0.9829 | - | - | - |
991
+ | 3.4268 | 19180 | 1.0573 | - | - | - |
992
+ | 3.4518 | 19320 | 0.9449 | - | - | - |
993
+ | 3.4769 | 19460 | 1.2005 | - | - | - |
994
+ | 3.5019 | 19600 | 0.9171 | 1.2315 | 0.9503 | 0.4799 |
995
+ | 3.5269 | 19740 | 0.9425 | - | - | - |
996
+ | 3.5519 | 19880 | 1.1213 | - | - | - |
997
+ | 3.5769 | 20020 | 1.1128 | - | - | - |
998
+ | 3.6019 | 20160 | 1.331 | - | - | - |
999
+ | 3.6269 | 20300 | 1.0495 | 1.1413 | 0.9468 | 0.4434 |
1000
+ | 3.6520 | 20440 | 0.9698 | - | - | - |
1001
+ | 3.6770 | 20580 | 0.9148 | - | - | - |
1002
+ | 3.7020 | 20720 | 0.9042 | - | - | - |
1003
+ | 3.7270 | 20860 | 0.8232 | - | - | - |
1004
+ | 3.7520 | 21000 | 1.0163 | 1.2883 | 0.9020 | 0.4574 |
1005
+ | 3.7770 | 21140 | 0.9735 | - | - | - |
1006
+ | 3.8020 | 21280 | 0.8371 | - | - | - |
1007
+ | 3.8271 | 21420 | 0.6344 | - | - | - |
1008
+ | 3.8521 | 21560 | 0.87 | - | - | - |
1009
+ | 3.8771 | 21700 | 0.7404 | 1.0644 | 0.8661 | 0.4353 |
1010
+ | 3.9021 | 21840 | 0.8486 | - | - | - |
1011
+ | 3.9271 | 21980 | 0.8895 | - | - | - |
1012
+ | 3.9521 | 22120 | 0.7476 | - | - | - |
1013
+ | 3.9771 | 22260 | 0.6761 | - | - | - |
1014
 
1015
  </details>
1016
 
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3f8b7a10f49c7accc4d56fcde5e2b7367572fea9eadda7abc68c4ea6d413109
3
  size 1130520122
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4345ef434b7ce58a400595f316107c458077d152783f92e11230dd06b00da241
3
  size 1130520122
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9134258fe357fd6a2a6353e374fe13d69af6b328dba3af917a4e41b35fe24ced
3
  size 565251810
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c61e8dbee68a8b9aa8455be0f0f627f0276bc6305fcf58258493d0000b0b9ad
3
  size 565251810
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce15952460133b21ef920df1fe684611abb9d45ded84989240cf0c78f3eacbe8
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45081b173c6755e63dd0ede1a9b1a21b4183931e9d940410e69655e93a0291cf
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd0a193be43dda10c5ceb71b126b0e7c57e19cdb380ddf6d47b36db9131d04db
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6243024eb0239246d61ff58dd2248929ed644805efadd2d460a5d88e0b9fcfa2
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 700,
6
- "global_step": 16791,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1392,6 +1392,478 @@
1392
  "learning_rate": 1.0994658958057889e-05,
1393
  "loss": 0.8426,
1394
  "step": 16660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1395
  }
1396
  ],
1397
  "logging_steps": 140,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
  "eval_steps": 700,
6
+ "global_step": 22388,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1392
  "learning_rate": 1.0994658958057889e-05,
1393
  "loss": 0.8426,
1394
  "step": 16660
1395
+ },
1396
+ {
1397
+ "epoch": 3.0016080042880113,
1398
+ "grad_norm": 0.6402806043624878,
1399
+ "learning_rate": 1.0014032480000764e-05,
1400
+ "loss": 0.7955,
1401
+ "step": 16800
1402
+ },
1403
+ {
1404
+ "epoch": 3.0016080042880113,
1405
+ "eval_nli-pairs_loss": 0.9680945873260498,
1406
+ "eval_nli-pairs_runtime": 13.331,
1407
+ "eval_nli-pairs_samples_per_second": 112.519,
1408
+ "eval_nli-pairs_steps_per_second": 4.726,
1409
+ "step": 16800
1410
+ },
1411
+ {
1412
+ "epoch": 3.0016080042880113,
1413
+ "eval_scitail-pairs-pos_loss": 0.49118393659591675,
1414
+ "eval_scitail-pairs-pos_runtime": 15.2998,
1415
+ "eval_scitail-pairs-pos_samples_per_second": 85.23,
1416
+ "eval_scitail-pairs-pos_steps_per_second": 3.595,
1417
+ "step": 16800
1418
+ },
1419
+ {
1420
+ "epoch": 3.0016080042880113,
1421
+ "eval_qnli-contrastive_loss": 1.1894794702529907,
1422
+ "eval_qnli-contrastive_runtime": 4.8432,
1423
+ "eval_qnli-contrastive_samples_per_second": 309.715,
1424
+ "eval_qnli-contrastive_steps_per_second": 13.008,
1425
+ "step": 16800
1426
+ },
1427
+ {
1428
+ "epoch": 3.0266214043237447,
1429
+ "grad_norm": 3.155766010284424,
1430
+ "learning_rate": 9.03327071669702e-06,
1431
+ "loss": 0.856,
1432
+ "step": 16940
1433
+ },
1434
+ {
1435
+ "epoch": 3.051634804359478,
1436
+ "grad_norm": 11.008296966552734,
1437
+ "learning_rate": 8.061829059993542e-06,
1438
+ "loss": 1.0754,
1439
+ "step": 17080
1440
+ },
1441
+ {
1442
+ "epoch": 3.076648204395212,
1443
+ "grad_norm": 4.382720947265625,
1444
+ "learning_rate": 7.109073047846788e-06,
1445
+ "loss": 0.9151,
1446
+ "step": 17220
1447
+ },
1448
+ {
1449
+ "epoch": 3.1016616044309453,
1450
+ "grad_norm": 2.755722761154175,
1451
+ "learning_rate": 6.184188072434878e-06,
1452
+ "loss": 1.0051,
1453
+ "step": 17360
1454
+ },
1455
+ {
1456
+ "epoch": 3.1266750044666787,
1457
+ "grad_norm": 2.4547111988067627,
1458
+ "learning_rate": 5.296090825030854e-06,
1459
+ "loss": 1.0075,
1460
+ "step": 17500
1461
+ },
1462
+ {
1463
+ "epoch": 3.1266750044666787,
1464
+ "eval_nli-pairs_loss": 0.9583492875099182,
1465
+ "eval_nli-pairs_runtime": 12.1773,
1466
+ "eval_nli-pairs_samples_per_second": 123.18,
1467
+ "eval_nli-pairs_steps_per_second": 5.174,
1468
+ "step": 17500
1469
+ },
1470
+ {
1471
+ "epoch": 3.1266750044666787,
1472
+ "eval_scitail-pairs-pos_loss": 0.485266774892807,
1473
+ "eval_scitail-pairs-pos_runtime": 14.9222,
1474
+ "eval_scitail-pairs-pos_samples_per_second": 87.387,
1475
+ "eval_scitail-pairs-pos_steps_per_second": 3.686,
1476
+ "step": 17500
1477
+ },
1478
+ {
1479
+ "epoch": 3.1266750044666787,
1480
+ "eval_qnli-contrastive_loss": 1.0658234357833862,
1481
+ "eval_qnli-contrastive_runtime": 4.7681,
1482
+ "eval_qnli-contrastive_samples_per_second": 314.592,
1483
+ "eval_qnli-contrastive_steps_per_second": 13.213,
1484
+ "step": 17500
1485
+ },
1486
+ {
1487
+ "epoch": 3.151688404502412,
1488
+ "grad_norm": 19.061325073242188,
1489
+ "learning_rate": 4.453343331385006e-06,
1490
+ "loss": 0.9909,
1491
+ "step": 17640
1492
+ },
1493
+ {
1494
+ "epoch": 3.1767018045381454,
1495
+ "grad_norm": 17.016021728515625,
1496
+ "learning_rate": 3.6640704063896858e-06,
1497
+ "loss": 1.029,
1498
+ "step": 17780
1499
+ },
1500
+ {
1501
+ "epoch": 3.201715204573879,
1502
+ "grad_norm": 4.147863864898682,
1503
+ "learning_rate": 2.9358813238350816e-06,
1504
+ "loss": 1.0292,
1505
+ "step": 17920
1506
+ },
1507
+ {
1508
+ "epoch": 3.226728604609612,
1509
+ "grad_norm": 27.60422706604004,
1510
+ "learning_rate": 2.275796456427173e-06,
1511
+ "loss": 0.8334,
1512
+ "step": 18060
1513
+ },
1514
+ {
1515
+ "epoch": 3.2517420046453456,
1516
+ "grad_norm": 0.7800289392471313,
1517
+ "learning_rate": 1.6901795933215137e-06,
1518
+ "loss": 1.0119,
1519
+ "step": 18200
1520
+ },
1521
+ {
1522
+ "epoch": 3.2517420046453456,
1523
+ "eval_nli-pairs_loss": 0.9484548568725586,
1524
+ "eval_nli-pairs_runtime": 12.0697,
1525
+ "eval_nli-pairs_samples_per_second": 124.279,
1526
+ "eval_nli-pairs_steps_per_second": 5.22,
1527
+ "step": 18200
1528
+ },
1529
+ {
1530
+ "epoch": 3.2517420046453456,
1531
+ "eval_scitail-pairs-pos_loss": 0.4673975706100464,
1532
+ "eval_scitail-pairs-pos_runtime": 15.0509,
1533
+ "eval_scitail-pairs-pos_samples_per_second": 86.639,
1534
+ "eval_scitail-pairs-pos_steps_per_second": 3.654,
1535
+ "step": 18200
1536
+ },
1537
+ {
1538
+ "epoch": 3.2517420046453456,
1539
+ "eval_qnli-contrastive_loss": 1.1171668767929077,
1540
+ "eval_qnli-contrastive_runtime": 4.7871,
1541
+ "eval_qnli-contrastive_samples_per_second": 313.345,
1542
+ "eval_qnli-contrastive_steps_per_second": 13.16,
1543
+ "step": 18200
1544
+ },
1545
+ {
1546
+ "epoch": 3.2767554046810794,
1547
+ "grad_norm": 16.64696502685547,
1548
+ "learning_rate": 1.1846765876905709e-06,
1549
+ "loss": 0.8582,
1550
+ "step": 18340
1551
+ },
1552
+ {
1553
+ "epoch": 3.3017688047168123,
1554
+ "grad_norm": 16.13783073425293,
1555
+ "learning_rate": 7.668532006209551e-07,
1556
+ "loss": 1.0397,
1557
+ "step": 18480
1558
+ },
1559
+ {
1560
+ "epoch": 3.326782204752546,
1561
+ "grad_norm": 3.76619553565979,
1562
+ "learning_rate": 4.347306328421508e-07,
1563
+ "loss": 1.1988,
1564
+ "step": 18620
1565
+ },
1566
+ {
1567
+ "epoch": 3.3517956047882795,
1568
+ "grad_norm": 10.401665687561035,
1569
+ "learning_rate": 1.948255365952012e-07,
1570
+ "loss": 0.9432,
1571
+ "step": 18760
1572
+ },
1573
+ {
1574
+ "epoch": 3.376809004824013,
1575
+ "grad_norm": 2.400106191635132,
1576
+ "learning_rate": 4.945080454776929e-08,
1577
+ "loss": 1.0573,
1578
+ "step": 18900
1579
+ },
1580
+ {
1581
+ "epoch": 3.376809004824013,
1582
+ "eval_nli-pairs_loss": 0.9437180757522583,
1583
+ "eval_nli-pairs_runtime": 12.0974,
1584
+ "eval_nli-pairs_samples_per_second": 123.993,
1585
+ "eval_nli-pairs_steps_per_second": 5.208,
1586
+ "step": 18900
1587
+ },
1588
+ {
1589
+ "epoch": 3.376809004824013,
1590
+ "eval_scitail-pairs-pos_loss": 0.46788787841796875,
1591
+ "eval_scitail-pairs-pos_runtime": 15.1516,
1592
+ "eval_scitail-pairs-pos_samples_per_second": 86.063,
1593
+ "eval_scitail-pairs-pos_steps_per_second": 3.63,
1594
+ "step": 18900
1595
+ },
1596
+ {
1597
+ "epoch": 3.376809004824013,
1598
+ "eval_qnli-contrastive_loss": 1.081482172012329,
1599
+ "eval_qnli-contrastive_runtime": 4.8096,
1600
+ "eval_qnli-contrastive_samples_per_second": 311.875,
1601
+ "eval_qnli-contrastive_steps_per_second": 13.099,
1602
+ "step": 18900
1603
+ },
1604
+ {
1605
+ "epoch": 3.4018224048597463,
1606
+ "grad_norm": 25.33026695251465,
1607
+ "learning_rate": 7.974879220329356e-12,
1608
+ "loss": 0.9829,
1609
+ "step": 19040
1610
+ },
1611
+ {
1612
+ "epoch": 3.4268358048954797,
1613
+ "grad_norm": 4.218173027038574,
1614
+ "learning_rate": 1.995302628075987e-05,
1615
+ "loss": 1.0573,
1616
+ "step": 19180
1617
+ },
1618
+ {
1619
+ "epoch": 3.451849204931213,
1620
+ "grad_norm": 13.573431015014648,
1621
+ "learning_rate": 1.98101047527748e-05,
1622
+ "loss": 0.9449,
1623
+ "step": 19320
1624
+ },
1625
+ {
1626
+ "epoch": 3.4768626049669464,
1627
+ "grad_norm": 6.658699989318848,
1628
+ "learning_rate": 1.9572605328335534e-05,
1629
+ "loss": 1.2005,
1630
+ "step": 19460
1631
+ },
1632
+ {
1633
+ "epoch": 3.5018760050026803,
1634
+ "grad_norm": 6.075576305389404,
1635
+ "learning_rate": 1.924281770735239e-05,
1636
+ "loss": 0.9171,
1637
+ "step": 19600
1638
+ },
1639
+ {
1640
+ "epoch": 3.5018760050026803,
1641
+ "eval_nli-pairs_loss": 0.9502684473991394,
1642
+ "eval_nli-pairs_runtime": 12.0413,
1643
+ "eval_nli-pairs_samples_per_second": 124.572,
1644
+ "eval_nli-pairs_steps_per_second": 5.232,
1645
+ "step": 19600
1646
+ },
1647
+ {
1648
+ "epoch": 3.5018760050026803,
1649
+ "eval_scitail-pairs-pos_loss": 0.4798508584499359,
1650
+ "eval_scitail-pairs-pos_runtime": 14.9533,
1651
+ "eval_scitail-pairs-pos_samples_per_second": 87.205,
1652
+ "eval_scitail-pairs-pos_steps_per_second": 3.678,
1653
+ "step": 19600
1654
+ },
1655
+ {
1656
+ "epoch": 3.5018760050026803,
1657
+ "eval_qnli-contrastive_loss": 1.2315282821655273,
1658
+ "eval_qnli-contrastive_runtime": 4.7188,
1659
+ "eval_qnli-contrastive_samples_per_second": 317.874,
1660
+ "eval_qnli-contrastive_steps_per_second": 13.351,
1661
+ "step": 19600
1662
+ },
1663
+ {
1664
+ "epoch": 3.526889405038413,
1665
+ "grad_norm": 8.40775203704834,
1666
+ "learning_rate": 1.8823921327788075e-05,
1667
+ "loss": 0.9425,
1668
+ "step": 19740
1669
+ },
1670
+ {
1671
+ "epoch": 3.551902805074147,
1672
+ "grad_norm": 11.214140892028809,
1673
+ "learning_rate": 1.831995471312526e-05,
1674
+ "loss": 1.1213,
1675
+ "step": 19880
1676
+ },
1677
+ {
1678
+ "epoch": 3.5769162051098804,
1679
+ "grad_norm": 10.211651802062988,
1680
+ "learning_rate": 1.7735776537506483e-05,
1681
+ "loss": 1.1128,
1682
+ "step": 20020
1683
+ },
1684
+ {
1685
+ "epoch": 3.6019296051456138,
1686
+ "grad_norm": 44.01512908935547,
1687
+ "learning_rate": 1.707701878391224e-05,
1688
+ "loss": 1.331,
1689
+ "step": 20160
1690
+ },
1691
+ {
1692
+ "epoch": 3.626943005181347,
1693
+ "grad_norm": 13.295893669128418,
1694
+ "learning_rate": 1.6350032446972868e-05,
1695
+ "loss": 1.0495,
1696
+ "step": 20300
1697
+ },
1698
+ {
1699
+ "epoch": 3.626943005181347,
1700
+ "eval_nli-pairs_loss": 0.9468088150024414,
1701
+ "eval_nli-pairs_runtime": 11.9325,
1702
+ "eval_nli-pairs_samples_per_second": 125.707,
1703
+ "eval_nli-pairs_steps_per_second": 5.28,
1704
+ "step": 20300
1705
+ },
1706
+ {
1707
+ "epoch": 3.626943005181347,
1708
+ "eval_scitail-pairs-pos_loss": 0.4434490203857422,
1709
+ "eval_scitail-pairs-pos_runtime": 15.5134,
1710
+ "eval_scitail-pairs-pos_samples_per_second": 84.056,
1711
+ "eval_scitail-pairs-pos_steps_per_second": 3.545,
1712
+ "step": 20300
1713
+ },
1714
+ {
1715
+ "epoch": 3.626943005181347,
1716
+ "eval_qnli-contrastive_loss": 1.141271710395813,
1717
+ "eval_qnli-contrastive_runtime": 4.7207,
1718
+ "eval_qnli-contrastive_samples_per_second": 317.752,
1719
+ "eval_qnli-contrastive_steps_per_second": 13.346,
1720
+ "step": 20300
1721
+ },
1722
+ {
1723
+ "epoch": 3.6519564052170805,
1724
+ "grad_norm": 71.68439483642578,
1725
+ "learning_rate": 1.5561826303886085e-05,
1726
+ "loss": 0.9698,
1727
+ "step": 20440
1728
+ },
1729
+ {
1730
+ "epoch": 3.676969805252814,
1731
+ "grad_norm": 5.957241058349609,
1732
+ "learning_rate": 1.4719999343741618e-05,
1733
+ "loss": 0.9148,
1734
+ "step": 20580
1735
+ },
1736
+ {
1737
+ "epoch": 3.7019832052885473,
1738
+ "grad_norm": 1.4626597166061401,
1739
+ "learning_rate": 1.3839147028686583e-05,
1740
+ "loss": 0.9042,
1741
+ "step": 20720
1742
+ },
1743
+ {
1744
+ "epoch": 3.726996605324281,
1745
+ "grad_norm": 2.4634809494018555,
1746
+ "learning_rate": 1.2915097668067934e-05,
1747
+ "loss": 0.8232,
1748
+ "step": 20860
1749
+ },
1750
+ {
1751
+ "epoch": 3.752010005360014,
1752
+ "grad_norm": 1.5838899612426758,
1753
+ "learning_rate": 1.196294424410312e-05,
1754
+ "loss": 1.0163,
1755
+ "step": 21000
1756
+ },
1757
+ {
1758
+ "epoch": 3.752010005360014,
1759
+ "eval_nli-pairs_loss": 0.9020450115203857,
1760
+ "eval_nli-pairs_runtime": 12.2572,
1761
+ "eval_nli-pairs_samples_per_second": 122.377,
1762
+ "eval_nli-pairs_steps_per_second": 5.14,
1763
+ "step": 21000
1764
+ },
1765
+ {
1766
+ "epoch": 3.752010005360014,
1767
+ "eval_scitail-pairs-pos_loss": 0.4573577046394348,
1768
+ "eval_scitail-pairs-pos_runtime": 15.1478,
1769
+ "eval_scitail-pairs-pos_samples_per_second": 86.085,
1770
+ "eval_scitail-pairs-pos_steps_per_second": 3.631,
1771
+ "step": 21000
1772
+ },
1773
+ {
1774
+ "epoch": 3.752010005360014,
1775
+ "eval_qnli-contrastive_loss": 1.2882591485977173,
1776
+ "eval_qnli-contrastive_runtime": 4.762,
1777
+ "eval_qnli-contrastive_samples_per_second": 314.992,
1778
+ "eval_qnli-contrastive_steps_per_second": 13.23,
1779
+ "step": 21000
1780
+ },
1781
+ {
1782
+ "epoch": 3.777023405395748,
1783
+ "grad_norm": 5.878975868225098,
1784
+ "learning_rate": 1.099186633949893e-05,
1785
+ "loss": 0.9735,
1786
+ "step": 21140
1787
+ },
1788
+ {
1789
+ "epoch": 3.8020368054314813,
1790
+ "grad_norm": 10.22749137878418,
1791
+ "learning_rate": 1.0011225985326909e-05,
1792
+ "loss": 0.8371,
1793
+ "step": 21280
1794
+ },
1795
+ {
1796
+ "epoch": 3.8270502054672146,
1797
+ "grad_norm": 8.895988464355469,
1798
+ "learning_rate": 9.030477402944833e-06,
1799
+ "loss": 0.6344,
1800
+ "step": 21420
1801
+ },
1802
+ {
1803
+ "epoch": 3.852063605502948,
1804
+ "grad_norm": 1.564530372619629,
1805
+ "learning_rate": 8.059075857124063e-06,
1806
+ "loss": 0.87,
1807
+ "step": 21560
1808
+ },
1809
+ {
1810
+ "epoch": 3.8770770055386814,
1811
+ "grad_norm": 3.3526771068573,
1812
+ "learning_rate": 7.106386499117424e-06,
1813
+ "loss": 0.7404,
1814
+ "step": 21700
1815
+ },
1816
+ {
1817
+ "epoch": 3.8770770055386814,
1818
+ "eval_nli-pairs_loss": 0.8661152720451355,
1819
+ "eval_nli-pairs_runtime": 11.9159,
1820
+ "eval_nli-pairs_samples_per_second": 125.883,
1821
+ "eval_nli-pairs_steps_per_second": 5.287,
1822
+ "step": 21700
1823
+ },
1824
+ {
1825
+ "epoch": 3.8770770055386814,
1826
+ "eval_scitail-pairs-pos_loss": 0.4352877140045166,
1827
+ "eval_scitail-pairs-pos_runtime": 14.9412,
1828
+ "eval_scitail-pairs-pos_samples_per_second": 87.275,
1829
+ "eval_scitail-pairs-pos_steps_per_second": 3.681,
1830
+ "step": 21700
1831
+ },
1832
+ {
1833
+ "epoch": 3.8770770055386814,
1834
+ "eval_qnli-contrastive_loss": 1.0643585920333862,
1835
+ "eval_qnli-contrastive_runtime": 4.7458,
1836
+ "eval_qnli-contrastive_samples_per_second": 316.066,
1837
+ "eval_qnli-contrastive_steps_per_second": 13.275,
1838
+ "step": 21700
1839
+ },
1840
+ {
1841
+ "epoch": 3.9020904055744148,
1842
+ "grad_norm": 6.517562389373779,
1843
+ "learning_rate": 6.181594078499504e-06,
1844
+ "loss": 0.8486,
1845
+ "step": 21840
1846
+ },
1847
+ {
1848
+ "epoch": 3.927103805610148,
1849
+ "grad_norm": 4.482045650482178,
1850
+ "learning_rate": 5.293614394235034e-06,
1851
+ "loss": 0.8895,
1852
+ "step": 21980
1853
+ },
1854
+ {
1855
+ "epoch": 3.952117205645882,
1856
+ "grad_norm": 5.165999889373779,
1857
+ "learning_rate": 4.451008338663955e-06,
1858
+ "loss": 0.7476,
1859
+ "step": 22120
1860
+ },
1861
+ {
1862
+ "epoch": 3.977130605681615,
1863
+ "grad_norm": 7.821371078491211,
1864
+ "learning_rate": 3.6618993630932396e-06,
1865
+ "loss": 0.6761,
1866
+ "step": 22260
1867
  }
1868
  ],
1869
  "logging_steps": 140,