mtzig commited on
Commit
29deb5c
1 Parent(s): 168605c

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d943c6bfd2dc2b761b4d682134e0a0fc60ac1cb4096855e5091cc3393184aa64
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e10dec0995b4018139261c78cb605e216971a771e66475c5f1914653fb2f4f57
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:376a9db1af520346798c868246148f3564e7f951f971cfad89c922d341bf7f29
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bde6aac845bb5c182bad863715db5dede22d6eb24ed72999510b1888a0a5420
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9178eab5bc585c22cd46ed2fc1e92f4fdda57d7a3fa8d58230990de0c4d1f153
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3988d837c1f874ca300a2c11da38001b137b3c884ca5e57fcf34a2cf824ef294
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:664f4c51ad8b8db2543ac5506c908df582362cb40fdd8ba94c8d4d17fd478154
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3211e23064c226038f91dc981175a1f92dd37c501b0bb7e4bec35acc01edf597
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21d1ac001b88f8a5c52ed311d48d65c35c0b16a38d8e46e3f8f798f890a0ff73
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:930c7f8d60843cbcdbe3e428179d2be4cca5088da91f304d10ae4907f47fe926
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce135a57bb7016f162e8a5a5cf147734c4de738983a8be7d0e78e3767402b122
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab81432ec16b9b8b19a5a66372867d916f5de9ec1795bdfb765e5d7d340fc43b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b143b6fca120ff625503c29f4c425415e9b350b6c85048892c81f6d44c3563a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3ff546ca73e6c043fb8e31fc6c217b47cfd8607e275c50bfe5af6e950eda9de
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cec8c8fdc32c70be31edda43085207cfa5ae9a7dbb023c61d9fae6f55d607e9e
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d50ee0e5fb035d3dbe45a7bf19d6ca079fbb951d97348876885d6301da88bb46
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6151a2afa91abecb8de37b99e4409d6e56f16b75b43592d3da5abbb3ee272563
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ed8b54d01efdc3e72d86ac303a50711daef44ec1675e30f641e3f85c9bc7d52
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d72bd74ab0857553e9460d81b4abc084b39a8189791c68c03d4ede2cfc8a8c60
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e74e1e2786f1133a3295c5afe55775a0a39c9d03bae3a56983064f39379bcb6
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c5f15126da64891eca473dd1e04d5b41141f581a035a13b14aee5904e6e3f7e
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e405ffd74f6b37faa3d65538d7deaff2ad4d20ad6a87284a70f224a5110eab5
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:135e2ac2b5b60d2eeef0012629f402a00ab445fa2c678e7dedd20b300813acb6
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1951028221e1491837c1c800d2872cc479e1297c5c0b156185679bbffbb91c75
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6f67a0e885145319f81ed1f8c4c49622761e3f92d5ce81c356bbb700855e8e6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af29ed8d410f75e7520cc10681cc1970ed7a3864889fb1e10dd381ad082e6570
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.25773195876288657,
5
  "eval_steps": 20,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1539,6 +1539,766 @@
1539
  "eval_samples_per_second": 5.403,
1540
  "eval_steps_per_second": 0.178,
1541
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1542
  }
1543
  ],
1544
  "logging_steps": 1,
@@ -1558,7 +2318,7 @@
1558
  "attributes": {}
1559
  }
1560
  },
1561
- "total_flos": 6.685140289008435e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3865979381443299,
5
  "eval_steps": 20,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1539
  "eval_samples_per_second": 5.403,
1540
  "eval_steps_per_second": 0.178,
1541
  "step": 200
1542
+ },
1543
+ {
1544
+ "epoch": 0.25902061855670105,
1545
+ "grad_norm": 2.103586435317993,
1546
+ "learning_rate": 1.85063499545063e-05,
1547
+ "loss": 0.013,
1548
+ "step": 201
1549
+ },
1550
+ {
1551
+ "epoch": 0.2603092783505155,
1552
+ "grad_norm": 1.8666274547576904,
1553
+ "learning_rate": 1.8482600359817344e-05,
1554
+ "loss": 0.0245,
1555
+ "step": 202
1556
+ },
1557
+ {
1558
+ "epoch": 0.2615979381443299,
1559
+ "grad_norm": 2.8087830543518066,
1560
+ "learning_rate": 1.8458678927933884e-05,
1561
+ "loss": 0.0187,
1562
+ "step": 203
1563
+ },
1564
+ {
1565
+ "epoch": 0.26288659793814434,
1566
+ "grad_norm": 1.8675556182861328,
1567
+ "learning_rate": 1.843458614344691e-05,
1568
+ "loss": 0.0156,
1569
+ "step": 204
1570
+ },
1571
+ {
1572
+ "epoch": 0.26417525773195877,
1573
+ "grad_norm": 0.7611345648765564,
1574
+ "learning_rate": 1.8410322494418606e-05,
1575
+ "loss": 0.0046,
1576
+ "step": 205
1577
+ },
1578
+ {
1579
+ "epoch": 0.2654639175257732,
1580
+ "grad_norm": 1.599369764328003,
1581
+ "learning_rate": 1.8385888472372474e-05,
1582
+ "loss": 0.0328,
1583
+ "step": 206
1584
+ },
1585
+ {
1586
+ "epoch": 0.2667525773195876,
1587
+ "grad_norm": 0.5751793384552002,
1588
+ "learning_rate": 1.8361284572283356e-05,
1589
+ "loss": 0.0049,
1590
+ "step": 207
1591
+ },
1592
+ {
1593
+ "epoch": 0.26804123711340205,
1594
+ "grad_norm": 1.689562439918518,
1595
+ "learning_rate": 1.833651129256742e-05,
1596
+ "loss": 0.0157,
1597
+ "step": 208
1598
+ },
1599
+ {
1600
+ "epoch": 0.2693298969072165,
1601
+ "grad_norm": 0.5357356071472168,
1602
+ "learning_rate": 1.831156913507206e-05,
1603
+ "loss": 0.0047,
1604
+ "step": 209
1605
+ },
1606
+ {
1607
+ "epoch": 0.2706185567010309,
1608
+ "grad_norm": 1.4983047246932983,
1609
+ "learning_rate": 1.828645860506573e-05,
1610
+ "loss": 0.0487,
1611
+ "step": 210
1612
+ },
1613
+ {
1614
+ "epoch": 0.27190721649484534,
1615
+ "grad_norm": 0.2579003870487213,
1616
+ "learning_rate": 1.826118021122771e-05,
1617
+ "loss": 0.0029,
1618
+ "step": 211
1619
+ },
1620
+ {
1621
+ "epoch": 0.27319587628865977,
1622
+ "grad_norm": 2.843892812728882,
1623
+ "learning_rate": 1.8235734465637794e-05,
1624
+ "loss": 0.0181,
1625
+ "step": 212
1626
+ },
1627
+ {
1628
+ "epoch": 0.27448453608247425,
1629
+ "grad_norm": 1.1521669626235962,
1630
+ "learning_rate": 1.821012188376593e-05,
1631
+ "loss": 0.0221,
1632
+ "step": 213
1633
+ },
1634
+ {
1635
+ "epoch": 0.2757731958762887,
1636
+ "grad_norm": 1.4023137092590332,
1637
+ "learning_rate": 1.8184342984461766e-05,
1638
+ "loss": 0.0097,
1639
+ "step": 214
1640
+ },
1641
+ {
1642
+ "epoch": 0.2770618556701031,
1643
+ "grad_norm": 1.1344298124313354,
1644
+ "learning_rate": 1.8158398289944145e-05,
1645
+ "loss": 0.005,
1646
+ "step": 215
1647
+ },
1648
+ {
1649
+ "epoch": 0.27835051546391754,
1650
+ "grad_norm": 1.5524466037750244,
1651
+ "learning_rate": 1.8132288325790518e-05,
1652
+ "loss": 0.0105,
1653
+ "step": 216
1654
+ },
1655
+ {
1656
+ "epoch": 0.27963917525773196,
1657
+ "grad_norm": 0.8397157192230225,
1658
+ "learning_rate": 1.8106013620926312e-05,
1659
+ "loss": 0.0257,
1660
+ "step": 217
1661
+ },
1662
+ {
1663
+ "epoch": 0.2809278350515464,
1664
+ "grad_norm": 2.6008617877960205,
1665
+ "learning_rate": 1.8079574707614202e-05,
1666
+ "loss": 0.013,
1667
+ "step": 218
1668
+ },
1669
+ {
1670
+ "epoch": 0.2822164948453608,
1671
+ "grad_norm": 0.8384814262390137,
1672
+ "learning_rate": 1.8052972121443337e-05,
1673
+ "loss": 0.0076,
1674
+ "step": 219
1675
+ },
1676
+ {
1677
+ "epoch": 0.28350515463917525,
1678
+ "grad_norm": 1.8078651428222656,
1679
+ "learning_rate": 1.802620640131848e-05,
1680
+ "loss": 0.0138,
1681
+ "step": 220
1682
+ },
1683
+ {
1684
+ "epoch": 0.28350515463917525,
1685
+ "eval_accuracy": 0.9930486593843099,
1686
+ "eval_f1": 0.8833333333333333,
1687
+ "eval_loss": 0.02096499688923359,
1688
+ "eval_precision": 0.8412698412698413,
1689
+ "eval_recall": 0.9298245614035088,
1690
+ "eval_runtime": 85.3923,
1691
+ "eval_samples_per_second": 5.328,
1692
+ "eval_steps_per_second": 0.176,
1693
+ "step": 220
1694
+ },
1695
+ {
1696
+ "epoch": 0.2847938144329897,
1697
+ "grad_norm": 2.1315155029296875,
1698
+ "learning_rate": 1.799927808944911e-05,
1699
+ "loss": 0.0182,
1700
+ "step": 221
1701
+ },
1702
+ {
1703
+ "epoch": 0.2860824742268041,
1704
+ "grad_norm": 1.9272891283035278,
1705
+ "learning_rate": 1.797218773133841e-05,
1706
+ "loss": 0.0152,
1707
+ "step": 222
1708
+ },
1709
+ {
1710
+ "epoch": 0.28737113402061853,
1711
+ "grad_norm": 4.905808448791504,
1712
+ "learning_rate": 1.7944935875772244e-05,
1713
+ "loss": 0.0215,
1714
+ "step": 223
1715
+ },
1716
+ {
1717
+ "epoch": 0.28865979381443296,
1718
+ "grad_norm": 4.647861480712891,
1719
+ "learning_rate": 1.7917523074808024e-05,
1720
+ "loss": 0.0258,
1721
+ "step": 224
1722
+ },
1723
+ {
1724
+ "epoch": 0.28994845360824745,
1725
+ "grad_norm": 1.2799395322799683,
1726
+ "learning_rate": 1.7889949883763532e-05,
1727
+ "loss": 0.0232,
1728
+ "step": 225
1729
+ },
1730
+ {
1731
+ "epoch": 0.2912371134020619,
1732
+ "grad_norm": 2.6159801483154297,
1733
+ "learning_rate": 1.786221686120567e-05,
1734
+ "loss": 0.0332,
1735
+ "step": 226
1736
+ },
1737
+ {
1738
+ "epoch": 0.2925257731958763,
1739
+ "grad_norm": 0.3623534142971039,
1740
+ "learning_rate": 1.7834324568939137e-05,
1741
+ "loss": 0.0031,
1742
+ "step": 227
1743
+ },
1744
+ {
1745
+ "epoch": 0.29381443298969073,
1746
+ "grad_norm": 2.5764312744140625,
1747
+ "learning_rate": 1.7806273571995066e-05,
1748
+ "loss": 0.0209,
1749
+ "step": 228
1750
+ },
1751
+ {
1752
+ "epoch": 0.29510309278350516,
1753
+ "grad_norm": 2.027851104736328,
1754
+ "learning_rate": 1.7778064438619562e-05,
1755
+ "loss": 0.0128,
1756
+ "step": 229
1757
+ },
1758
+ {
1759
+ "epoch": 0.2963917525773196,
1760
+ "grad_norm": 0.7468307614326477,
1761
+ "learning_rate": 1.7749697740262197e-05,
1762
+ "loss": 0.0046,
1763
+ "step": 230
1764
+ },
1765
+ {
1766
+ "epoch": 0.297680412371134,
1767
+ "grad_norm": 1.3534049987792969,
1768
+ "learning_rate": 1.772117405156443e-05,
1769
+ "loss": 0.0137,
1770
+ "step": 231
1771
+ },
1772
+ {
1773
+ "epoch": 0.29896907216494845,
1774
+ "grad_norm": 0.830199658870697,
1775
+ "learning_rate": 1.769249395034797e-05,
1776
+ "loss": 0.0044,
1777
+ "step": 232
1778
+ },
1779
+ {
1780
+ "epoch": 0.3002577319587629,
1781
+ "grad_norm": 1.318949580192566,
1782
+ "learning_rate": 1.7663658017603073e-05,
1783
+ "loss": 0.0156,
1784
+ "step": 233
1785
+ },
1786
+ {
1787
+ "epoch": 0.3015463917525773,
1788
+ "grad_norm": 1.9248756170272827,
1789
+ "learning_rate": 1.7634666837476765e-05,
1790
+ "loss": 0.0379,
1791
+ "step": 234
1792
+ },
1793
+ {
1794
+ "epoch": 0.30283505154639173,
1795
+ "grad_norm": 1.7694895267486572,
1796
+ "learning_rate": 1.7605520997261014e-05,
1797
+ "loss": 0.0142,
1798
+ "step": 235
1799
+ },
1800
+ {
1801
+ "epoch": 0.30412371134020616,
1802
+ "grad_norm": 1.5250486135482788,
1803
+ "learning_rate": 1.757622108738083e-05,
1804
+ "loss": 0.0092,
1805
+ "step": 236
1806
+ },
1807
+ {
1808
+ "epoch": 0.30541237113402064,
1809
+ "grad_norm": 1.273772954940796,
1810
+ "learning_rate": 1.754676770138231e-05,
1811
+ "loss": 0.0216,
1812
+ "step": 237
1813
+ },
1814
+ {
1815
+ "epoch": 0.30670103092783507,
1816
+ "grad_norm": 5.388645172119141,
1817
+ "learning_rate": 1.7517161435920606e-05,
1818
+ "loss": 0.0589,
1819
+ "step": 238
1820
+ },
1821
+ {
1822
+ "epoch": 0.3079896907216495,
1823
+ "grad_norm": 2.30202317237854,
1824
+ "learning_rate": 1.7487402890747843e-05,
1825
+ "loss": 0.016,
1826
+ "step": 239
1827
+ },
1828
+ {
1829
+ "epoch": 0.30927835051546393,
1830
+ "grad_norm": 3.9984192848205566,
1831
+ "learning_rate": 1.7457492668700967e-05,
1832
+ "loss": 0.0141,
1833
+ "step": 240
1834
+ },
1835
+ {
1836
+ "epoch": 0.30927835051546393,
1837
+ "eval_accuracy": 0.9955312810327706,
1838
+ "eval_f1": 0.9203539823008849,
1839
+ "eval_loss": 0.01754908636212349,
1840
+ "eval_precision": 0.9285714285714286,
1841
+ "eval_recall": 0.9122807017543859,
1842
+ "eval_runtime": 86.0275,
1843
+ "eval_samples_per_second": 5.289,
1844
+ "eval_steps_per_second": 0.174,
1845
+ "step": 240
1846
+ },
1847
+ {
1848
+ "epoch": 0.31056701030927836,
1849
+ "grad_norm": 1.2816661596298218,
1850
+ "learning_rate": 1.7427431375689544e-05,
1851
+ "loss": 0.0147,
1852
+ "step": 241
1853
+ },
1854
+ {
1855
+ "epoch": 0.3118556701030928,
1856
+ "grad_norm": 1.6840155124664307,
1857
+ "learning_rate": 1.7397219620683465e-05,
1858
+ "loss": 0.0047,
1859
+ "step": 242
1860
+ },
1861
+ {
1862
+ "epoch": 0.3131443298969072,
1863
+ "grad_norm": 1.563914179801941,
1864
+ "learning_rate": 1.7366858015700626e-05,
1865
+ "loss": 0.017,
1866
+ "step": 243
1867
+ },
1868
+ {
1869
+ "epoch": 0.31443298969072164,
1870
+ "grad_norm": 1.6181697845458984,
1871
+ "learning_rate": 1.7336347175794523e-05,
1872
+ "loss": 0.0137,
1873
+ "step": 244
1874
+ },
1875
+ {
1876
+ "epoch": 0.31572164948453607,
1877
+ "grad_norm": 0.8612284064292908,
1878
+ "learning_rate": 1.73056877190418e-05,
1879
+ "loss": 0.0052,
1880
+ "step": 245
1881
+ },
1882
+ {
1883
+ "epoch": 0.3170103092783505,
1884
+ "grad_norm": 1.467340111732483,
1885
+ "learning_rate": 1.7274880266529716e-05,
1886
+ "loss": 0.0085,
1887
+ "step": 246
1888
+ },
1889
+ {
1890
+ "epoch": 0.31829896907216493,
1891
+ "grad_norm": 1.4295095205307007,
1892
+ "learning_rate": 1.7243925442343578e-05,
1893
+ "loss": 0.0333,
1894
+ "step": 247
1895
+ },
1896
+ {
1897
+ "epoch": 0.31958762886597936,
1898
+ "grad_norm": 0.7592663764953613,
1899
+ "learning_rate": 1.721282387355408e-05,
1900
+ "loss": 0.009,
1901
+ "step": 248
1902
+ },
1903
+ {
1904
+ "epoch": 0.32087628865979384,
1905
+ "grad_norm": 1.1355818510055542,
1906
+ "learning_rate": 1.718157619020462e-05,
1907
+ "loss": 0.0067,
1908
+ "step": 249
1909
+ },
1910
+ {
1911
+ "epoch": 0.32216494845360827,
1912
+ "grad_norm": 1.8645901679992676,
1913
+ "learning_rate": 1.715018302529852e-05,
1914
+ "loss": 0.0126,
1915
+ "step": 250
1916
+ },
1917
+ {
1918
+ "epoch": 0.3234536082474227,
1919
+ "grad_norm": 1.5803511142730713,
1920
+ "learning_rate": 1.711864501478622e-05,
1921
+ "loss": 0.0157,
1922
+ "step": 251
1923
+ },
1924
+ {
1925
+ "epoch": 0.3247422680412371,
1926
+ "grad_norm": 2.9905714988708496,
1927
+ "learning_rate": 1.7086962797552376e-05,
1928
+ "loss": 0.0167,
1929
+ "step": 252
1930
+ },
1931
+ {
1932
+ "epoch": 0.32603092783505155,
1933
+ "grad_norm": 0.2156965732574463,
1934
+ "learning_rate": 1.7055137015402935e-05,
1935
+ "loss": 0.0038,
1936
+ "step": 253
1937
+ },
1938
+ {
1939
+ "epoch": 0.327319587628866,
1940
+ "grad_norm": 1.7128149271011353,
1941
+ "learning_rate": 1.7023168313052118e-05,
1942
+ "loss": 0.0221,
1943
+ "step": 254
1944
+ },
1945
+ {
1946
+ "epoch": 0.3286082474226804,
1947
+ "grad_norm": 0.7819356322288513,
1948
+ "learning_rate": 1.6991057338109376e-05,
1949
+ "loss": 0.0043,
1950
+ "step": 255
1951
+ },
1952
+ {
1953
+ "epoch": 0.32989690721649484,
1954
+ "grad_norm": 2.1492764949798584,
1955
+ "learning_rate": 1.6958804741066254e-05,
1956
+ "loss": 0.0289,
1957
+ "step": 256
1958
+ },
1959
+ {
1960
+ "epoch": 0.33118556701030927,
1961
+ "grad_norm": 1.3265386819839478,
1962
+ "learning_rate": 1.6926411175283227e-05,
1963
+ "loss": 0.0091,
1964
+ "step": 257
1965
+ },
1966
+ {
1967
+ "epoch": 0.3324742268041237,
1968
+ "grad_norm": 1.2150596380233765,
1969
+ "learning_rate": 1.689387729697646e-05,
1970
+ "loss": 0.0065,
1971
+ "step": 258
1972
+ },
1973
+ {
1974
+ "epoch": 0.3337628865979381,
1975
+ "grad_norm": 1.4492149353027344,
1976
+ "learning_rate": 1.686120376520451e-05,
1977
+ "loss": 0.009,
1978
+ "step": 259
1979
+ },
1980
+ {
1981
+ "epoch": 0.33505154639175255,
1982
+ "grad_norm": 0.4440682530403137,
1983
+ "learning_rate": 1.6828391241854983e-05,
1984
+ "loss": 0.0037,
1985
+ "step": 260
1986
+ },
1987
+ {
1988
+ "epoch": 0.33505154639175255,
1989
+ "eval_accuracy": 0.9940417080436942,
1990
+ "eval_f1": 0.896551724137931,
1991
+ "eval_loss": 0.01701418310403824,
1992
+ "eval_precision": 0.8813559322033898,
1993
+ "eval_recall": 0.9122807017543859,
1994
+ "eval_runtime": 86.0037,
1995
+ "eval_samples_per_second": 5.29,
1996
+ "eval_steps_per_second": 0.174,
1997
+ "step": 260
1998
+ },
1999
+ {
2000
+ "epoch": 0.33634020618556704,
2001
+ "grad_norm": 0.5090395212173462,
2002
+ "learning_rate": 1.6795440391631122e-05,
2003
+ "loss": 0.0047,
2004
+ "step": 261
2005
+ },
2006
+ {
2007
+ "epoch": 0.33762886597938147,
2008
+ "grad_norm": 2.755124807357788,
2009
+ "learning_rate": 1.6762351882038342e-05,
2010
+ "loss": 0.0169,
2011
+ "step": 262
2012
+ },
2013
+ {
2014
+ "epoch": 0.3389175257731959,
2015
+ "grad_norm": 2.456214189529419,
2016
+ "learning_rate": 1.6729126383370696e-05,
2017
+ "loss": 0.031,
2018
+ "step": 263
2019
+ },
2020
+ {
2021
+ "epoch": 0.3402061855670103,
2022
+ "grad_norm": 0.8938114047050476,
2023
+ "learning_rate": 1.669576456869733e-05,
2024
+ "loss": 0.0051,
2025
+ "step": 264
2026
+ },
2027
+ {
2028
+ "epoch": 0.34149484536082475,
2029
+ "grad_norm": 0.6057696342468262,
2030
+ "learning_rate": 1.666226711384881e-05,
2031
+ "loss": 0.0029,
2032
+ "step": 265
2033
+ },
2034
+ {
2035
+ "epoch": 0.3427835051546392,
2036
+ "grad_norm": 0.5478299856185913,
2037
+ "learning_rate": 1.6628634697403447e-05,
2038
+ "loss": 0.0026,
2039
+ "step": 266
2040
+ },
2041
+ {
2042
+ "epoch": 0.3440721649484536,
2043
+ "grad_norm": 2.5459206104278564,
2044
+ "learning_rate": 1.6594868000673562e-05,
2045
+ "loss": 0.0217,
2046
+ "step": 267
2047
+ },
2048
+ {
2049
+ "epoch": 0.34536082474226804,
2050
+ "grad_norm": 0.39067503809928894,
2051
+ "learning_rate": 1.6560967707691663e-05,
2052
+ "loss": 0.0034,
2053
+ "step": 268
2054
+ },
2055
+ {
2056
+ "epoch": 0.34664948453608246,
2057
+ "grad_norm": 0.7849224209785461,
2058
+ "learning_rate": 1.6526934505196605e-05,
2059
+ "loss": 0.0059,
2060
+ "step": 269
2061
+ },
2062
+ {
2063
+ "epoch": 0.3479381443298969,
2064
+ "grad_norm": 3.5039610862731934,
2065
+ "learning_rate": 1.649276908261967e-05,
2066
+ "loss": 0.0557,
2067
+ "step": 270
2068
+ },
2069
+ {
2070
+ "epoch": 0.3492268041237113,
2071
+ "grad_norm": 1.609676480293274,
2072
+ "learning_rate": 1.64584721320706e-05,
2073
+ "loss": 0.0124,
2074
+ "step": 271
2075
+ },
2076
+ {
2077
+ "epoch": 0.35051546391752575,
2078
+ "grad_norm": 3.219574213027954,
2079
+ "learning_rate": 1.642404434832358e-05,
2080
+ "loss": 0.0438,
2081
+ "step": 272
2082
+ },
2083
+ {
2084
+ "epoch": 0.35180412371134023,
2085
+ "grad_norm": 2.468843936920166,
2086
+ "learning_rate": 1.6389486428803173e-05,
2087
+ "loss": 0.0084,
2088
+ "step": 273
2089
+ },
2090
+ {
2091
+ "epoch": 0.35309278350515466,
2092
+ "grad_norm": 2.0141680240631104,
2093
+ "learning_rate": 1.635479907357016e-05,
2094
+ "loss": 0.0419,
2095
+ "step": 274
2096
+ },
2097
+ {
2098
+ "epoch": 0.3543814432989691,
2099
+ "grad_norm": 1.8954237699508667,
2100
+ "learning_rate": 1.63199829853074e-05,
2101
+ "loss": 0.0293,
2102
+ "step": 275
2103
+ },
2104
+ {
2105
+ "epoch": 0.3556701030927835,
2106
+ "grad_norm": 5.950355529785156,
2107
+ "learning_rate": 1.6285038869305565e-05,
2108
+ "loss": 0.0224,
2109
+ "step": 276
2110
+ },
2111
+ {
2112
+ "epoch": 0.35695876288659795,
2113
+ "grad_norm": 0.3175673186779022,
2114
+ "learning_rate": 1.624996743344887e-05,
2115
+ "loss": 0.002,
2116
+ "step": 277
2117
+ },
2118
+ {
2119
+ "epoch": 0.3582474226804124,
2120
+ "grad_norm": 1.8034769296646118,
2121
+ "learning_rate": 1.621476938820071e-05,
2122
+ "loss": 0.0107,
2123
+ "step": 278
2124
+ },
2125
+ {
2126
+ "epoch": 0.3595360824742268,
2127
+ "grad_norm": 4.965821743011475,
2128
+ "learning_rate": 1.6179445446589308e-05,
2129
+ "loss": 0.019,
2130
+ "step": 279
2131
+ },
2132
+ {
2133
+ "epoch": 0.36082474226804123,
2134
+ "grad_norm": 2.015825033187866,
2135
+ "learning_rate": 1.6143996324193227e-05,
2136
+ "loss": 0.0076,
2137
+ "step": 280
2138
+ },
2139
+ {
2140
+ "epoch": 0.36082474226804123,
2141
+ "eval_accuracy": 0.9955312810327706,
2142
+ "eval_f1": 0.9230769230769231,
2143
+ "eval_loss": 0.018561244010925293,
2144
+ "eval_precision": 0.9,
2145
+ "eval_recall": 0.9473684210526315,
2146
+ "eval_runtime": 85.7232,
2147
+ "eval_samples_per_second": 5.308,
2148
+ "eval_steps_per_second": 0.175,
2149
+ "step": 280
2150
+ },
2151
+ {
2152
+ "epoch": 0.36211340206185566,
2153
+ "grad_norm": 0.7538387179374695,
2154
+ "learning_rate": 1.6108422739126896e-05,
2155
+ "loss": 0.0053,
2156
+ "step": 281
2157
+ },
2158
+ {
2159
+ "epoch": 0.3634020618556701,
2160
+ "grad_norm": 6.5049309730529785,
2161
+ "learning_rate": 1.6072725412026066e-05,
2162
+ "loss": 0.0248,
2163
+ "step": 282
2164
+ },
2165
+ {
2166
+ "epoch": 0.3646907216494845,
2167
+ "grad_norm": 2.8648669719696045,
2168
+ "learning_rate": 1.6036905066033207e-05,
2169
+ "loss": 0.0055,
2170
+ "step": 283
2171
+ },
2172
+ {
2173
+ "epoch": 0.36597938144329895,
2174
+ "grad_norm": 0.21768106520175934,
2175
+ "learning_rate": 1.6000962426782844e-05,
2176
+ "loss": 0.0011,
2177
+ "step": 284
2178
+ },
2179
+ {
2180
+ "epoch": 0.36726804123711343,
2181
+ "grad_norm": 1.1262462139129639,
2182
+ "learning_rate": 1.596489822238689e-05,
2183
+ "loss": 0.0103,
2184
+ "step": 285
2185
+ },
2186
+ {
2187
+ "epoch": 0.36855670103092786,
2188
+ "grad_norm": 0.8232690095901489,
2189
+ "learning_rate": 1.592871318341986e-05,
2190
+ "loss": 0.0036,
2191
+ "step": 286
2192
+ },
2193
+ {
2194
+ "epoch": 0.3698453608247423,
2195
+ "grad_norm": 3.285132646560669,
2196
+ "learning_rate": 1.5892408042904098e-05,
2197
+ "loss": 0.0073,
2198
+ "step": 287
2199
+ },
2200
+ {
2201
+ "epoch": 0.3711340206185567,
2202
+ "grad_norm": 0.6526831388473511,
2203
+ "learning_rate": 1.585598353629492e-05,
2204
+ "loss": 0.0044,
2205
+ "step": 288
2206
+ },
2207
+ {
2208
+ "epoch": 0.37242268041237114,
2209
+ "grad_norm": 0.3050073981285095,
2210
+ "learning_rate": 1.58194404014657e-05,
2211
+ "loss": 0.0016,
2212
+ "step": 289
2213
+ },
2214
+ {
2215
+ "epoch": 0.37371134020618557,
2216
+ "grad_norm": 4.195672988891602,
2217
+ "learning_rate": 1.5782779378692957e-05,
2218
+ "loss": 0.0229,
2219
+ "step": 290
2220
+ },
2221
+ {
2222
+ "epoch": 0.375,
2223
+ "grad_norm": 2.8954057693481445,
2224
+ "learning_rate": 1.5746001210641316e-05,
2225
+ "loss": 0.0169,
2226
+ "step": 291
2227
+ },
2228
+ {
2229
+ "epoch": 0.37628865979381443,
2230
+ "grad_norm": 0.4121025800704956,
2231
+ "learning_rate": 1.57091066423485e-05,
2232
+ "loss": 0.0027,
2233
+ "step": 292
2234
+ },
2235
+ {
2236
+ "epoch": 0.37757731958762886,
2237
+ "grad_norm": 1.3155614137649536,
2238
+ "learning_rate": 1.5672096421210217e-05,
2239
+ "loss": 0.0139,
2240
+ "step": 293
2241
+ },
2242
+ {
2243
+ "epoch": 0.3788659793814433,
2244
+ "grad_norm": 0.1106419637799263,
2245
+ "learning_rate": 1.5634971296965027e-05,
2246
+ "loss": 0.0008,
2247
+ "step": 294
2248
+ },
2249
+ {
2250
+ "epoch": 0.3801546391752577,
2251
+ "grad_norm": 0.11678878217935562,
2252
+ "learning_rate": 1.5597732021679153e-05,
2253
+ "loss": 0.0008,
2254
+ "step": 295
2255
+ },
2256
+ {
2257
+ "epoch": 0.38144329896907214,
2258
+ "grad_norm": 2.1817727088928223,
2259
+ "learning_rate": 1.5560379349731234e-05,
2260
+ "loss": 0.0171,
2261
+ "step": 296
2262
+ },
2263
+ {
2264
+ "epoch": 0.38273195876288657,
2265
+ "grad_norm": 2.412383556365967,
2266
+ "learning_rate": 1.552291403779707e-05,
2267
+ "loss": 0.0203,
2268
+ "step": 297
2269
+ },
2270
+ {
2271
+ "epoch": 0.38402061855670105,
2272
+ "grad_norm": 2.77812123298645,
2273
+ "learning_rate": 1.5485336844834274e-05,
2274
+ "loss": 0.0134,
2275
+ "step": 298
2276
+ },
2277
+ {
2278
+ "epoch": 0.3853092783505155,
2279
+ "grad_norm": 0.05827389657497406,
2280
+ "learning_rate": 1.544764853206689e-05,
2281
+ "loss": 0.0005,
2282
+ "step": 299
2283
+ },
2284
+ {
2285
+ "epoch": 0.3865979381443299,
2286
+ "grad_norm": 1.8391661643981934,
2287
+ "learning_rate": 1.5409849862969994e-05,
2288
+ "loss": 0.0133,
2289
+ "step": 300
2290
+ },
2291
+ {
2292
+ "epoch": 0.3865979381443299,
2293
+ "eval_accuracy": 0.9975173783515392,
2294
+ "eval_f1": 0.9557522123893806,
2295
+ "eval_loss": 0.015246791765093803,
2296
+ "eval_precision": 0.9642857142857143,
2297
+ "eval_recall": 0.9473684210526315,
2298
+ "eval_runtime": 85.9032,
2299
+ "eval_samples_per_second": 5.297,
2300
+ "eval_steps_per_second": 0.175,
2301
+ "step": 300
2302
  }
2303
  ],
2304
  "logging_steps": 1,
 
2318
  "attributes": {}
2319
  }
2320
  },
2321
+ "total_flos": 1.0011678798669414e+17,
2322
  "train_batch_size": 8,
2323
  "trial_name": null,
2324
  "trial_params": null