mtzig committed
Commit c5a961f · verified · 1 Parent(s): fe2e3cb

Training in progress, step 3614, checkpoint

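The changed files below are Git LFS pointers (version / oid sha256 / size) for a sharded FSDP checkpoint under last-checkpoint/: per-rank optimizer and model shards (*.distcp), per-rank RNG states, the LR scheduler, and trainer_state.json. As a rough, illustrative sketch of how such a checkpoint folder is typically pulled and resumed (not something this commit documents), the repo id, pattern, and training setup below are placeholders:

```python
# Illustrative sketch only: download this checkpoint folder and resume training
# with the same Trainer/FSDP configuration and world size that produced it.
# "<user>/<repo>" is a placeholder, not taken from this commit.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="<user>/<repo>",               # placeholder repo id
    revision="c5a961f",                    # the commit above (use the full SHA if needed)
    allow_patterns=["last-checkpoint/*"],  # assumed pattern: only the checkpoint folder
)

# trainer = Trainer(model=..., args=..., train_dataset=...)  # same setup as the original run
# trainer.train(resume_from_checkpoint=f"{local_dir}/last-checkpoint")
```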
last-checkpoint/optimizer_0/.metadata CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:268444beddbb5a7d91ef5cb5b665db6aef7c7fcec333ad0c3a08c686565a8913
+ oid sha256:5b57402c9a04b6a83c55631793729abf4135e066e940007e5be32d1b580969c5
  size 1130174
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c087ac61d068ff86b9310239fcb0b763d36d2fb265ae2f65972b0892c62e815b
+ oid sha256:3b082226244870565d28873eba1d507851f0af49f4366f9a37b971141a812bac
  size 7242420036
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:289e64957b824caccb5a7d36e8929fef7b62cf8fcec2dd4c9b69df6d6d2ad84e
+ oid sha256:94d305a995adc93c165aa366b5cfa7cea152c21931cf4b250d0e0f76cf977dc6
  size 7242473280
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fc3a9e94aa60029ab2b26eee275332b5602a587d09513e314517fe265b826336
+ oid sha256:ce18db0ad733a422365d20b5d06639b0c2e8c082890dcee42f3551c057730408
  size 7242469280
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:75ffef6d56758276b78cf9eebe93d4d83914fa79fcaeca75ae94133640be3b29
+ oid sha256:2d1fd6f4a58d8be9b8f4ed9c00096312ac2a19be2b05a83ef344c3caa6fddfb0
  size 7242469280
last-checkpoint/optimizer_0/__4_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5d01e5b9ddf802f6a422646b056a23b3fbe05ff93b6930b6755300be4894c712
+ oid sha256:1dc057d11daf173591f444f3e42edce09b92d5f8fa6cf2ce68d3ec7654d90c72
  size 7242471556
last-checkpoint/optimizer_0/__5_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:16fe6a883206ff7b9651bc19a06d8b3987ddbd9a9e8cbbe9c11d875741d9dc6a
+ oid sha256:a6f87ef3065fc4b31182b03a4a17a669e4d62016bfcaf05113ea4793e1d37546
  size 7242471556
last-checkpoint/optimizer_0/__6_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:420747a58261d11f13d0b7877ca8d5a836aedbcf81b1ab915458eca1c5b7b07c
+ oid sha256:3aec250e9248ff1407735a2e38d1f42ca07be095eb7d003ce955c101c4bf37f4
  size 7242471556
last-checkpoint/optimizer_0/__7_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cd539001454d1aecf6ff0aed4e849255d703a67f16dc4fba8c31a28ccd1fd027
+ oid sha256:2f74c7304e82f6b6ca55a4662ed1108321ff9c5805d048fa74b7eced022291ad
  size 7242481476
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7b32cd75a5673f4c46449f0ff87db37d4991506ede421aab3f9b20a60edbaa45
+ oid sha256:96a514e755628059dd0e872f51e9e9c03303f32c8bb807e60ede865d5074b952
  size 3621209428
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dd4dc89c486499582d0239f57be38d061d43385604d9dca4aba765cf2241536c
+ oid sha256:4dabe2309a6a1a18a27265cbb83de5cbf207f35f6ef8f3af19f22dcb76735a00
  size 3621209428
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:34dfdc92dc5c4d7743496ab404c7b2bdb4ef766a1ecc160e4b587f3a3148a09e
+ oid sha256:4a6c9137d8b880c8044701a18add7422c303241d293c1b566c8d7be63bbda599
  size 3621209428
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4c2d077edf81f8a2e0faaffb9ad04610ca0596326e8669b526b260742e7cb4d5
+ oid sha256:6ec44558c788ed40d962a7a7ac7d47f797d88247ed258682966e1699e97fb500
  size 3621209428
last-checkpoint/pytorch_model_fsdp_0/__4_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f138bb4a538672b2e4a60a1a043a7f1e4fb6d41e05a6a12867f22b2f6b4443b2
+ oid sha256:bda62e5d3c9978ebc867b9a5f6e80ddeeb08c36f0e4c0f708e3b4650f841d014
  size 3621209428
last-checkpoint/pytorch_model_fsdp_0/__5_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c55b02b75216a84dc5bc094cc9fe1eb4bce52ed53eb5b4d8422a4980092ef06e
+ oid sha256:79b845162dd458f08b347feb21b321cc26bd49f8a90ec6692679f7cc11b5843f
  size 3621209428
last-checkpoint/pytorch_model_fsdp_0/__6_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:60733eb18a71043c1601a1e3d88059714c1609dc45ea18a81ee2ed0f06026b16
+ oid sha256:293577d7261da8e39c01fb14ec56c5de9511e969763ae8a2c7e45eae07bb561c
  size 3621209428
last-checkpoint/pytorch_model_fsdp_0/__7_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8ebe351a9e7adc275647873f0d026749cd90c2d7f0b02b4f54e8db4db1c3b1d7
+ oid sha256:b52705274f60d2ba6542de0450be411273d3dedeffe53a0afb4913227822ee7a
  size 3621209428
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:37563542a2053d121a18e38b53d5eacbd5a8a56ffc762d2fc6ba3aca1c345c28
+ oid sha256:d567092e95857ec2cef0d42902ce0b7b850534369c357b926f52de75e6483b0c
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:22b4d19df971cb0e10833e6468d4e75e98f2bb9f15fba9ac69c676842f44a7bf
+ oid sha256:a43106d8f7ea7f1b60a91f9c8dae3af8f4578d4a265510b906d37cd15ce8b30e
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8f4075d6bc44efbb6da438685827fb7ea74011b7ee8a7952293c08412e393ec0
+ oid sha256:a2aebd5c68731a8f11af25059034403042750f2005fa4358349f244ccdff4a5d
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cb2b5faa03ec4380b2492cab429fbb33ad1cf1dc9709616dc45076680e1d9489
+ oid sha256:d6d2d85ac2d4aae4a01ac2269d0aad572db524a435d3c1422869e17cc061f2f8
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:02d3942e4518f17813014944d7923a9ea06fd5afadf183e6f63ff2ea9f10296b
+ oid sha256:148f3aa738ce382dd8aa49c18096e0644495a5e19f8e8545441dda2bef5afd1c
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:618b7aa8c086dac7361b603ef9fd226ea34d2b62e3b0380b72515bed54d74b1c
+ oid sha256:6bfb030b9b6b13727bb580725f96bd18f81f6aa676eb9c08437ece83091d55a2
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:526882e40ab43ef441655b29b6feed50978eb1b4f317dbf6dc5727bab57b68b7
+ oid sha256:0ae2f442a63b70c5786d8b8bc2857bc39fb849023f943cc0d7c5d4a9ba3108f4
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e6109b90c545c3aa0c18e3d098415feeff62366f202fa6219bc0cab14f5e269b
+ oid sha256:6333d466bbde25a38a25343ca06136fd75295a54967090d2fe4c85ebb3b769ee
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:bffb650b01ff286af25ed32d89dc7e672eba21627ca5382acef7e640641ba621
+ oid sha256:21662799391bd7e52a3f2ab8ae98718c240562ca5104178c8186801ae79acc59
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.9683890157017362,
+ "epoch": 0.9999308293560213,
  "eval_steps": 100,
- "global_step": 3500,
+ "global_step": 3614,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -24939,6 +24939,816 @@
  "eval_samples_per_second": 6.662,
  "eval_steps_per_second": 0.236,
  "step": 3500
+ },
24943
+ {
24944
+ "epoch": 0.9686656982776509,
24945
+ "grad_norm": 5.235546112060547,
24946
+ "learning_rate": 4.9179685389096896e-08,
24947
+ "loss": 0.3635,
24948
+ "step": 3501
24949
+ },
24950
+ {
24951
+ "epoch": 0.9689423808535658,
24952
+ "grad_norm": 6.3581223487854,
24953
+ "learning_rate": 4.831449832053525e-08,
24954
+ "loss": 0.443,
24955
+ "step": 3502
24956
+ },
24957
+ {
24958
+ "epoch": 0.9692190634294805,
24959
+ "grad_norm": 8.401195526123047,
24960
+ "learning_rate": 4.74569523379137e-08,
24961
+ "loss": 0.4533,
24962
+ "step": 3503
24963
+ },
24964
+ {
24965
+ "epoch": 0.9694957460053953,
24966
+ "grad_norm": 10.938070297241211,
24967
+ "learning_rate": 4.6607048764534814e-08,
24968
+ "loss": 0.4868,
24969
+ "step": 3504
24970
+ },
24971
+ {
24972
+ "epoch": 0.9697724285813101,
24973
+ "grad_norm": 5.621330261230469,
24974
+ "learning_rate": 4.5764788911908384e-08,
24975
+ "loss": 0.4099,
24976
+ "step": 3505
24977
+ },
24978
+ {
24979
+ "epoch": 0.9700491111572249,
24980
+ "grad_norm": 8.117724418640137,
24981
+ "learning_rate": 4.493017407975087e-08,
24982
+ "loss": 0.419,
24983
+ "step": 3506
24984
+ },
24985
+ {
24986
+ "epoch": 0.9703257937331397,
24987
+ "grad_norm": 6.391302108764648,
24988
+ "learning_rate": 4.410320555597869e-08,
24989
+ "loss": 0.4502,
24990
+ "step": 3507
24991
+ },
24992
+ {
24993
+ "epoch": 0.9706024763090544,
24994
+ "grad_norm": 12.152544975280762,
24995
+ "learning_rate": 4.328388461671107e-08,
24996
+ "loss": 0.5602,
24997
+ "step": 3508
24998
+ },
24999
+ {
25000
+ "epoch": 0.9708791588849692,
25001
+ "grad_norm": 11.282763481140137,
25002
+ "learning_rate": 4.247221252626499e-08,
25003
+ "loss": 0.5866,
25004
+ "step": 3509
25005
+ },
25006
+ {
25007
+ "epoch": 0.971155841460884,
25008
+ "grad_norm": 6.5668253898620605,
25009
+ "learning_rate": 4.166819053715521e-08,
25010
+ "loss": 0.397,
25011
+ "step": 3510
25012
+ },
25013
+ {
25014
+ "epoch": 0.9714325240367988,
25015
+ "grad_norm": 6.569908618927002,
25016
+ "learning_rate": 4.087181989008926e-08,
25017
+ "loss": 0.4887,
25018
+ "step": 3511
25019
+ },
25020
+ {
25021
+ "epoch": 0.9717092066127135,
25022
+ "grad_norm": 9.562713623046875,
25023
+ "learning_rate": 4.0083101813970794e-08,
25024
+ "loss": 0.3468,
25025
+ "step": 3512
25026
+ },
25027
+ {
25028
+ "epoch": 0.9719858891886284,
25029
+ "grad_norm": 9.6434907913208,
25030
+ "learning_rate": 3.93020375258929e-08,
25031
+ "loss": 0.36,
25032
+ "step": 3513
25033
+ },
25034
+ {
25035
+ "epoch": 0.9722625717645431,
25036
+ "grad_norm": 11.750981330871582,
25037
+ "learning_rate": 3.852862823113701e-08,
25038
+ "loss": 0.4,
25039
+ "step": 3514
25040
+ },
25041
+ {
25042
+ "epoch": 0.972539254340458,
25043
+ "grad_norm": 6.20719575881958,
25044
+ "learning_rate": 3.776287512317345e-08,
25045
+ "loss": 0.3826,
25046
+ "step": 3515
25047
+ },
25048
+ {
25049
+ "epoch": 0.9728159369163727,
25050
+ "grad_norm": 5.036905288696289,
25051
+ "learning_rate": 3.7004779383657543e-08,
25052
+ "loss": 0.3524,
25053
+ "step": 3516
25054
+ },
25055
+ {
25056
+ "epoch": 0.9730926194922874,
25057
+ "grad_norm": 10.913909912109375,
25058
+ "learning_rate": 3.6254342182428515e-08,
25059
+ "loss": 0.3848,
25060
+ "step": 3517
25061
+ },
25062
+ {
25063
+ "epoch": 0.9733693020682023,
25064
+ "grad_norm": 5.970122814178467,
25065
+ "learning_rate": 3.5511564677506715e-08,
25066
+ "loss": 0.3321,
25067
+ "step": 3518
25068
+ },
25069
+ {
25070
+ "epoch": 0.973645984644117,
25071
+ "grad_norm": 11.325397491455078,
25072
+ "learning_rate": 3.477644801509306e-08,
25073
+ "loss": 0.4813,
25074
+ "step": 3519
25075
+ },
25076
+ {
25077
+ "epoch": 0.9739226672200318,
25078
+ "grad_norm": 6.1236066818237305,
25079
+ "learning_rate": 3.404899332956735e-08,
25080
+ "loss": 0.3705,
25081
+ "step": 3520
25082
+ },
25083
+ {
25084
+ "epoch": 0.9741993497959466,
25085
+ "grad_norm": 7.407733917236328,
25086
+ "learning_rate": 3.332920174348497e-08,
25087
+ "loss": 0.4437,
25088
+ "step": 3521
25089
+ },
25090
+ {
25091
+ "epoch": 0.9744760323718614,
25092
+ "grad_norm": 6.992598533630371,
25093
+ "learning_rate": 3.2617074367576886e-08,
25094
+ "loss": 0.4798,
25095
+ "step": 3522
25096
+ },
25097
+ {
25098
+ "epoch": 0.9747527149477762,
25099
+ "grad_norm": 8.392826080322266,
25100
+ "learning_rate": 3.1912612300747384e-08,
25101
+ "loss": 0.3594,
25102
+ "step": 3523
25103
+ },
25104
+ {
25105
+ "epoch": 0.975029397523691,
25106
+ "grad_norm": 6.5278639793396,
25107
+ "learning_rate": 3.121581663007134e-08,
25108
+ "loss": 0.4254,
25109
+ "step": 3524
25110
+ },
25111
+ {
25112
+ "epoch": 0.9753060800996057,
25113
+ "grad_norm": 6.965699195861816,
25114
+ "learning_rate": 3.052668843079365e-08,
25115
+ "loss": 0.3341,
25116
+ "step": 3525
25117
+ },
25118
+ {
25119
+ "epoch": 0.9755827626755205,
25120
+ "grad_norm": 6.369235992431641,
25121
+ "learning_rate": 2.984522876632812e-08,
25122
+ "loss": 0.4223,
25123
+ "step": 3526
25124
+ },
25125
+ {
25126
+ "epoch": 0.9758594452514353,
25127
+ "grad_norm": 5.943597793579102,
25128
+ "learning_rate": 2.9171438688254118e-08,
25129
+ "loss": 0.4617,
25130
+ "step": 3527
25131
+ },
25132
+ {
25133
+ "epoch": 0.97613612782735,
25134
+ "grad_norm": 8.69240665435791,
25135
+ "learning_rate": 2.850531923631661e-08,
25136
+ "loss": 0.4274,
25137
+ "step": 3528
25138
+ },
25139
+ {
25140
+ "epoch": 0.9764128104032649,
25141
+ "grad_norm": 6.936566352844238,
25142
+ "learning_rate": 2.784687143842224e-08,
25143
+ "loss": 0.3652,
25144
+ "step": 3529
25145
+ },
25146
+ {
25147
+ "epoch": 0.9766894929791796,
25148
+ "grad_norm": 6.269064426422119,
25149
+ "learning_rate": 2.7196096310641573e-08,
25150
+ "loss": 0.4139,
25151
+ "step": 3530
25152
+ },
25153
+ {
25154
+ "epoch": 0.9769661755550945,
25155
+ "grad_norm": 7.397728443145752,
25156
+ "learning_rate": 2.6552994857204083e-08,
25157
+ "loss": 0.4978,
25158
+ "step": 3531
25159
+ },
25160
+ {
25161
+ "epoch": 0.9772428581310092,
25162
+ "grad_norm": 6.353926181793213,
25163
+ "learning_rate": 2.5917568070496503e-08,
25164
+ "loss": 0.4468,
25165
+ "step": 3532
25166
+ },
25167
+ {
25168
+ "epoch": 0.9775195407069239,
25169
+ "grad_norm": 6.348305702209473,
25170
+ "learning_rate": 2.528981693106558e-08,
25171
+ "loss": 0.4326,
25172
+ "step": 3533
25173
+ },
25174
+ {
25175
+ "epoch": 0.9777962232828388,
25176
+ "grad_norm": 7.442327499389648,
25177
+ "learning_rate": 2.4669742407610332e-08,
25178
+ "loss": 0.4463,
25179
+ "step": 3534
25180
+ },
25181
+ {
25182
+ "epoch": 0.9780729058587535,
25183
+ "grad_norm": 6.542540073394775,
25184
+ "learning_rate": 2.4057345456987013e-08,
25185
+ "loss": 0.4501,
25186
+ "step": 3535
25187
+ },
25188
+ {
25189
+ "epoch": 0.9783495884346683,
25190
+ "grad_norm": 6.655422210693359,
25191
+ "learning_rate": 2.3452627024200815e-08,
25192
+ "loss": 0.4371,
25193
+ "step": 3536
25194
+ },
25195
+ {
25196
+ "epoch": 0.9786262710105831,
25197
+ "grad_norm": 6.123124599456787,
25198
+ "learning_rate": 2.2855588042410838e-08,
25199
+ "loss": 0.4332,
25200
+ "step": 3537
25201
+ },
25202
+ {
25203
+ "epoch": 0.9789029535864979,
25204
+ "grad_norm": 6.827947616577148,
25205
+ "learning_rate": 2.226622943292567e-08,
25206
+ "loss": 0.4926,
25207
+ "step": 3538
25208
+ },
25209
+ {
25210
+ "epoch": 0.9791796361624127,
25211
+ "grad_norm": 4.61018705368042,
25212
+ "learning_rate": 2.1684552105199485e-08,
25213
+ "loss": 0.4004,
25214
+ "step": 3539
25215
+ },
25216
+ {
25217
+ "epoch": 0.9794563187383275,
25218
+ "grad_norm": 8.030633926391602,
25219
+ "learning_rate": 2.1110556956835394e-08,
25220
+ "loss": 0.3661,
25221
+ "step": 3540
25222
+ },
25223
+ {
25224
+ "epoch": 0.9797330013142422,
25225
+ "grad_norm": 5.738858222961426,
25226
+ "learning_rate": 2.0544244873582643e-08,
25227
+ "loss": 0.5086,
25228
+ "step": 3541
25229
+ },
25230
+ {
25231
+ "epoch": 0.980009683890157,
25232
+ "grad_norm": 6.600244998931885,
25233
+ "learning_rate": 1.9985616729332747e-08,
25234
+ "loss": 0.391,
25235
+ "step": 3542
25236
+ },
25237
+ {
25238
+ "epoch": 0.9802863664660718,
25239
+ "grad_norm": 5.6170806884765625,
25240
+ "learning_rate": 1.9434673386120594e-08,
25241
+ "loss": 0.4199,
25242
+ "step": 3543
25243
+ },
25244
+ {
25245
+ "epoch": 0.9805630490419865,
25246
+ "grad_norm": 6.685068607330322,
25247
+ "learning_rate": 1.889141569412223e-08,
25248
+ "loss": 0.4244,
25249
+ "step": 3544
25250
+ },
25251
+ {
25252
+ "epoch": 0.9808397316179014,
25253
+ "grad_norm": 8.005044937133789,
25254
+ "learning_rate": 1.8355844491654284e-08,
25255
+ "loss": 0.4274,
25256
+ "step": 3545
25257
+ },
25258
+ {
25259
+ "epoch": 0.9811164141938161,
25260
+ "grad_norm": 9.169407844543457,
25261
+ "learning_rate": 1.7827960605171778e-08,
25262
+ "loss": 0.4674,
25263
+ "step": 3546
25264
+ },
25265
+ {
25266
+ "epoch": 0.981393096769731,
25267
+ "grad_norm": 8.508087158203125,
25268
+ "learning_rate": 1.7307764849266996e-08,
25269
+ "loss": 0.4041,
25270
+ "step": 3547
25271
+ },
25272
+ {
25273
+ "epoch": 0.9816697793456457,
25274
+ "grad_norm": 7.333049774169922,
25275
+ "learning_rate": 1.679525802666948e-08,
25276
+ "loss": 0.3855,
25277
+ "step": 3548
25278
+ },
25279
+ {
25280
+ "epoch": 0.9819464619215605,
25281
+ "grad_norm": 5.400395393371582,
25282
+ "learning_rate": 1.6290440928241613e-08,
25283
+ "loss": 0.4063,
25284
+ "step": 3549
25285
+ },
25286
+ {
25287
+ "epoch": 0.9822231444974753,
25288
+ "grad_norm": 8.128486633300781,
25289
+ "learning_rate": 1.5793314332982477e-08,
25290
+ "loss": 0.3734,
25291
+ "step": 3550
25292
+ },
25293
+ {
25294
+ "epoch": 0.98249982707339,
25295
+ "grad_norm": 7.244846343994141,
25296
+ "learning_rate": 1.5303879008021773e-08,
25297
+ "loss": 0.3945,
25298
+ "step": 3551
25299
+ },
25300
+ {
25301
+ "epoch": 0.9827765096493049,
25302
+ "grad_norm": 7.506839275360107,
25303
+ "learning_rate": 1.482213570861979e-08,
25304
+ "loss": 0.4032,
25305
+ "step": 3552
25306
+ },
25307
+ {
25308
+ "epoch": 0.9830531922252196,
25309
+ "grad_norm": 6.991761207580566,
25310
+ "learning_rate": 1.4348085178169658e-08,
25311
+ "loss": 0.358,
25312
+ "step": 3553
25313
+ },
25314
+ {
25315
+ "epoch": 0.9833298748011344,
25316
+ "grad_norm": 6.618139266967773,
25317
+ "learning_rate": 1.3881728148191775e-08,
25318
+ "loss": 0.4842,
25319
+ "step": 3554
25320
+ },
25321
+ {
25322
+ "epoch": 0.9836065573770492,
25323
+ "grad_norm": 5.5989274978637695,
25324
+ "learning_rate": 1.3423065338334373e-08,
25325
+ "loss": 0.3639,
25326
+ "step": 3555
25327
+ },
25328
+ {
25329
+ "epoch": 0.983883239952964,
25330
+ "grad_norm": 10.364370346069336,
25331
+ "learning_rate": 1.2972097456373512e-08,
25332
+ "loss": 0.3677,
25333
+ "step": 3556
25334
+ },
25335
+ {
25336
+ "epoch": 0.9841599225288787,
25337
+ "grad_norm": 6.9868340492248535,
25338
+ "learning_rate": 1.2528825198210304e-08,
25339
+ "loss": 0.4279,
25340
+ "step": 3557
25341
+ },
25342
+ {
25343
+ "epoch": 0.9844366051047935,
25344
+ "grad_norm": 7.0398125648498535,
25345
+ "learning_rate": 1.209324924787092e-08,
25346
+ "loss": 0.4194,
25347
+ "step": 3558
25348
+ },
25349
+ {
25350
+ "epoch": 0.9847132876807083,
25351
+ "grad_norm": 4.859399795532227,
25352
+ "learning_rate": 1.1665370277504917e-08,
25353
+ "loss": 0.4016,
25354
+ "step": 3559
25355
+ },
25356
+ {
25357
+ "epoch": 0.984989970256623,
25358
+ "grad_norm": 6.274821758270264,
25359
+ "learning_rate": 1.1245188947384133e-08,
25360
+ "loss": 0.4823,
25361
+ "step": 3560
25362
+ },
25363
+ {
25364
+ "epoch": 0.9852666528325379,
25365
+ "grad_norm": 7.054055213928223,
25366
+ "learning_rate": 1.083270590590213e-08,
25367
+ "loss": 0.4286,
25368
+ "step": 3561
25369
+ },
25370
+ {
25371
+ "epoch": 0.9855433354084526,
25372
+ "grad_norm": 4.351117134094238,
25373
+ "learning_rate": 1.0427921789573636e-08,
25374
+ "loss": 0.39,
25375
+ "step": 3562
25376
+ },
25377
+ {
25378
+ "epoch": 0.9858200179843675,
25379
+ "grad_norm": 8.033738136291504,
25380
+ "learning_rate": 1.003083722303233e-08,
25381
+ "loss": 0.4421,
25382
+ "step": 3563
25383
+ },
25384
+ {
25385
+ "epoch": 0.9860967005602822,
25386
+ "grad_norm": 4.982435703277588,
25387
+ "learning_rate": 9.641452819030283e-09,
25388
+ "loss": 0.332,
25389
+ "step": 3564
25390
+ },
25391
+ {
25392
+ "epoch": 0.986373383136197,
25393
+ "grad_norm": 15.608251571655273,
25394
+ "learning_rate": 9.259769178438516e-09,
25395
+ "loss": 0.3892,
25396
+ "step": 3565
25397
+ },
25398
+ {
25399
+ "epoch": 0.9866500657121118,
25400
+ "grad_norm": 7.015858173370361,
25401
+ "learning_rate": 8.885786890242554e-09,
25402
+ "loss": 0.3854,
25403
+ "step": 3566
25404
+ },
25405
+ {
25406
+ "epoch": 0.9869267482880265,
25407
+ "grad_norm": 10.827554702758789,
25408
+ "learning_rate": 8.519506531545763e-09,
25409
+ "loss": 0.5023,
25410
+ "step": 3567
25411
+ },
25412
+ {
25413
+ "epoch": 0.9872034308639414,
25414
+ "grad_norm": 5.95730447769165,
25415
+ "learning_rate": 8.160928667566015e-09,
25416
+ "loss": 0.4019,
25417
+ "step": 3568
25418
+ },
25419
+ {
25420
+ "epoch": 0.9874801134398561,
25421
+ "grad_norm": 10.602973937988281,
25422
+ "learning_rate": 7.81005385163458e-09,
25423
+ "loss": 0.4717,
25424
+ "step": 3569
25425
+ },
25426
+ {
25427
+ "epoch": 0.9877567960157709,
25428
+ "grad_norm": 6.7663164138793945,
25429
+ "learning_rate": 7.466882625196126e-09,
25430
+ "loss": 0.3858,
25431
+ "step": 3570
25432
+ },
25433
+ {
25434
+ "epoch": 0.9880334785916857,
25435
+ "grad_norm": 14.224672317504883,
25436
+ "learning_rate": 7.13141551780816e-09,
25437
+ "loss": 0.587,
25438
+ "step": 3571
25439
+ },
25440
+ {
25441
+ "epoch": 0.9883101611676005,
25442
+ "grad_norm": 10.713641166687012,
25443
+ "learning_rate": 6.803653047138814e-09,
25444
+ "loss": 0.379,
25445
+ "step": 3572
25446
+ },
25447
+ {
25448
+ "epoch": 0.9885868437435152,
25449
+ "grad_norm": 6.506649494171143,
25450
+ "learning_rate": 6.48359571896906e-09,
25451
+ "loss": 0.3868,
25452
+ "step": 3573
25453
+ },
25454
+ {
25455
+ "epoch": 0.9888635263194301,
25456
+ "grad_norm": 5.888006210327148,
25457
+ "learning_rate": 6.171244027187162e-09,
25458
+ "loss": 0.4599,
25459
+ "step": 3574
25460
+ },
25461
+ {
25462
+ "epoch": 0.9891402088953448,
25463
+ "grad_norm": 4.954348087310791,
25464
+ "learning_rate": 5.866598453792005e-09,
25465
+ "loss": 0.4112,
25466
+ "step": 3575
25467
+ },
25468
+ {
25469
+ "epoch": 0.9894168914712596,
25470
+ "grad_norm": 6.747551441192627,
25471
+ "learning_rate": 5.569659468891431e-09,
25472
+ "loss": 0.3918,
25473
+ "step": 3576
25474
+ },
25475
+ {
25476
+ "epoch": 0.9896935740471744,
25477
+ "grad_norm": 6.1678972244262695,
25478
+ "learning_rate": 5.2804275306994615e-09,
25479
+ "loss": 0.49,
25480
+ "step": 3577
25481
+ },
25482
+ {
25483
+ "epoch": 0.9899702566230891,
25484
+ "grad_norm": 9.625385284423828,
25485
+ "learning_rate": 4.998903085539075e-09,
25486
+ "loss": 0.4239,
25487
+ "step": 3578
25488
+ },
25489
+ {
25490
+ "epoch": 0.990246939199004,
25491
+ "grad_norm": 12.304625511169434,
25492
+ "learning_rate": 4.7250865678377665e-09,
25493
+ "loss": 0.4733,
25494
+ "step": 3579
25495
+ },
25496
+ {
25497
+ "epoch": 0.9905236217749187,
25498
+ "grad_norm": 7.027407646179199,
25499
+ "learning_rate": 4.458978400130321e-09,
25500
+ "loss": 0.5059,
25501
+ "step": 3580
25502
+ },
25503
+ {
25504
+ "epoch": 0.9908003043508335,
25505
+ "grad_norm": 7.810775279998779,
25506
+ "learning_rate": 4.200578993054927e-09,
25507
+ "loss": 0.5052,
25508
+ "step": 3581
25509
+ },
25510
+ {
25511
+ "epoch": 0.9910769869267483,
25512
+ "grad_norm": 8.920899391174316,
25513
+ "learning_rate": 3.9498887453559565e-09,
25514
+ "loss": 0.4596,
25515
+ "step": 3582
25516
+ },
25517
+ {
25518
+ "epoch": 0.991353669502663,
25519
+ "grad_norm": 5.510407447814941,
25520
+ "learning_rate": 3.70690804387952e-09,
25521
+ "loss": 0.3717,
25522
+ "step": 3583
25523
+ },
25524
+ {
25525
+ "epoch": 0.9916303520785779,
25526
+ "grad_norm": 10.58802604675293,
25527
+ "learning_rate": 3.4716372635767993e-09,
25528
+ "loss": 0.3158,
25529
+ "step": 3584
25530
+ },
25531
+ {
25532
+ "epoch": 0.9919070346544926,
25533
+ "grad_norm": 6.024365425109863,
25534
+ "learning_rate": 3.2440767675007144e-09,
25535
+ "loss": 0.4396,
25536
+ "step": 3585
25537
+ },
25538
+ {
25539
+ "epoch": 0.9921837172304074,
25540
+ "grad_norm": 4.717132091522217,
25541
+ "learning_rate": 3.024226906805927e-09,
25542
+ "loss": 0.3467,
25543
+ "step": 3586
25544
+ },
25545
+ {
25546
+ "epoch": 0.9924603998063222,
25547
+ "grad_norm": 6.302797794342041,
25548
+ "learning_rate": 2.8120880207493928e-09,
25549
+ "loss": 0.4866,
25550
+ "step": 3587
25551
+ },
25552
+ {
25553
+ "epoch": 0.992737082382237,
25554
+ "grad_norm": 5.059561252593994,
25555
+ "learning_rate": 2.607660436688697e-09,
25556
+ "loss": 0.4122,
25557
+ "step": 3588
25558
+ },
25559
+ {
25560
+ "epoch": 0.9930137649581517,
25561
+ "grad_norm": 7.106605052947998,
25562
+ "learning_rate": 2.4109444700815e-09,
25563
+ "loss": 0.4918,
25564
+ "step": 3589
25565
+ },
25566
+ {
25567
+ "epoch": 0.9932904475340666,
25568
+ "grad_norm": 5.022238731384277,
25569
+ "learning_rate": 2.221940424485536e-09,
25570
+ "loss": 0.3677,
25571
+ "step": 3590
25572
+ },
25573
+ {
25574
+ "epoch": 0.9935671301099813,
25575
+ "grad_norm": 6.2519330978393555,
25576
+ "learning_rate": 2.040648591559169e-09,
25577
+ "loss": 0.482,
25578
+ "step": 3591
25579
+ },
25580
+ {
25581
+ "epoch": 0.993843812685896,
25582
+ "grad_norm": 7.059159278869629,
25583
+ "learning_rate": 1.8670692510580625e-09,
25584
+ "loss": 0.44,
25585
+ "step": 3592
25586
+ },
25587
+ {
25588
+ "epoch": 0.9941204952618109,
25589
+ "grad_norm": 7.916522026062012,
25590
+ "learning_rate": 1.7012026708373985e-09,
25591
+ "loss": 0.5168,
25592
+ "step": 3593
25593
+ },
25594
+ {
25595
+ "epoch": 0.9943971778377256,
25596
+ "grad_norm": 8.189726829528809,
25597
+ "learning_rate": 1.5430491068513243e-09,
25598
+ "loss": 0.4161,
25599
+ "step": 3594
25600
+ },
25601
+ {
25602
+ "epoch": 0.9946738604136405,
25603
+ "grad_norm": 10.01470947265625,
25604
+ "learning_rate": 1.3926088031507302e-09,
25605
+ "loss": 0.4104,
25606
+ "step": 3595
25607
+ },
25608
+ {
25609
+ "epoch": 0.9949505429895552,
25610
+ "grad_norm": 6.116405010223389,
25611
+ "learning_rate": 1.2498819918843609e-09,
25612
+ "loss": 0.4398,
25613
+ "step": 3596
25614
+ },
25615
+ {
25616
+ "epoch": 0.99522722556547,
25617
+ "grad_norm": 5.669225692749023,
25618
+ "learning_rate": 1.1148688932977047e-09,
25619
+ "loss": 0.4811,
25620
+ "step": 3597
25621
+ },
25622
+ {
25623
+ "epoch": 0.9955039081413848,
25624
+ "grad_norm": 7.954200267791748,
25625
+ "learning_rate": 9.875697157329945e-10,
25626
+ "loss": 0.5057,
25627
+ "step": 3598
25628
+ },
25629
+ {
25630
+ "epoch": 0.9957805907172996,
25631
+ "grad_norm": 12.605840682983398,
25632
+ "learning_rate": 8.679846556303162e-10,
25633
+ "loss": 0.4993,
25634
+ "step": 3599
25635
+ },
25636
+ {
25637
+ "epoch": 0.9960572732932144,
25638
+ "grad_norm": 9.73613166809082,
25639
+ "learning_rate": 7.561138975242798e-10,
25640
+ "loss": 0.4514,
25641
+ "step": 3600
25642
+ },
25643
+ {
25644
+ "epoch": 0.9960572732932144,
25645
+ "eval_accuracy": 0.7272727272727273,
25646
+ "eval_f1": 0.3137254901960784,
25647
+ "eval_loss": 0.6381392478942871,
25648
+ "eval_precision": 0.5454545454545454,
25649
+ "eval_recall": 0.22018348623853212,
25650
+ "eval_runtime": 118.7108,
25651
+ "eval_samples_per_second": 1.904,
25652
+ "eval_steps_per_second": 0.067,
25653
+ "step": 3600
25654
+ },
25655
+ {
25656
+ "epoch": 0.9963339558691291,
25657
+ "grad_norm": 5.538408279418945,
25658
+ "learning_rate": 6.519576140451289e-10,
25659
+ "loss": 0.4153,
25660
+ "step": 3601
25661
+ },
25662
+ {
25663
+ "epoch": 0.9966106384450439,
25664
+ "grad_norm": 5.18894624710083,
25665
+ "learning_rate": 5.555159659204057e-10,
25666
+ "loss": 0.4861,
25667
+ "step": 3602
25668
+ },
25669
+ {
25670
+ "epoch": 0.9968873210209587,
25671
+ "grad_norm": 7.266717910766602,
25672
+ "learning_rate": 4.667891019710657e-10,
25673
+ "loss": 0.4072,
25674
+ "step": 3603
25675
+ },
25676
+ {
25677
+ "epoch": 0.9971640035968735,
25678
+ "grad_norm": 8.036751747131348,
25679
+ "learning_rate": 3.857771591142534e-10,
25680
+ "loss": 0.4333,
25681
+ "step": 3604
25682
+ },
25683
+ {
25684
+ "epoch": 0.9974406861727882,
25685
+ "grad_norm": 9.265700340270996,
25686
+ "learning_rate": 3.124802623627465e-10,
25687
+ "loss": 0.4393,
25688
+ "step": 3605
25689
+ },
25690
+ {
25691
+ "epoch": 0.9977173687487031,
25692
+ "grad_norm": 5.40745210647583,
25693
+ "learning_rate": 2.4689852482162604e-10,
25694
+ "loss": 0.3861,
25695
+ "step": 3606
25696
+ },
25697
+ {
25698
+ "epoch": 0.9979940513246178,
25699
+ "grad_norm": 5.892364025115967,
25700
+ "learning_rate": 1.8903204769271655e-10,
25701
+ "loss": 0.4592,
25702
+ "step": 3607
25703
+ },
25704
+ {
25705
+ "epoch": 0.9982707339005326,
25706
+ "grad_norm": 8.214534759521484,
25707
+ "learning_rate": 1.38880920271256e-10,
25708
+ "loss": 0.4237,
25709
+ "step": 3608
25710
+ },
25711
+ {
25712
+ "epoch": 0.9985474164764474,
25713
+ "grad_norm": 7.651143550872803,
25714
+ "learning_rate": 9.64452199464505e-11,
25715
+ "loss": 0.3782,
25716
+ "step": 3609
25717
+ },
25718
+ {
25719
+ "epoch": 0.9988240990523621,
25720
+ "grad_norm": 6.732639312744141,
25721
+ "learning_rate": 6.172501220313986e-11,
25722
+ "loss": 0.4235,
25723
+ "step": 3610
25724
+ },
25725
+ {
25726
+ "epoch": 0.999100781628277,
25727
+ "grad_norm": 11.96877670288086,
25728
+ "learning_rate": 3.472035061791168e-11,
25729
+ "loss": 0.484,
25730
+ "step": 3611
25731
+ },
25732
+ {
25733
+ "epoch": 0.9993774642041917,
25734
+ "grad_norm": 6.621433258056641,
25735
+ "learning_rate": 1.5431276862987176e-11,
25736
+ "loss": 0.4221,
25737
+ "step": 3612
25738
+ },
25739
+ {
25740
+ "epoch": 0.9996541467801066,
25741
+ "grad_norm": 9.467500686645508,
25742
+ "learning_rate": 3.857820704000759e-12,
25743
+ "loss": 0.4852,
25744
+ "step": 3613
25745
+ },
25746
+ {
25747
+ "epoch": 0.9999308293560213,
25748
+ "grad_norm": 6.072620391845703,
25749
+ "learning_rate": 0.0,
25750
+ "loss": 0.401,
25751
+ "step": 3614
  }
  ],
  "logging_steps": 1,
@@ -24953,12 +25763,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 8.311866995716915e+17,
+ "total_flos": 8.582232535864443e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
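The updated trainer_state.json above ends the run at global_step 3614, epoch ≈ 0.99993, with should_training_stop flipped to true. A minimal sketch (not part of this commit) for checking those fields after downloading the checkpoint; it reads only keys that appear in the diff, and the relative path assumes the last-checkpoint/ layout shown here:

```python
# Minimal sketch: print the final counters recorded in this checkpoint.
# Assumes the checkpoint folder has been downloaded to the working directory.
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print(state["global_step"])  # 3614
print(state["epoch"])        # ~0.99993, i.e. the end of the first epoch
print(state["total_flos"])   # 8.582232535864443e+17
```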