Training in progress, epoch 3, checkpoint

Browse files

Files changed (15) hide show

last-checkpoint/model-00001-of-00004.safetensors +1 -1
last-checkpoint/model-00002-of-00004.safetensors +1 -1
last-checkpoint/model-00003-of-00004.safetensors +1 -1
last-checkpoint/model-00004-of-00004.safetensors +1 -1
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state_0.pth +1 -1
last-checkpoint/rng_state_1.pth +1 -1
last-checkpoint/rng_state_2.pth +1 -1
last-checkpoint/rng_state_3.pth +1 -1
last-checkpoint/rng_state_4.pth +1 -1
last-checkpoint/rng_state_5.pth +1 -1
last-checkpoint/rng_state_6.pth +1 -1
last-checkpoint/rng_state_7.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +523 -4

last-checkpoint/model-00001-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8c060b8e42388903f369391791eb7945956545d305e3f2ec2bd4aee7d43c9627
 size 4903351912

 version https://git-lfs.github.com/spec/v1
+oid sha256:44918dfe39ee4b6d085b2fd0ede145b5142b915f183672ae9f4f3a923ec9ace2
 size 4903351912

last-checkpoint/model-00002-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:230396faf4f924420f20c75b719bae88e31df325fc35e94f9fa673dc6bdd9774
 size 4947570872

 version https://git-lfs.github.com/spec/v1
+oid sha256:1906b644e20f7418e0ff8861b4c47c96c8329cbafa385d6221bb549bd49078a6
 size 4947570872

last-checkpoint/model-00003-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b28cd530d6ef53c146ea079e7092af33e6f1bc2e392efdc66ae16e2eae76a988
 size 4962221464

 version https://git-lfs.github.com/spec/v1
+oid sha256:f784163add06741c32f71bfd53242b55e4499fad506d9059b5bffcf9bf06714a
 size 4962221464

last-checkpoint/model-00004-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ac1e152be2252f6499925b2928cee4c632ff7777a037b95b6e31767da1531e4a
 size 3670322200

 version https://git-lfs.github.com/spec/v1
+oid sha256:1c8c9b8ecede7d332419cd9140fecb55d03b3de4fdcc2b353e9acb041598999f
 size 3670322200

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0117f33fed181aa8ea76fa63db6fac08f4883c66b863ad19ae2b40826aef632b
-size 36967230034

 version https://git-lfs.github.com/spec/v1
+oid sha256:69d47d5b8f28bba9ccfb52093efcff4d4e0d7768a82f2cf566cfc4e6be74db24
+size 33781260608

last-checkpoint/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0c461c9d337dfc684e9352ec72bfa344e2f5d377f7cfc4475de9acae294dca89
 size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:69ec6e3926fa071bede113523efa3dc6e630c3c7958c54a9ca321cf4d62ed145
 size 15984

last-checkpoint/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
 size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:f6127ee4f0c13500ec5038fce65af8f7beec63c137c7d4b7c157aa6303cf5879
 size 15984

last-checkpoint/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
 size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:da01d1c5eb2cc3a323f97c1f590d13ccfac2a4c5b1479bd378b4e643304f5a4f
 size 15984

last-checkpoint/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
 size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:49a3f04d76c0d3acc7d3dd95a04215f368f35a451ae8cba8a2fdba38cda9ca0a
 size 15984

last-checkpoint/rng_state_4.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
 size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:df7d2c9825dba80cb544920f8cc0c72122f96514e6cd259052a8765b034393e2
 size 15984

last-checkpoint/rng_state_5.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
 size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:6a20a42d44ff48cc162224010190e898fe28598ddad8cd1896d330a3bb1d8ec3
 size 15984

last-checkpoint/rng_state_6.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
 size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:18ac0dc4f09f25179860561fcea7c5c8f997aabdc46a170665f9dc5a72bc27c6
 size 15984

last-checkpoint/rng_state_7.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
 size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:6a16fcb5411ff961b47eff7378d85105fe9837e0492d19ea5ce3b7c4b77aa3b6
 size 15984

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:82d7549e2049af0aa601c2cb8b24328e3b8070b31576a769b0483a09a01779f5
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:edc9a50bea580864bcd8cfa624a3000d833ef59c0757429ea149c48330d1c567
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.0,
   "eval_steps": 300,
-  "global_step": 732,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1052,6 +1052,525 @@
       "learning_rate": 5.066342480105459e-07,
       "loss": 1.5082,
       "step": 730
     }
   ],
   "logging_steps": 5,
@@ -1066,12 +1585,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 4.791978551408591e+18,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 3.0,
   "eval_steps": 300,
+  "global_step": 1098,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 5.066342480105459e-07,
       "loss": 1.5082,
       "step": 730
+    },
+    {
+      "epoch": 2.0081967213114753,
+      "grad_norm": 1.6484375,
+      "learning_rate": 4.942189840051287e-07,
+      "loss": 1.5054,
+      "step": 735
+    },
+    {
+      "epoch": 2.021857923497268,
+      "grad_norm": 1.84375,
+      "learning_rate": 4.819076099973152e-07,
+      "loss": 1.4999,
+      "step": 740
+    },
+    {
+      "epoch": 2.0355191256830603,
+      "grad_norm": 1.6171875,
+      "learning_rate": 4.697026548060528e-07,
+      "loss": 1.5094,
+      "step": 745
+    },
+    {
+      "epoch": 2.0491803278688523,
+      "grad_norm": 1.703125,
+      "learning_rate": 4.5760662539132077e-07,
+      "loss": 1.5065,
+      "step": 750
+    },
+    {
+      "epoch": 2.0628415300546448,
+      "grad_norm": 1.921875,
+      "learning_rate": 4.4562200633918943e-07,
+      "loss": 1.5013,
+      "step": 755
+    },
+    {
+      "epoch": 2.0765027322404372,
+      "grad_norm": 1.625,
+      "learning_rate": 4.337512593514728e-07,
+      "loss": 1.5041,
+      "step": 760
+    },
+    {
+      "epoch": 2.0901639344262297,
+      "grad_norm": 1.6640625,
+      "learning_rate": 4.2199682274008255e-07,
+      "loss": 1.5083,
+      "step": 765
+    },
+    {
+      "epoch": 2.1038251366120218,
+      "grad_norm": 1.6171875,
+      "learning_rate": 4.103611109261872e-07,
+      "loss": 1.4982,
+      "step": 770
+    },
+    {
+      "epoch": 2.1174863387978142,
+      "grad_norm": 1.6328125,
+      "learning_rate": 3.9884651394427625e-07,
+      "loss": 1.5051,
+      "step": 775
+    },
+    {
+      "epoch": 2.1311475409836067,
+      "grad_norm": 1.671875,
+      "learning_rate": 3.8745539695123577e-07,
+      "loss": 1.4924,
+      "step": 780
+    },
+    {
+      "epoch": 2.1448087431693987,
+      "grad_norm": 1.65625,
+      "learning_rate": 3.761900997405332e-07,
+      "loss": 1.5113,
+      "step": 785
+    },
+    {
+      "epoch": 2.158469945355191,
+      "grad_norm": 1.6484375,
+      "learning_rate": 3.6505293626161127e-07,
+      "loss": 1.5058,
+      "step": 790
+    },
+    {
+      "epoch": 2.1721311475409837,
+      "grad_norm": 1.625,
+      "learning_rate": 3.5404619414459147e-07,
+      "loss": 1.5019,
+      "step": 795
+    },
+    {
+      "epoch": 2.185792349726776,
+      "grad_norm": 1.640625,
+      "learning_rate": 3.4317213423038384e-07,
+      "loss": 1.4948,
+      "step": 800
+    },
+    {
+      "epoch": 2.199453551912568,
+      "grad_norm": 1.625,
+      "learning_rate": 3.32432990106297e-07,
+      "loss": 1.5092,
+      "step": 805
+    },
+    {
+      "epoch": 2.2131147540983607,
+      "grad_norm": 1.734375,
+      "learning_rate": 3.2183096764724914e-07,
+      "loss": 1.5048,
+      "step": 810
+    },
+    {
+      "epoch": 2.226775956284153,
+      "grad_norm": 1.8984375,
+      "learning_rate": 3.1136824456267006e-07,
+      "loss": 1.5005,
+      "step": 815
+    },
+    {
+      "epoch": 2.240437158469945,
+      "grad_norm": 1.7421875,
+      "learning_rate": 3.01046969949188e-07,
+      "loss": 1.5097,
+      "step": 820
+    },
+    {
+      "epoch": 2.2540983606557377,
+      "grad_norm": 1.6640625,
+      "learning_rate": 2.908692638491945e-07,
+      "loss": 1.5054,
+      "step": 825
+    },
+    {
+      "epoch": 2.26775956284153,
+      "grad_norm": 1.65625,
+      "learning_rate": 2.80837216815378e-07,
+      "loss": 1.5131,
+      "step": 830
+    },
+    {
+      "epoch": 2.281420765027322,
+      "grad_norm": 1.671875,
+      "learning_rate": 2.7095288948131114e-07,
+      "loss": 1.4999,
+      "step": 835
+    },
+    {
+      "epoch": 2.2950819672131146,
+      "grad_norm": 1.6328125,
+      "learning_rate": 2.6121831213818826e-07,
+      "loss": 1.4989,
+      "step": 840
+    },
+    {
+      "epoch": 2.308743169398907,
+      "grad_norm": 1.6953125,
+      "learning_rate": 2.51635484317793e-07,
+      "loss": 1.495,
+      "step": 845
+    },
+    {
+      "epoch": 2.3224043715846996,
+      "grad_norm": 1.703125,
+      "learning_rate": 2.4220637438178313e-07,
+      "loss": 1.5125,
+      "step": 850
+    },
+    {
+      "epoch": 2.3360655737704916,
+      "grad_norm": 1.703125,
+      "learning_rate": 2.3293291911738078e-07,
+      "loss": 1.5092,
+      "step": 855
+    },
+    {
+      "epoch": 2.349726775956284,
+      "grad_norm": 1.6640625,
+      "learning_rate": 2.2381702333954433e-07,
+      "loss": 1.4965,
+      "step": 860
+    },
+    {
+      "epoch": 2.3633879781420766,
+      "grad_norm": 1.6328125,
+      "learning_rate": 2.148605594997115e-07,
+      "loss": 1.497,
+      "step": 865
+    },
+    {
+      "epoch": 2.3770491803278686,
+      "grad_norm": 1.765625,
+      "learning_rate": 2.0606536730118763e-07,
+      "loss": 1.5081,
+      "step": 870
+    },
+    {
+      "epoch": 2.390710382513661,
+      "grad_norm": 1.625,
+      "learning_rate": 1.9743325332126105e-07,
+      "loss": 1.5091,
+      "step": 875
+    },
+    {
+      "epoch": 2.4043715846994536,
+      "grad_norm": 1.6171875,
+      "learning_rate": 1.8896599064012298e-07,
+      "loss": 1.5045,
+      "step": 880
+    },
+    {
+      "epoch": 2.418032786885246,
+      "grad_norm": 1.640625,
+      "learning_rate": 1.8066531847666888e-07,
+      "loss": 1.5008,
+      "step": 885
+    },
+    {
+      "epoch": 2.431693989071038,
+      "grad_norm": 1.65625,
+      "learning_rate": 1.7253294183125222e-07,
+      "loss": 1.511,
+      "step": 890
+    },
+    {
+      "epoch": 2.4453551912568305,
+      "grad_norm": 1.6328125,
+      "learning_rate": 1.645705311354697e-07,
+      "loss": 1.4998,
+      "step": 895
+    },
+    {
+      "epoch": 2.459016393442623,
+      "grad_norm": 1.65625,
+      "learning_rate": 1.5677972190904621e-07,
+      "loss": 1.5021,
+      "step": 900
+    },
+    {
+      "epoch": 2.459016393442623,
+      "eval_loss": 1.5163270235061646,
+      "eval_runtime": 19.6134,
+      "eval_samples_per_second": 36.098,
+      "eval_steps_per_second": 1.173,
+      "step": 900
+    },
+    {
+      "epoch": 2.4726775956284155,
+      "grad_norm": 1.6328125,
+      "learning_rate": 1.4916211442389048e-07,
+      "loss": 1.502,
+      "step": 905
+    },
+    {
+      "epoch": 2.4863387978142075,
+      "grad_norm": 1.625,
+      "learning_rate": 1.4171927337539104e-07,
+      "loss": 1.5012,
+      "step": 910
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 1.640625,
+      "learning_rate": 1.344527275610202e-07,
+      "loss": 1.5019,
+      "step": 915
+    },
+    {
+      "epoch": 2.5136612021857925,
+      "grad_norm": 1.640625,
+      "learning_rate": 1.273639695663108e-07,
+      "loss": 1.5085,
+      "step": 920
+    },
+    {
+      "epoch": 2.527322404371585,
+      "grad_norm": 1.75,
+      "learning_rate": 1.204544554582716e-07,
+      "loss": 1.4973,
+      "step": 925
+    },
+    {
+      "epoch": 2.540983606557377,
+      "grad_norm": 1.671875,
+      "learning_rate": 1.1372560448630375e-07,
+      "loss": 1.5037,
+      "step": 930
+    },
+    {
+      "epoch": 2.5546448087431695,
+      "grad_norm": 1.65625,
+      "learning_rate": 1.0717879879068004e-07,
+      "loss": 1.5005,
+      "step": 935
+    },
+    {
+      "epoch": 2.5683060109289615,
+      "grad_norm": 1.6796875,
+      "learning_rate": 1.0081538311864568e-07,
+      "loss": 1.5024,
+      "step": 940
+    },
+    {
+      "epoch": 2.581967213114754,
+      "grad_norm": 1.625,
+      "learning_rate": 9.463666454820118e-08,
+      "loss": 1.4988,
+      "step": 945
+    },
+    {
+      "epoch": 2.5956284153005464,
+      "grad_norm": 1.625,
+      "learning_rate": 8.864391221962064e-08,
+      "loss": 1.5053,
+      "step": 950
+    },
+    {
+      "epoch": 2.609289617486339,
+      "grad_norm": 1.6328125,
+      "learning_rate": 8.28383570747655e-08,
+      "loss": 1.5044,
+      "step": 955
+    },
+    {
+      "epoch": 2.6229508196721314,
+      "grad_norm": 1.609375,
+      "learning_rate": 7.722119160424112e-08,
+      "loss": 1.4995,
+      "step": 960
+    },
+    {
+      "epoch": 2.6366120218579234,
+      "grad_norm": 1.640625,
+      "learning_rate": 7.179356960245409e-08,
+      "loss": 1.5122,
+      "step": 965
+    },
+    {
+      "epoch": 2.650273224043716,
+      "grad_norm": 1.7890625,
+      "learning_rate": 6.655660593061718e-08,
+      "loss": 1.5054,
+      "step": 970
+    },
+    {
+      "epoch": 2.663934426229508,
+      "grad_norm": 1.6484375,
+      "learning_rate": 6.151137628775049e-08,
+      "loss": 1.5108,
+      "step": 975
+    },
+    {
+      "epoch": 2.6775956284153004,
+      "grad_norm": 1.609375,
+      "learning_rate": 5.665891698972769e-08,
+      "loss": 1.5003,
+      "step": 980
+    },
+    {
+      "epoch": 2.691256830601093,
+      "grad_norm": 1.640625,
+      "learning_rate": 5.200022475641153e-08,
+      "loss": 1.5015,
+      "step": 985
+    },
+    {
+      "epoch": 2.7049180327868854,
+      "grad_norm": 1.609375,
+      "learning_rate": 4.75362565069225e-08,
+      "loss": 1.5002,
+      "step": 990
+    },
+    {
+      "epoch": 2.718579234972678,
+      "grad_norm": 1.640625,
+      "learning_rate": 4.326792916308242e-08,
+      "loss": 1.5029,
+      "step": 995
+    },
+    {
+      "epoch": 2.73224043715847,
+      "grad_norm": 1.7421875,
+      "learning_rate": 3.919611946107493e-08,
+      "loss": 1.5068,
+      "step": 1000
+    },
+    {
+      "epoch": 2.7459016393442623,
+      "grad_norm": 1.921875,
+      "learning_rate": 3.532166377135814e-08,
+      "loss": 1.4961,
+      "step": 1005
+    },
+    {
+      "epoch": 2.7595628415300544,
+      "grad_norm": 1.6953125,
+      "learning_rate": 3.164535792687095e-08,
+      "loss": 1.5,
+      "step": 1010
+    },
+    {
+      "epoch": 2.773224043715847,
+      "grad_norm": 1.6171875,
+      "learning_rate": 2.8167957059564095e-08,
+      "loss": 1.5035,
+      "step": 1015
+    },
+    {
+      "epoch": 2.7868852459016393,
+      "grad_norm": 1.6484375,
+      "learning_rate": 2.4890175445293147e-08,
+      "loss": 1.5007,
+      "step": 1020
+    },
+    {
+      "epoch": 2.800546448087432,
+      "grad_norm": 1.6328125,
+      "learning_rate": 2.1812686357101428e-08,
+      "loss": 1.505,
+      "step": 1025
+    },
+    {
+      "epoch": 2.8142076502732243,
+      "grad_norm": 1.6484375,
+      "learning_rate": 1.8936121926927507e-08,
+      "loss": 1.5066,
+      "step": 1030
+    },
+    {
+      "epoch": 2.8278688524590163,
+      "grad_norm": 1.640625,
+      "learning_rate": 1.6261073015761072e-08,
+      "loss": 1.502,
+      "step": 1035
+    },
+    {
+      "epoch": 2.841530054644809,
+      "grad_norm": 1.75,
+      "learning_rate": 1.3788089092277688e-08,
+      "loss": 1.5016,
+      "step": 1040
+    },
+    {
+      "epoch": 2.855191256830601,
+      "grad_norm": 1.65625,
+      "learning_rate": 1.1517678119975061e-08,
+      "loss": 1.5024,
+      "step": 1045
+    },
+    {
+      "epoch": 2.8688524590163933,
+      "grad_norm": 1.609375,
+      "learning_rate": 9.450306452834178e-09,
+      "loss": 1.5,
+      "step": 1050
+    },
+    {
+      "epoch": 2.8825136612021858,
+      "grad_norm": 1.734375,
+      "learning_rate": 7.586398739528932e-09,
+      "loss": 1.4989,
+      "step": 1055
+    },
+    {
+      "epoch": 2.8961748633879782,
+      "grad_norm": 1.7890625,
+      "learning_rate": 5.926337836199891e-09,
+      "loss": 1.507,
+      "step": 1060
+    },
+    {
+      "epoch": 2.9098360655737707,
+      "grad_norm": 1.7734375,
+      "learning_rate": 4.470464727814538e-09,
+      "loss": 1.5119,
+      "step": 1065
+    },
+    {
+      "epoch": 2.9234972677595628,
+      "grad_norm": 1.7734375,
+      "learning_rate": 3.219078458127078e-09,
+      "loss": 1.5072,
+      "step": 1070
+    },
+    {
+      "epoch": 2.9371584699453552,
+      "grad_norm": 1.6328125,
+      "learning_rate": 2.172436068252792e-09,
+      "loss": 1.5074,
+      "step": 1075
+    },
+    {
+      "epoch": 2.9508196721311473,
+      "grad_norm": 1.6171875,
+      "learning_rate": 1.330752543871161e-09,
+      "loss": 1.4947,
+      "step": 1080
+    },
+    {
+      "epoch": 2.9644808743169397,
+      "grad_norm": 1.671875,
+      "learning_rate": 6.942007710665221e-10,
+      "loss": 1.5025,
+      "step": 1085
+    },
+    {
+      "epoch": 2.978142076502732,
+      "grad_norm": 1.6640625,
+      "learning_rate": 2.6291150081603207e-10,
+      "loss": 1.5014,
+      "step": 1090
+    },
+    {
+      "epoch": 2.9918032786885247,
+      "grad_norm": 1.609375,
+      "learning_rate": 3.697332213348225e-11,
+      "loss": 1.4964,
+      "step": 1095
     }
   ],
   "logging_steps": 5,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 7.187967826039144e+18,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null