mtzig commited on
Commit
cfb1295
·
verified ·
1 Parent(s): 8cbecd3

Training in progress, step 900, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1780f9c1185f675cd6d51e82c1e80ca4f3e247aa087747947454cf738a5d4c69
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:751cf08e904d158afe5d5c376833ae0e82507272c605040cee2892e5b08babd7
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5c1cde5d8aa7aeb944e3e90997bd082e98d4ca837e32246771d3c564f3795b8
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06a90cd633a16bc937002e25bf17189511b59b8e8d6bd000662a0b07bbf80e65
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b83c11bec463dc59bf896173c919dd90bf0348066e44adb9b0e543295330a1bf
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a2a1314f17bbfab7f1e0d63c5f4ad16ed3c81d2546ede535c552f819fd2b3ab
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8fd391b9627b47091cd87960049ac26042dde18861c1d7e32eb8c1118797bb8
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61ce145ff3942d4b7afa3864d5060ecb372e5343ad7abd8681f9ef04e3996a27
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe0b23737c45689a0672974138d2e4bf3250efbe4a01af660359c0407af9bb21
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef11d2a12800199b371850d31a6f25caf9ed6263ace1a113ebdb48708b504181
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10e2f0e46255077284fd1723812107259e93b0ddedaac70ce788c07045f96453
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:146b0c5dbeeaf44008b0996c6e5385da4bd543d15c17518bdbadad49738050cf
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75eb0c742069dd550c1d5ea0a95a5d6f378d2b8b113ac677ef2cdd84c0e57d0b
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce829e48185824cbdd9332cfc8db3ea7a52a07e0542cfb9ceb1a123cf238c986
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcb86dc908dae8fcffc64fbd5f755a25779848603655d0b14f508407e36d14b3
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ebc6b5b3bf3ff4db9103e1f6c2a534b04905118e03682dcfd5b2651cf5023a9
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bbdc461acddb68778ff93a3106863b49db625d394bc9812c902dbf4a3f0decb
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de551c8b1c7165a88a996c1e2c9ccef2b9c716ed5dd27dbead3aed497f875baf
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84aacbbc43943b2ed2d3eeb03d889744c56f76845a7d96e49fa7bff5da5be3b6
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d7ac6617d0f4bedb4a49c51f3499536a49a714fbf47c2bf0f3ef31d2fc421c6
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa47b6b07773b1a1fc22394cf526cb8aba69b558d0c5e9d7eeb7487fded1884b
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52e8353d56a8d408800aeacfb35920a2641f5b231344cfc6e9685c5fff1b02a0
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae426b79fc7504b3415de1b203e94c3df18eeaa89bc726cbd2ff1e2fdd6c95ce
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e436a7cd1ba66004593b56f4b7ea09bd56b9555226a359189f1b562635d9c755
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8412f5078d5372d361046df0c90d05d473909fc8ab1aa58766d872c8114b5c70
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7754cc7953f3b6df72a3313e3aea13311d99a52571c658951fa15bc741d1fcff
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6779661016949152,
5
  "eval_steps": 20,
6
- "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6099,6 +6099,766 @@
6099
  "eval_samples_per_second": 5.538,
6100
  "eval_steps_per_second": 0.19,
6101
  "step": 800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6102
  }
6103
  ],
6104
  "logging_steps": 1,
@@ -6118,7 +6878,7 @@
6118
  "attributes": {}
6119
  }
6120
  },
6121
- "total_flos": 2.46322680561664e+17,
6122
  "train_batch_size": 8,
6123
  "trial_name": null,
6124
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7627118644067796,
5
  "eval_steps": 20,
6
+ "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6099
  "eval_samples_per_second": 5.538,
6100
  "eval_steps_per_second": 0.19,
6101
  "step": 800
6102
+ },
6103
+ {
6104
+ "epoch": 0.6788135593220339,
6105
+ "grad_norm": 0.8634885549545288,
6106
+ "learning_rate": 5.6535492087643904e-06,
6107
+ "loss": 0.0026,
6108
+ "step": 801
6109
+ },
6110
+ {
6111
+ "epoch": 0.6796610169491526,
6112
+ "grad_norm": 2.032689094543457,
6113
+ "learning_rate": 5.626926795411447e-06,
6114
+ "loss": 0.01,
6115
+ "step": 802
6116
+ },
6117
+ {
6118
+ "epoch": 0.6805084745762712,
6119
+ "grad_norm": 0.10548463463783264,
6120
+ "learning_rate": 5.6003426501798665e-06,
6121
+ "loss": 0.0005,
6122
+ "step": 803
6123
+ },
6124
+ {
6125
+ "epoch": 0.6813559322033899,
6126
+ "grad_norm": 1.6854254007339478,
6127
+ "learning_rate": 5.573797005703601e-06,
6128
+ "loss": 0.0037,
6129
+ "step": 804
6130
+ },
6131
+ {
6132
+ "epoch": 0.6822033898305084,
6133
+ "grad_norm": 0.280429869890213,
6134
+ "learning_rate": 5.547290094279687e-06,
6135
+ "loss": 0.001,
6136
+ "step": 805
6137
+ },
6138
+ {
6139
+ "epoch": 0.6830508474576271,
6140
+ "grad_norm": 0.711410403251648,
6141
+ "learning_rate": 5.520822147866214e-06,
6142
+ "loss": 0.0033,
6143
+ "step": 806
6144
+ },
6145
+ {
6146
+ "epoch": 0.6838983050847458,
6147
+ "grad_norm": 0.2613879144191742,
6148
+ "learning_rate": 5.494393398080292e-06,
6149
+ "loss": 0.0011,
6150
+ "step": 807
6151
+ },
6152
+ {
6153
+ "epoch": 0.6847457627118644,
6154
+ "grad_norm": 0.8584437966346741,
6155
+ "learning_rate": 5.46800407619603e-06,
6156
+ "loss": 0.0023,
6157
+ "step": 808
6158
+ },
6159
+ {
6160
+ "epoch": 0.6855932203389831,
6161
+ "grad_norm": 3.826030731201172,
6162
+ "learning_rate": 5.441654413142508e-06,
6163
+ "loss": 0.0228,
6164
+ "step": 809
6165
+ },
6166
+ {
6167
+ "epoch": 0.6864406779661016,
6168
+ "grad_norm": 0.7557411193847656,
6169
+ "learning_rate": 5.415344639501754e-06,
6170
+ "loss": 0.0033,
6171
+ "step": 810
6172
+ },
6173
+ {
6174
+ "epoch": 0.6872881355932203,
6175
+ "grad_norm": 1.0392420291900635,
6176
+ "learning_rate": 5.38907498550674e-06,
6177
+ "loss": 0.0043,
6178
+ "step": 811
6179
+ },
6180
+ {
6181
+ "epoch": 0.688135593220339,
6182
+ "grad_norm": 2.502735137939453,
6183
+ "learning_rate": 5.362845681039348e-06,
6184
+ "loss": 0.0147,
6185
+ "step": 812
6186
+ },
6187
+ {
6188
+ "epoch": 0.6889830508474576,
6189
+ "grad_norm": 0.42259418964385986,
6190
+ "learning_rate": 5.336656955628371e-06,
6191
+ "loss": 0.0021,
6192
+ "step": 813
6193
+ },
6194
+ {
6195
+ "epoch": 0.6898305084745763,
6196
+ "grad_norm": 2.6665008068084717,
6197
+ "learning_rate": 5.310509038447492e-06,
6198
+ "loss": 0.0194,
6199
+ "step": 814
6200
+ },
6201
+ {
6202
+ "epoch": 0.690677966101695,
6203
+ "grad_norm": 2.9882800579071045,
6204
+ "learning_rate": 5.284402158313299e-06,
6205
+ "loss": 0.0415,
6206
+ "step": 815
6207
+ },
6208
+ {
6209
+ "epoch": 0.6915254237288135,
6210
+ "grad_norm": 0.5250701904296875,
6211
+ "learning_rate": 5.25833654368326e-06,
6212
+ "loss": 0.0027,
6213
+ "step": 816
6214
+ },
6215
+ {
6216
+ "epoch": 0.6923728813559322,
6217
+ "grad_norm": 0.46300551295280457,
6218
+ "learning_rate": 5.23231242265375e-06,
6219
+ "loss": 0.0013,
6220
+ "step": 817
6221
+ },
6222
+ {
6223
+ "epoch": 0.6932203389830508,
6224
+ "grad_norm": 0.7175685167312622,
6225
+ "learning_rate": 5.206330022958024e-06,
6226
+ "loss": 0.0037,
6227
+ "step": 818
6228
+ },
6229
+ {
6230
+ "epoch": 0.6940677966101695,
6231
+ "grad_norm": 0.36853301525115967,
6232
+ "learning_rate": 5.180389571964249e-06,
6233
+ "loss": 0.0016,
6234
+ "step": 819
6235
+ },
6236
+ {
6237
+ "epoch": 0.6949152542372882,
6238
+ "grad_norm": 0.713392436504364,
6239
+ "learning_rate": 5.1544912966735e-06,
6240
+ "loss": 0.0048,
6241
+ "step": 820
6242
+ },
6243
+ {
6244
+ "epoch": 0.6949152542372882,
6245
+ "eval_accuracy": 1.0,
6246
+ "eval_f1": 1.0,
6247
+ "eval_loss": 9.25260319490917e-05,
6248
+ "eval_precision": 1.0,
6249
+ "eval_recall": 1.0,
6250
+ "eval_runtime": 51.5622,
6251
+ "eval_samples_per_second": 5.663,
6252
+ "eval_steps_per_second": 0.194,
6253
+ "step": 820
6254
+ },
6255
+ {
6256
+ "epoch": 0.6957627118644067,
6257
+ "grad_norm": 1.1480737924575806,
6258
+ "learning_rate": 5.1286354237177806e-06,
6259
+ "loss": 0.0071,
6260
+ "step": 821
6261
+ },
6262
+ {
6263
+ "epoch": 0.6966101694915254,
6264
+ "grad_norm": 1.0792937278747559,
6265
+ "learning_rate": 5.102822179358037e-06,
6266
+ "loss": 0.0062,
6267
+ "step": 822
6268
+ },
6269
+ {
6270
+ "epoch": 0.6974576271186441,
6271
+ "grad_norm": 2.4379987716674805,
6272
+ "learning_rate": 5.0770517894821815e-06,
6273
+ "loss": 0.0301,
6274
+ "step": 823
6275
+ },
6276
+ {
6277
+ "epoch": 0.6983050847457627,
6278
+ "grad_norm": 1.457727313041687,
6279
+ "learning_rate": 5.051324479603106e-06,
6280
+ "loss": 0.0199,
6281
+ "step": 824
6282
+ },
6283
+ {
6284
+ "epoch": 0.6991525423728814,
6285
+ "grad_norm": 0.9668751955032349,
6286
+ "learning_rate": 5.025640474856732e-06,
6287
+ "loss": 0.003,
6288
+ "step": 825
6289
+ },
6290
+ {
6291
+ "epoch": 0.7,
6292
+ "grad_norm": 0.48410564661026,
6293
+ "learning_rate": 5.000000000000003e-06,
6294
+ "loss": 0.0013,
6295
+ "step": 826
6296
+ },
6297
+ {
6298
+ "epoch": 0.7008474576271186,
6299
+ "grad_norm": 0.28067731857299805,
6300
+ "learning_rate": 4.97440327940895e-06,
6301
+ "loss": 0.0012,
6302
+ "step": 827
6303
+ },
6304
+ {
6305
+ "epoch": 0.7016949152542373,
6306
+ "grad_norm": 0.2483496367931366,
6307
+ "learning_rate": 4.948850537076719e-06,
6308
+ "loss": 0.0007,
6309
+ "step": 828
6310
+ },
6311
+ {
6312
+ "epoch": 0.7025423728813559,
6313
+ "grad_norm": 1.3155626058578491,
6314
+ "learning_rate": 4.923341996611604e-06,
6315
+ "loss": 0.0077,
6316
+ "step": 829
6317
+ },
6318
+ {
6319
+ "epoch": 0.7033898305084746,
6320
+ "grad_norm": 0.7930201888084412,
6321
+ "learning_rate": 4.897877881235091e-06,
6322
+ "loss": 0.0027,
6323
+ "step": 830
6324
+ },
6325
+ {
6326
+ "epoch": 0.7042372881355933,
6327
+ "grad_norm": 0.8325580954551697,
6328
+ "learning_rate": 4.8724584137799235e-06,
6329
+ "loss": 0.0081,
6330
+ "step": 831
6331
+ },
6332
+ {
6333
+ "epoch": 0.7050847457627119,
6334
+ "grad_norm": 2.1520488262176514,
6335
+ "learning_rate": 4.847083816688123e-06,
6336
+ "loss": 0.0172,
6337
+ "step": 832
6338
+ },
6339
+ {
6340
+ "epoch": 0.7059322033898305,
6341
+ "grad_norm": 0.5673890709877014,
6342
+ "learning_rate": 4.821754312009063e-06,
6343
+ "loss": 0.0019,
6344
+ "step": 833
6345
+ },
6346
+ {
6347
+ "epoch": 0.7067796610169491,
6348
+ "grad_norm": 2.460827350616455,
6349
+ "learning_rate": 4.796470121397518e-06,
6350
+ "loss": 0.0238,
6351
+ "step": 834
6352
+ },
6353
+ {
6354
+ "epoch": 0.7076271186440678,
6355
+ "grad_norm": 1.8376518487930298,
6356
+ "learning_rate": 4.771231466111725e-06,
6357
+ "loss": 0.0136,
6358
+ "step": 835
6359
+ },
6360
+ {
6361
+ "epoch": 0.7084745762711865,
6362
+ "grad_norm": 1.0489223003387451,
6363
+ "learning_rate": 4.746038567011454e-06,
6364
+ "loss": 0.0035,
6365
+ "step": 836
6366
+ },
6367
+ {
6368
+ "epoch": 0.7093220338983051,
6369
+ "grad_norm": 2.19427227973938,
6370
+ "learning_rate": 4.7208916445560625e-06,
6371
+ "loss": 0.0212,
6372
+ "step": 837
6373
+ },
6374
+ {
6375
+ "epoch": 0.7101694915254237,
6376
+ "grad_norm": 2.033280849456787,
6377
+ "learning_rate": 4.695790918802577e-06,
6378
+ "loss": 0.0033,
6379
+ "step": 838
6380
+ },
6381
+ {
6382
+ "epoch": 0.7110169491525423,
6383
+ "grad_norm": 1.6251804828643799,
6384
+ "learning_rate": 4.67073660940376e-06,
6385
+ "loss": 0.0187,
6386
+ "step": 839
6387
+ },
6388
+ {
6389
+ "epoch": 0.711864406779661,
6390
+ "grad_norm": 0.2912212610244751,
6391
+ "learning_rate": 4.645728935606194e-06,
6392
+ "loss": 0.0007,
6393
+ "step": 840
6394
+ },
6395
+ {
6396
+ "epoch": 0.711864406779661,
6397
+ "eval_accuracy": 1.0,
6398
+ "eval_f1": 1.0,
6399
+ "eval_loss": 0.00012361357221379876,
6400
+ "eval_precision": 1.0,
6401
+ "eval_recall": 1.0,
6402
+ "eval_runtime": 51.0511,
6403
+ "eval_samples_per_second": 5.72,
6404
+ "eval_steps_per_second": 0.196,
6405
+ "step": 840
6406
+ },
6407
+ {
6408
+ "epoch": 0.7127118644067797,
6409
+ "grad_norm": 3.8638107776641846,
6410
+ "learning_rate": 4.620768116248362e-06,
6411
+ "loss": 0.0192,
6412
+ "step": 841
6413
+ },
6414
+ {
6415
+ "epoch": 0.7135593220338983,
6416
+ "grad_norm": 3.521618366241455,
6417
+ "learning_rate": 4.595854369758727e-06,
6418
+ "loss": 0.0239,
6419
+ "step": 842
6420
+ },
6421
+ {
6422
+ "epoch": 0.714406779661017,
6423
+ "grad_norm": 4.152134418487549,
6424
+ "learning_rate": 4.570987914153824e-06,
6425
+ "loss": 0.0344,
6426
+ "step": 843
6427
+ },
6428
+ {
6429
+ "epoch": 0.7152542372881356,
6430
+ "grad_norm": 1.025924801826477,
6431
+ "learning_rate": 4.546168967036352e-06,
6432
+ "loss": 0.0037,
6433
+ "step": 844
6434
+ },
6435
+ {
6436
+ "epoch": 0.7161016949152542,
6437
+ "grad_norm": 1.601256012916565,
6438
+ "learning_rate": 4.521397745593279e-06,
6439
+ "loss": 0.008,
6440
+ "step": 845
6441
+ },
6442
+ {
6443
+ "epoch": 0.7169491525423729,
6444
+ "grad_norm": 1.8692275285720825,
6445
+ "learning_rate": 4.4966744665939226e-06,
6446
+ "loss": 0.0072,
6447
+ "step": 846
6448
+ },
6449
+ {
6450
+ "epoch": 0.7177966101694915,
6451
+ "grad_norm": 2.0935654640197754,
6452
+ "learning_rate": 4.4719993463880695e-06,
6453
+ "loss": 0.0217,
6454
+ "step": 847
6455
+ },
6456
+ {
6457
+ "epoch": 0.7186440677966102,
6458
+ "grad_norm": 1.089236855506897,
6459
+ "learning_rate": 4.447372600904065e-06,
6460
+ "loss": 0.0095,
6461
+ "step": 848
6462
+ },
6463
+ {
6464
+ "epoch": 0.7194915254237289,
6465
+ "grad_norm": 1.3698294162750244,
6466
+ "learning_rate": 4.422794445646947e-06,
6467
+ "loss": 0.0075,
6468
+ "step": 849
6469
+ },
6470
+ {
6471
+ "epoch": 0.7203389830508474,
6472
+ "grad_norm": 1.4754582643508911,
6473
+ "learning_rate": 4.398265095696539e-06,
6474
+ "loss": 0.012,
6475
+ "step": 850
6476
+ },
6477
+ {
6478
+ "epoch": 0.7211864406779661,
6479
+ "grad_norm": 0.5242229104042053,
6480
+ "learning_rate": 4.373784765705586e-06,
6481
+ "loss": 0.0038,
6482
+ "step": 851
6483
+ },
6484
+ {
6485
+ "epoch": 0.7220338983050848,
6486
+ "grad_norm": 3.324636220932007,
6487
+ "learning_rate": 4.349353669897856e-06,
6488
+ "loss": 0.0299,
6489
+ "step": 852
6490
+ },
6491
+ {
6492
+ "epoch": 0.7228813559322034,
6493
+ "grad_norm": 1.4912688732147217,
6494
+ "learning_rate": 4.324972022066284e-06,
6495
+ "loss": 0.009,
6496
+ "step": 853
6497
+ },
6498
+ {
6499
+ "epoch": 0.7237288135593221,
6500
+ "grad_norm": 1.8088048696517944,
6501
+ "learning_rate": 4.300640035571085e-06,
6502
+ "loss": 0.0156,
6503
+ "step": 854
6504
+ },
6505
+ {
6506
+ "epoch": 0.7245762711864406,
6507
+ "grad_norm": 0.9810622930526733,
6508
+ "learning_rate": 4.276357923337903e-06,
6509
+ "loss": 0.0054,
6510
+ "step": 855
6511
+ },
6512
+ {
6513
+ "epoch": 0.7254237288135593,
6514
+ "grad_norm": 1.399301290512085,
6515
+ "learning_rate": 4.2521258978559324e-06,
6516
+ "loss": 0.0144,
6517
+ "step": 856
6518
+ },
6519
+ {
6520
+ "epoch": 0.726271186440678,
6521
+ "grad_norm": 1.4187586307525635,
6522
+ "learning_rate": 4.227944171176072e-06,
6523
+ "loss": 0.0111,
6524
+ "step": 857
6525
+ },
6526
+ {
6527
+ "epoch": 0.7271186440677966,
6528
+ "grad_norm": 1.7159850597381592,
6529
+ "learning_rate": 4.203812954909057e-06,
6530
+ "loss": 0.0124,
6531
+ "step": 858
6532
+ },
6533
+ {
6534
+ "epoch": 0.7279661016949153,
6535
+ "grad_norm": 1.37935471534729,
6536
+ "learning_rate": 4.179732460223626e-06,
6537
+ "loss": 0.0118,
6538
+ "step": 859
6539
+ },
6540
+ {
6541
+ "epoch": 0.7288135593220338,
6542
+ "grad_norm": 0.29636350274086,
6543
+ "learning_rate": 4.1557028978446415e-06,
6544
+ "loss": 0.0014,
6545
+ "step": 860
6546
+ },
6547
+ {
6548
+ "epoch": 0.7288135593220338,
6549
+ "eval_accuracy": 1.0,
6550
+ "eval_f1": 1.0,
6551
+ "eval_loss": 0.00012519690790213645,
6552
+ "eval_precision": 1.0,
6553
+ "eval_recall": 1.0,
6554
+ "eval_runtime": 50.6172,
6555
+ "eval_samples_per_second": 5.769,
6556
+ "eval_steps_per_second": 0.198,
6557
+ "step": 860
6558
+ },
6559
+ {
6560
+ "epoch": 0.7296610169491525,
6561
+ "grad_norm": 0.8602759838104248,
6562
+ "learning_rate": 4.1317244780512775e-06,
6563
+ "loss": 0.0092,
6564
+ "step": 861
6565
+ },
6566
+ {
6567
+ "epoch": 0.7305084745762712,
6568
+ "grad_norm": 2.8475470542907715,
6569
+ "learning_rate": 4.107797410675166e-06,
6570
+ "loss": 0.0388,
6571
+ "step": 862
6572
+ },
6573
+ {
6574
+ "epoch": 0.7313559322033898,
6575
+ "grad_norm": 1.3523766994476318,
6576
+ "learning_rate": 4.083921905098559e-06,
6577
+ "loss": 0.0068,
6578
+ "step": 863
6579
+ },
6580
+ {
6581
+ "epoch": 0.7322033898305085,
6582
+ "grad_norm": 1.5425556898117065,
6583
+ "learning_rate": 4.060098170252495e-06,
6584
+ "loss": 0.0116,
6585
+ "step": 864
6586
+ },
6587
+ {
6588
+ "epoch": 0.7330508474576272,
6589
+ "grad_norm": 0.3472643494606018,
6590
+ "learning_rate": 4.036326414614985e-06,
6591
+ "loss": 0.0016,
6592
+ "step": 865
6593
+ },
6594
+ {
6595
+ "epoch": 0.7338983050847457,
6596
+ "grad_norm": 0.4821791648864746,
6597
+ "learning_rate": 4.0126068462091705e-06,
6598
+ "loss": 0.0022,
6599
+ "step": 866
6600
+ },
6601
+ {
6602
+ "epoch": 0.7347457627118644,
6603
+ "grad_norm": 0.49641525745391846,
6604
+ "learning_rate": 3.988939672601509e-06,
6605
+ "loss": 0.0018,
6606
+ "step": 867
6607
+ },
6608
+ {
6609
+ "epoch": 0.735593220338983,
6610
+ "grad_norm": 1.2695690393447876,
6611
+ "learning_rate": 3.9653251008999615e-06,
6612
+ "loss": 0.0189,
6613
+ "step": 868
6614
+ },
6615
+ {
6616
+ "epoch": 0.7364406779661017,
6617
+ "grad_norm": 0.7525699734687805,
6618
+ "learning_rate": 3.941763337752177e-06,
6619
+ "loss": 0.0027,
6620
+ "step": 869
6621
+ },
6622
+ {
6623
+ "epoch": 0.7372881355932204,
6624
+ "grad_norm": 1.9671075344085693,
6625
+ "learning_rate": 3.918254589343683e-06,
6626
+ "loss": 0.0161,
6627
+ "step": 870
6628
+ },
6629
+ {
6630
+ "epoch": 0.738135593220339,
6631
+ "grad_norm": 0.5608029365539551,
6632
+ "learning_rate": 3.894799061396086e-06,
6633
+ "loss": 0.002,
6634
+ "step": 871
6635
+ },
6636
+ {
6637
+ "epoch": 0.7389830508474576,
6638
+ "grad_norm": 2.967503309249878,
6639
+ "learning_rate": 3.871396959165267e-06,
6640
+ "loss": 0.0201,
6641
+ "step": 872
6642
+ },
6643
+ {
6644
+ "epoch": 0.7398305084745763,
6645
+ "grad_norm": 0.9887444972991943,
6646
+ "learning_rate": 3.848048487439583e-06,
6647
+ "loss": 0.0059,
6648
+ "step": 873
6649
+ },
6650
+ {
6651
+ "epoch": 0.7406779661016949,
6652
+ "grad_norm": 1.1107892990112305,
6653
+ "learning_rate": 3.824753850538082e-06,
6654
+ "loss": 0.0071,
6655
+ "step": 874
6656
+ },
6657
+ {
6658
+ "epoch": 0.7415254237288136,
6659
+ "grad_norm": 0.9307472705841064,
6660
+ "learning_rate": 3.801513252308712e-06,
6661
+ "loss": 0.0042,
6662
+ "step": 875
6663
+ },
6664
+ {
6665
+ "epoch": 0.7423728813559322,
6666
+ "grad_norm": 2.223409652709961,
6667
+ "learning_rate": 3.778326896126533e-06,
6668
+ "loss": 0.0084,
6669
+ "step": 876
6670
+ },
6671
+ {
6672
+ "epoch": 0.7432203389830508,
6673
+ "grad_norm": 0.8251433968544006,
6674
+ "learning_rate": 3.755194984891943e-06,
6675
+ "loss": 0.0042,
6676
+ "step": 877
6677
+ },
6678
+ {
6679
+ "epoch": 0.7440677966101695,
6680
+ "grad_norm": 1.5690832138061523,
6681
+ "learning_rate": 3.7321177210288974e-06,
6682
+ "loss": 0.0045,
6683
+ "step": 878
6684
+ },
6685
+ {
6686
+ "epoch": 0.7449152542372881,
6687
+ "grad_norm": 1.1555606126785278,
6688
+ "learning_rate": 3.709095306483149e-06,
6689
+ "loss": 0.009,
6690
+ "step": 879
6691
+ },
6692
+ {
6693
+ "epoch": 0.7457627118644068,
6694
+ "grad_norm": 0.7731421589851379,
6695
+ "learning_rate": 3.6861279427204634e-06,
6696
+ "loss": 0.0056,
6697
+ "step": 880
6698
+ },
6699
+ {
6700
+ "epoch": 0.7457627118644068,
6701
+ "eval_accuracy": 1.0,
6702
+ "eval_f1": 1.0,
6703
+ "eval_loss": 9.962059266399592e-05,
6704
+ "eval_precision": 1.0,
6705
+ "eval_recall": 1.0,
6706
+ "eval_runtime": 52.1678,
6707
+ "eval_samples_per_second": 5.597,
6708
+ "eval_steps_per_second": 0.192,
6709
+ "step": 880
6710
+ },
6711
+ {
6712
+ "epoch": 0.7466101694915255,
6713
+ "grad_norm": 0.9392852783203125,
6714
+ "learning_rate": 3.6632158307248713e-06,
6715
+ "loss": 0.0043,
6716
+ "step": 881
6717
+ },
6718
+ {
6719
+ "epoch": 0.747457627118644,
6720
+ "grad_norm": 0.8620559573173523,
6721
+ "learning_rate": 3.6403591709968924e-06,
6722
+ "loss": 0.0049,
6723
+ "step": 882
6724
+ },
6725
+ {
6726
+ "epoch": 0.7483050847457627,
6727
+ "grad_norm": 1.7161260843276978,
6728
+ "learning_rate": 3.617558163551802e-06,
6729
+ "loss": 0.0101,
6730
+ "step": 883
6731
+ },
6732
+ {
6733
+ "epoch": 0.7491525423728813,
6734
+ "grad_norm": 0.5389106273651123,
6735
+ "learning_rate": 3.5948130079178666e-06,
6736
+ "loss": 0.0052,
6737
+ "step": 884
6738
+ },
6739
+ {
6740
+ "epoch": 0.75,
6741
+ "grad_norm": 1.6375874280929565,
6742
+ "learning_rate": 3.5721239031346067e-06,
6743
+ "loss": 0.0099,
6744
+ "step": 885
6745
+ },
6746
+ {
6747
+ "epoch": 0.7508474576271187,
6748
+ "grad_norm": 0.791953444480896,
6749
+ "learning_rate": 3.5494910477510445e-06,
6750
+ "loss": 0.0035,
6751
+ "step": 886
6752
+ },
6753
+ {
6754
+ "epoch": 0.7516949152542373,
6755
+ "grad_norm": 2.004249095916748,
6756
+ "learning_rate": 3.526914639823973e-06,
6757
+ "loss": 0.012,
6758
+ "step": 887
6759
+ },
6760
+ {
6761
+ "epoch": 0.752542372881356,
6762
+ "grad_norm": 0.7185747623443604,
6763
+ "learning_rate": 3.5043948769162227e-06,
6764
+ "loss": 0.0037,
6765
+ "step": 888
6766
+ },
6767
+ {
6768
+ "epoch": 0.7533898305084745,
6769
+ "grad_norm": 1.0839685201644897,
6770
+ "learning_rate": 3.4819319560949303e-06,
6771
+ "loss": 0.0043,
6772
+ "step": 889
6773
+ },
6774
+ {
6775
+ "epoch": 0.7542372881355932,
6776
+ "grad_norm": 1.8889586925506592,
6777
+ "learning_rate": 3.4595260739298174e-06,
6778
+ "loss": 0.0236,
6779
+ "step": 890
6780
+ },
6781
+ {
6782
+ "epoch": 0.7550847457627119,
6783
+ "grad_norm": 1.4316060543060303,
6784
+ "learning_rate": 3.437177426491468e-06,
6785
+ "loss": 0.0168,
6786
+ "step": 891
6787
+ },
6788
+ {
6789
+ "epoch": 0.7559322033898305,
6790
+ "grad_norm": 1.128692865371704,
6791
+ "learning_rate": 3.414886209349615e-06,
6792
+ "loss": 0.007,
6793
+ "step": 892
6794
+ },
6795
+ {
6796
+ "epoch": 0.7567796610169492,
6797
+ "grad_norm": 1.5015407800674438,
6798
+ "learning_rate": 3.3926526175714246e-06,
6799
+ "loss": 0.0063,
6800
+ "step": 893
6801
+ },
6802
+ {
6803
+ "epoch": 0.7576271186440678,
6804
+ "grad_norm": 0.8763653635978699,
6805
+ "learning_rate": 3.3704768457197933e-06,
6806
+ "loss": 0.0039,
6807
+ "step": 894
6808
+ },
6809
+ {
6810
+ "epoch": 0.7584745762711864,
6811
+ "grad_norm": 2.3314383029937744,
6812
+ "learning_rate": 3.3483590878516437e-06,
6813
+ "loss": 0.0201,
6814
+ "step": 895
6815
+ },
6816
+ {
6817
+ "epoch": 0.7593220338983051,
6818
+ "grad_norm": 1.774864912033081,
6819
+ "learning_rate": 3.3262995375162256e-06,
6820
+ "loss": 0.0058,
6821
+ "step": 896
6822
+ },
6823
+ {
6824
+ "epoch": 0.7601694915254237,
6825
+ "grad_norm": 0.5185796618461609,
6826
+ "learning_rate": 3.304298387753426e-06,
6827
+ "loss": 0.0022,
6828
+ "step": 897
6829
+ },
6830
+ {
6831
+ "epoch": 0.7610169491525424,
6832
+ "grad_norm": 1.1957974433898926,
6833
+ "learning_rate": 3.282355831092072e-06,
6834
+ "loss": 0.0046,
6835
+ "step": 898
6836
+ },
6837
+ {
6838
+ "epoch": 0.761864406779661,
6839
+ "grad_norm": 1.759142518043518,
6840
+ "learning_rate": 3.2604720595482598e-06,
6841
+ "loss": 0.0097,
6842
+ "step": 899
6843
+ },
6844
+ {
6845
+ "epoch": 0.7627118644067796,
6846
+ "grad_norm": 2.0010337829589844,
6847
+ "learning_rate": 3.2386472646236565e-06,
6848
+ "loss": 0.0107,
6849
+ "step": 900
6850
+ },
6851
+ {
6852
+ "epoch": 0.7627118644067796,
6853
+ "eval_accuracy": 1.0,
6854
+ "eval_f1": 1.0,
6855
+ "eval_loss": 9.30224996409379e-05,
6856
+ "eval_precision": 1.0,
6857
+ "eval_recall": 1.0,
6858
+ "eval_runtime": 49.6959,
6859
+ "eval_samples_per_second": 5.876,
6860
+ "eval_steps_per_second": 0.201,
6861
+ "step": 900
6862
  }
6863
  ],
6864
  "logging_steps": 1,
 
6878
  "attributes": {}
6879
  }
6880
  },
6881
+ "total_flos": 2.771703818092544e+17,
6882
  "train_batch_size": 8,
6883
  "trial_name": null,
6884
  "trial_params": null