mtzig commited on
Commit
960a54b
·
verified ·
1 Parent(s): 042789c

Training in progress, step 900, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4366e69d60737b3779bf9a490440dab3b7129f75dd2feb73dc07db72babfbdf
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dcfd94f62ff19b8a9933a9541a5733207408d391abbc19039ab84f7f6b53b2b
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6dd1a92d0a5135b333dcb0ee27f4083935df79d3cdbd2e1782a276fcaa9d154
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dbf500184a745e32656bd72aed96444d450b8822d32f095305fab829cad5dcd
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4835991b793120481b099eb89774026995ab06f7469520e242d29572c729356e
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19d8b41074874752835461e4b0b43cfae611a1b52b65b06efded073a8540f0a3
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc58fcb266dccc572788bea6fad1f9f18f3c730bcccdaac874c4e3525aa012f7
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:420c4af07d8d11503cf38a19956fa08bce83ddcb83c306ed5f4f325d2289c54f
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10960d9381443ad0dcee2d617e86555e9074adf42f10ea5b5031e4d17e4541fc
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b5d70ab7c84844a31deff224e2783035d5a27b4e451d959e41e69e6ba1bfba0
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40b617e3c4555dd60638eed9d7607d51addbf2e88c7e24d50cc4af0a97e2c1ee
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acfc04e204792faef03bd393bc190684855a1199069618e09617dabdc06f61f4
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3aca6570204ebce4c6f1f377b28739749ab7e2db8774a4ae79b88318ecc645d3
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6560ad83e9bf537962a4628c52535beab1489d42b33161c72bbe9a7052130abb
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af486d9702ac04f9288a8d7f8d269fc56995d192739a371cdc0ab84562d444ee
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9290d11cadcf051134f7e3a6303ac31bd31867fa4eca0a0ba3fad88d33d73e1e
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6651bb14848ddc09834acedbc497fa211b2b3bf1fd839c0a3e2edc297b450478
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f014cc69554457011024b3f78c613f5d6d21afe59987b579c8148d7a14d4266
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae03e0e09af0ed507e28016716e1ab7cf741033f725d1d5e7ab0b16606ee820f
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:211fe4cc875fc5d117275383fb9805fb9073808b1bf8c46687ccdf31963af654
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38f2cc82eeba75672cf489909ffeec57c804049b636096ef6ef25670940386a2
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c181295f448353cc0ad867ce15d877dc5e4902c02d374aebe08a6faa6eaf73cb
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e86dc8922a03fbd7dae55bdea9fe0c57d7c92e7e4db833cf223eab9d601f3f59
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff94a6828461c3753dda9d571a0a287a6967bc241a3276d73771206e60da3988
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0693986dcdbacb610348452117526a26c1a6fb2feb944c0d12e677a07f1eaf73
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da38d22b5a3dd941093c6e3a0d9705292603a8a72ab0757d88c7c3b61a44e0be
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5412719891745602,
5
  "eval_steps": 20,
6
- "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6099,6 +6099,766 @@
6099
  "eval_samples_per_second": 5.622,
6100
  "eval_steps_per_second": 0.189,
6101
  "step": 800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6102
  }
6103
  ],
6104
  "logging_steps": 1,
@@ -6118,7 +6878,7 @@
6118
  "attributes": {}
6119
  }
6120
  },
6121
- "total_flos": 2.4277215937665434e+17,
6122
  "train_batch_size": 8,
6123
  "trial_name": null,
6124
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6089309878213802,
5
  "eval_steps": 20,
6
+ "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6099
  "eval_samples_per_second": 5.622,
6100
  "eval_steps_per_second": 0.189,
6101
  "step": 800
6102
+ },
6103
+ {
6104
+ "epoch": 0.5419485791610285,
6105
+ "grad_norm": 4.048157691955566,
6106
+ "learning_rate": 1.0283414013680233e-05,
6107
+ "loss": 0.1629,
6108
+ "step": 801
6109
+ },
6110
+ {
6111
+ "epoch": 0.5426251691474966,
6112
+ "grad_norm": 8.180608749389648,
6113
+ "learning_rate": 1.0259801736097634e-05,
6114
+ "loss": 0.2779,
6115
+ "step": 802
6116
+ },
6117
+ {
6118
+ "epoch": 0.5433017591339648,
6119
+ "grad_norm": 2.2637126445770264,
6120
+ "learning_rate": 1.023618800894798e-05,
6121
+ "loss": 0.0888,
6122
+ "step": 803
6123
+ },
6124
+ {
6125
+ "epoch": 0.543978349120433,
6126
+ "grad_norm": 3.4222660064697266,
6127
+ "learning_rate": 1.0212572963984358e-05,
6128
+ "loss": 0.1072,
6129
+ "step": 804
6130
+ },
6131
+ {
6132
+ "epoch": 0.5446549391069012,
6133
+ "grad_norm": 5.149094104766846,
6134
+ "learning_rate": 1.0188956732967208e-05,
6135
+ "loss": 0.1834,
6136
+ "step": 805
6137
+ },
6138
+ {
6139
+ "epoch": 0.5453315290933695,
6140
+ "grad_norm": 4.928592681884766,
6141
+ "learning_rate": 1.0165339447663586e-05,
6142
+ "loss": 0.1065,
6143
+ "step": 806
6144
+ },
6145
+ {
6146
+ "epoch": 0.5460081190798376,
6147
+ "grad_norm": 3.3745458126068115,
6148
+ "learning_rate": 1.0141721239846436e-05,
6149
+ "loss": 0.1105,
6150
+ "step": 807
6151
+ },
6152
+ {
6153
+ "epoch": 0.5466847090663058,
6154
+ "grad_norm": 3.140056848526001,
6155
+ "learning_rate": 1.0118102241293848e-05,
6156
+ "loss": 0.1057,
6157
+ "step": 808
6158
+ },
6159
+ {
6160
+ "epoch": 0.547361299052774,
6161
+ "grad_norm": 2.9225945472717285,
6162
+ "learning_rate": 1.0094482583788311e-05,
6163
+ "loss": 0.1409,
6164
+ "step": 809
6165
+ },
6166
+ {
6167
+ "epoch": 0.5480378890392422,
6168
+ "grad_norm": 5.8072333335876465,
6169
+ "learning_rate": 1.0070862399116016e-05,
6170
+ "loss": 0.1697,
6171
+ "step": 810
6172
+ },
6173
+ {
6174
+ "epoch": 0.5487144790257105,
6175
+ "grad_norm": 2.9896950721740723,
6176
+ "learning_rate": 1.0047241819066069e-05,
6177
+ "loss": 0.0893,
6178
+ "step": 811
6179
+ },
6180
+ {
6181
+ "epoch": 0.5493910690121786,
6182
+ "grad_norm": 9.982451438903809,
6183
+ "learning_rate": 1.0023620975429803e-05,
6184
+ "loss": 0.2119,
6185
+ "step": 812
6186
+ },
6187
+ {
6188
+ "epoch": 0.5500676589986468,
6189
+ "grad_norm": 5.057304382324219,
6190
+ "learning_rate": 1e-05,
6191
+ "loss": 0.1417,
6192
+ "step": 813
6193
+ },
6194
+ {
6195
+ "epoch": 0.550744248985115,
6196
+ "grad_norm": 8.598350524902344,
6197
+ "learning_rate": 9.976379024570202e-06,
6198
+ "loss": 0.2122,
6199
+ "step": 814
6200
+ },
6201
+ {
6202
+ "epoch": 0.5514208389715832,
6203
+ "grad_norm": 2.8650100231170654,
6204
+ "learning_rate": 9.952758180933933e-06,
6205
+ "loss": 0.0901,
6206
+ "step": 815
6207
+ },
6208
+ {
6209
+ "epoch": 0.5520974289580515,
6210
+ "grad_norm": 5.409826278686523,
6211
+ "learning_rate": 9.929137600883986e-06,
6212
+ "loss": 0.1613,
6213
+ "step": 816
6214
+ },
6215
+ {
6216
+ "epoch": 0.5527740189445196,
6217
+ "grad_norm": 2.837448835372925,
6218
+ "learning_rate": 9.90551741621169e-06,
6219
+ "loss": 0.0946,
6220
+ "step": 817
6221
+ },
6222
+ {
6223
+ "epoch": 0.5534506089309879,
6224
+ "grad_norm": 3.9413063526153564,
6225
+ "learning_rate": 9.881897758706155e-06,
6226
+ "loss": 0.1099,
6227
+ "step": 818
6228
+ },
6229
+ {
6230
+ "epoch": 0.554127198917456,
6231
+ "grad_norm": 2.284583330154419,
6232
+ "learning_rate": 9.858278760153567e-06,
6233
+ "loss": 0.0653,
6234
+ "step": 819
6235
+ },
6236
+ {
6237
+ "epoch": 0.5548037889039242,
6238
+ "grad_norm": 6.636195182800293,
6239
+ "learning_rate": 9.834660552336415e-06,
6240
+ "loss": 0.1583,
6241
+ "step": 820
6242
+ },
6243
+ {
6244
+ "epoch": 0.5548037889039242,
6245
+ "eval_accuracy": 0.7788990825688074,
6246
+ "eval_f1": 0.3674540682414698,
6247
+ "eval_loss": 0.47594916820526123,
6248
+ "eval_precision": 0.8433734939759037,
6249
+ "eval_recall": 0.2348993288590604,
6250
+ "eval_runtime": 53.6382,
6251
+ "eval_samples_per_second": 5.556,
6252
+ "eval_steps_per_second": 0.186,
6253
+ "step": 820
6254
+ },
6255
+ {
6256
+ "epoch": 0.5554803788903924,
6257
+ "grad_norm": 3.6005120277404785,
6258
+ "learning_rate": 9.811043267032797e-06,
6259
+ "loss": 0.0887,
6260
+ "step": 821
6261
+ },
6262
+ {
6263
+ "epoch": 0.5561569688768606,
6264
+ "grad_norm": 3.6898558139801025,
6265
+ "learning_rate": 9.787427036015647e-06,
6266
+ "loss": 0.147,
6267
+ "step": 822
6268
+ },
6269
+ {
6270
+ "epoch": 0.5568335588633289,
6271
+ "grad_norm": 6.481770992279053,
6272
+ "learning_rate": 9.763811991052021e-06,
6273
+ "loss": 0.1877,
6274
+ "step": 823
6275
+ },
6276
+ {
6277
+ "epoch": 0.557510148849797,
6278
+ "grad_norm": 2.4457807540893555,
6279
+ "learning_rate": 9.74019826390237e-06,
6280
+ "loss": 0.1004,
6281
+ "step": 824
6282
+ },
6283
+ {
6284
+ "epoch": 0.5581867388362652,
6285
+ "grad_norm": 2.3346476554870605,
6286
+ "learning_rate": 9.716585986319769e-06,
6287
+ "loss": 0.1199,
6288
+ "step": 825
6289
+ },
6290
+ {
6291
+ "epoch": 0.5588633288227334,
6292
+ "grad_norm": 5.753274917602539,
6293
+ "learning_rate": 9.692975290049228e-06,
6294
+ "loss": 0.1646,
6295
+ "step": 826
6296
+ },
6297
+ {
6298
+ "epoch": 0.5595399188092016,
6299
+ "grad_norm": 4.540411949157715,
6300
+ "learning_rate": 9.669366306826919e-06,
6301
+ "loss": 0.1275,
6302
+ "step": 827
6303
+ },
6304
+ {
6305
+ "epoch": 0.5602165087956699,
6306
+ "grad_norm": 4.377498149871826,
6307
+ "learning_rate": 9.645759168379463e-06,
6308
+ "loss": 0.1508,
6309
+ "step": 828
6310
+ },
6311
+ {
6312
+ "epoch": 0.560893098782138,
6313
+ "grad_norm": 4.894872188568115,
6314
+ "learning_rate": 9.622154006423185e-06,
6315
+ "loss": 0.1608,
6316
+ "step": 829
6317
+ },
6318
+ {
6319
+ "epoch": 0.5615696887686062,
6320
+ "grad_norm": 3.9579596519470215,
6321
+ "learning_rate": 9.598550952663383e-06,
6322
+ "loss": 0.0748,
6323
+ "step": 830
6324
+ },
6325
+ {
6326
+ "epoch": 0.5622462787550744,
6327
+ "grad_norm": 3.1920807361602783,
6328
+ "learning_rate": 9.574950138793593e-06,
6329
+ "loss": 0.0958,
6330
+ "step": 831
6331
+ },
6332
+ {
6333
+ "epoch": 0.5629228687415426,
6334
+ "grad_norm": 2.8056745529174805,
6335
+ "learning_rate": 9.551351696494854e-06,
6336
+ "loss": 0.1528,
6337
+ "step": 832
6338
+ },
6339
+ {
6340
+ "epoch": 0.5635994587280109,
6341
+ "grad_norm": 2.493863105773926,
6342
+ "learning_rate": 9.527755757434968e-06,
6343
+ "loss": 0.0693,
6344
+ "step": 833
6345
+ },
6346
+ {
6347
+ "epoch": 0.564276048714479,
6348
+ "grad_norm": 3.3314931392669678,
6349
+ "learning_rate": 9.504162453267776e-06,
6350
+ "loss": 0.1227,
6351
+ "step": 834
6352
+ },
6353
+ {
6354
+ "epoch": 0.5649526387009473,
6355
+ "grad_norm": 3.9033989906311035,
6356
+ "learning_rate": 9.480571915632422e-06,
6357
+ "loss": 0.1199,
6358
+ "step": 835
6359
+ },
6360
+ {
6361
+ "epoch": 0.5656292286874154,
6362
+ "grad_norm": 3.6395678520202637,
6363
+ "learning_rate": 9.456984276152598e-06,
6364
+ "loss": 0.1057,
6365
+ "step": 836
6366
+ },
6367
+ {
6368
+ "epoch": 0.5663058186738836,
6369
+ "grad_norm": 6.916732311248779,
6370
+ "learning_rate": 9.43339966643584e-06,
6371
+ "loss": 0.1741,
6372
+ "step": 837
6373
+ },
6374
+ {
6375
+ "epoch": 0.5669824086603519,
6376
+ "grad_norm": 3.8561432361602783,
6377
+ "learning_rate": 9.409818218072774e-06,
6378
+ "loss": 0.1654,
6379
+ "step": 838
6380
+ },
6381
+ {
6382
+ "epoch": 0.56765899864682,
6383
+ "grad_norm": 3.962113380432129,
6384
+ "learning_rate": 9.386240062636388e-06,
6385
+ "loss": 0.1459,
6386
+ "step": 839
6387
+ },
6388
+ {
6389
+ "epoch": 0.5683355886332883,
6390
+ "grad_norm": 2.5661449432373047,
6391
+ "learning_rate": 9.362665331681294e-06,
6392
+ "loss": 0.1363,
6393
+ "step": 840
6394
+ },
6395
+ {
6396
+ "epoch": 0.5683355886332883,
6397
+ "eval_accuracy": 0.8009174311926606,
6398
+ "eval_f1": 0.4668304668304668,
6399
+ "eval_loss": 0.43011632561683655,
6400
+ "eval_precision": 0.8715596330275229,
6401
+ "eval_recall": 0.3187919463087248,
6402
+ "eval_runtime": 54.0976,
6403
+ "eval_samples_per_second": 5.509,
6404
+ "eval_steps_per_second": 0.185,
6405
+ "step": 840
6406
+ },
6407
+ {
6408
+ "epoch": 0.5690121786197564,
6409
+ "grad_norm": 3.792685031890869,
6410
+ "learning_rate": 9.339094156743007e-06,
6411
+ "loss": 0.1408,
6412
+ "step": 841
6413
+ },
6414
+ {
6415
+ "epoch": 0.5696887686062246,
6416
+ "grad_norm": 4.538841247558594,
6417
+ "learning_rate": 9.315526669337189e-06,
6418
+ "loss": 0.1399,
6419
+ "step": 842
6420
+ },
6421
+ {
6422
+ "epoch": 0.5703653585926928,
6423
+ "grad_norm": 5.182969570159912,
6424
+ "learning_rate": 9.291963000958932e-06,
6425
+ "loss": 0.1753,
6426
+ "step": 843
6427
+ },
6428
+ {
6429
+ "epoch": 0.571041948579161,
6430
+ "grad_norm": 3.905219554901123,
6431
+ "learning_rate": 9.268403283082025e-06,
6432
+ "loss": 0.143,
6433
+ "step": 844
6434
+ },
6435
+ {
6436
+ "epoch": 0.5717185385656293,
6437
+ "grad_norm": 3.7634634971618652,
6438
+ "learning_rate": 9.244847647158203e-06,
6439
+ "loss": 0.1469,
6440
+ "step": 845
6441
+ },
6442
+ {
6443
+ "epoch": 0.5723951285520974,
6444
+ "grad_norm": 3.5530450344085693,
6445
+ "learning_rate": 9.221296224616443e-06,
6446
+ "loss": 0.1334,
6447
+ "step": 846
6448
+ },
6449
+ {
6450
+ "epoch": 0.5730717185385656,
6451
+ "grad_norm": 6.1246161460876465,
6452
+ "learning_rate": 9.197749146862193e-06,
6453
+ "loss": 0.1216,
6454
+ "step": 847
6455
+ },
6456
+ {
6457
+ "epoch": 0.5737483085250338,
6458
+ "grad_norm": 7.231658458709717,
6459
+ "learning_rate": 9.174206545276678e-06,
6460
+ "loss": 0.2128,
6461
+ "step": 848
6462
+ },
6463
+ {
6464
+ "epoch": 0.574424898511502,
6465
+ "grad_norm": 6.129051685333252,
6466
+ "learning_rate": 9.150668551216134e-06,
6467
+ "loss": 0.2178,
6468
+ "step": 849
6469
+ },
6470
+ {
6471
+ "epoch": 0.5751014884979703,
6472
+ "grad_norm": 4.892454624176025,
6473
+ "learning_rate": 9.127135296011102e-06,
6474
+ "loss": 0.1496,
6475
+ "step": 850
6476
+ },
6477
+ {
6478
+ "epoch": 0.5757780784844384,
6479
+ "grad_norm": 6.8777337074279785,
6480
+ "learning_rate": 9.103606910965666e-06,
6481
+ "loss": 0.2008,
6482
+ "step": 851
6483
+ },
6484
+ {
6485
+ "epoch": 0.5764546684709067,
6486
+ "grad_norm": 3.538118600845337,
6487
+ "learning_rate": 9.080083527356755e-06,
6488
+ "loss": 0.1232,
6489
+ "step": 852
6490
+ },
6491
+ {
6492
+ "epoch": 0.5771312584573748,
6493
+ "grad_norm": 5.2440080642700195,
6494
+ "learning_rate": 9.056565276433378e-06,
6495
+ "loss": 0.1973,
6496
+ "step": 853
6497
+ },
6498
+ {
6499
+ "epoch": 0.577807848443843,
6500
+ "grad_norm": 2.282479763031006,
6501
+ "learning_rate": 9.033052289415914e-06,
6502
+ "loss": 0.0696,
6503
+ "step": 854
6504
+ },
6505
+ {
6506
+ "epoch": 0.5784844384303113,
6507
+ "grad_norm": 3.643191337585449,
6508
+ "learning_rate": 9.009544697495373e-06,
6509
+ "loss": 0.1378,
6510
+ "step": 855
6511
+ },
6512
+ {
6513
+ "epoch": 0.5791610284167794,
6514
+ "grad_norm": 3.0240986347198486,
6515
+ "learning_rate": 8.986042631832656e-06,
6516
+ "loss": 0.1579,
6517
+ "step": 856
6518
+ },
6519
+ {
6520
+ "epoch": 0.5798376184032477,
6521
+ "grad_norm": 3.921018362045288,
6522
+ "learning_rate": 8.962546223557838e-06,
6523
+ "loss": 0.1194,
6524
+ "step": 857
6525
+ },
6526
+ {
6527
+ "epoch": 0.5805142083897158,
6528
+ "grad_norm": 3.4192543029785156,
6529
+ "learning_rate": 8.93905560376942e-06,
6530
+ "loss": 0.1817,
6531
+ "step": 858
6532
+ },
6533
+ {
6534
+ "epoch": 0.581190798376184,
6535
+ "grad_norm": 4.1514363288879395,
6536
+ "learning_rate": 8.915570903533615e-06,
6537
+ "loss": 0.1489,
6538
+ "step": 859
6539
+ },
6540
+ {
6541
+ "epoch": 0.5818673883626523,
6542
+ "grad_norm": 3.885377883911133,
6543
+ "learning_rate": 8.892092253883602e-06,
6544
+ "loss": 0.1456,
6545
+ "step": 860
6546
+ },
6547
+ {
6548
+ "epoch": 0.5818673883626523,
6549
+ "eval_accuracy": 0.7926605504587156,
6550
+ "eval_f1": 0.42346938775510207,
6551
+ "eval_loss": 0.4394099712371826,
6552
+ "eval_precision": 0.8829787234042553,
6553
+ "eval_recall": 0.2785234899328859,
6554
+ "eval_runtime": 53.3745,
6555
+ "eval_samples_per_second": 5.583,
6556
+ "eval_steps_per_second": 0.187,
6557
+ "step": 860
6558
+ },
6559
+ {
6560
+ "epoch": 0.5825439783491204,
6561
+ "grad_norm": 3.6462066173553467,
6562
+ "learning_rate": 8.8686197858188e-06,
6563
+ "loss": 0.1782,
6564
+ "step": 861
6565
+ },
6566
+ {
6567
+ "epoch": 0.5832205683355887,
6568
+ "grad_norm": 3.2633800506591797,
6569
+ "learning_rate": 8.84515363030414e-06,
6570
+ "loss": 0.1656,
6571
+ "step": 862
6572
+ },
6573
+ {
6574
+ "epoch": 0.5838971583220568,
6575
+ "grad_norm": 5.255461692810059,
6576
+ "learning_rate": 8.821693918269334e-06,
6577
+ "loss": 0.1306,
6578
+ "step": 863
6579
+ },
6580
+ {
6581
+ "epoch": 0.584573748308525,
6582
+ "grad_norm": 4.525811672210693,
6583
+ "learning_rate": 8.798240780608143e-06,
6584
+ "loss": 0.1684,
6585
+ "step": 864
6586
+ },
6587
+ {
6588
+ "epoch": 0.5852503382949933,
6589
+ "grad_norm": 2.788585901260376,
6590
+ "learning_rate": 8.774794348177641e-06,
6591
+ "loss": 0.1456,
6592
+ "step": 865
6593
+ },
6594
+ {
6595
+ "epoch": 0.5859269282814614,
6596
+ "grad_norm": 3.1500301361083984,
6597
+ "learning_rate": 8.751354751797492e-06,
6598
+ "loss": 0.1347,
6599
+ "step": 866
6600
+ },
6601
+ {
6602
+ "epoch": 0.5866035182679297,
6603
+ "grad_norm": 3.487180471420288,
6604
+ "learning_rate": 8.727922122249221e-06,
6605
+ "loss": 0.1393,
6606
+ "step": 867
6607
+ },
6608
+ {
6609
+ "epoch": 0.5872801082543978,
6610
+ "grad_norm": 2.1133573055267334,
6611
+ "learning_rate": 8.704496590275479e-06,
6612
+ "loss": 0.0814,
6613
+ "step": 868
6614
+ },
6615
+ {
6616
+ "epoch": 0.587956698240866,
6617
+ "grad_norm": 3.227505922317505,
6618
+ "learning_rate": 8.68107828657931e-06,
6619
+ "loss": 0.1104,
6620
+ "step": 869
6621
+ },
6622
+ {
6623
+ "epoch": 0.5886332882273342,
6624
+ "grad_norm": 2.8195204734802246,
6625
+ "learning_rate": 8.657667341823449e-06,
6626
+ "loss": 0.1073,
6627
+ "step": 870
6628
+ },
6629
+ {
6630
+ "epoch": 0.5893098782138024,
6631
+ "grad_norm": 6.85077428817749,
6632
+ "learning_rate": 8.63426388662954e-06,
6633
+ "loss": 0.2117,
6634
+ "step": 871
6635
+ },
6636
+ {
6637
+ "epoch": 0.5899864682002707,
6638
+ "grad_norm": 3.027806043624878,
6639
+ "learning_rate": 8.61086805157747e-06,
6640
+ "loss": 0.13,
6641
+ "step": 872
6642
+ },
6643
+ {
6644
+ "epoch": 0.5906630581867388,
6645
+ "grad_norm": 3.608955144882202,
6646
+ "learning_rate": 8.587479967204584e-06,
6647
+ "loss": 0.1323,
6648
+ "step": 873
6649
+ },
6650
+ {
6651
+ "epoch": 0.591339648173207,
6652
+ "grad_norm": 3.6784167289733887,
6653
+ "learning_rate": 8.564099764004998e-06,
6654
+ "loss": 0.1205,
6655
+ "step": 874
6656
+ },
6657
+ {
6658
+ "epoch": 0.5920162381596752,
6659
+ "grad_norm": 3.6753430366516113,
6660
+ "learning_rate": 8.540727572428854e-06,
6661
+ "loss": 0.1728,
6662
+ "step": 875
6663
+ },
6664
+ {
6665
+ "epoch": 0.5926928281461434,
6666
+ "grad_norm": 3.4869165420532227,
6667
+ "learning_rate": 8.51736352288158e-06,
6668
+ "loss": 0.1363,
6669
+ "step": 876
6670
+ },
6671
+ {
6672
+ "epoch": 0.5933694181326117,
6673
+ "grad_norm": 6.327773571014404,
6674
+ "learning_rate": 8.494007745723197e-06,
6675
+ "loss": 0.1723,
6676
+ "step": 877
6677
+ },
6678
+ {
6679
+ "epoch": 0.5940460081190798,
6680
+ "grad_norm": 4.366674423217773,
6681
+ "learning_rate": 8.47066037126754e-06,
6682
+ "loss": 0.1557,
6683
+ "step": 878
6684
+ },
6685
+ {
6686
+ "epoch": 0.5947225981055481,
6687
+ "grad_norm": 3.28305721282959,
6688
+ "learning_rate": 8.447321529781597e-06,
6689
+ "loss": 0.1253,
6690
+ "step": 879
6691
+ },
6692
+ {
6693
+ "epoch": 0.5953991880920162,
6694
+ "grad_norm": 3.0015041828155518,
6695
+ "learning_rate": 8.423991351484715e-06,
6696
+ "loss": 0.1318,
6697
+ "step": 880
6698
+ },
6699
+ {
6700
+ "epoch": 0.5953991880920162,
6701
+ "eval_accuracy": 0.8192660550458716,
6702
+ "eval_f1": 0.5553047404063205,
6703
+ "eval_loss": 0.3900049328804016,
6704
+ "eval_precision": 0.8482758620689655,
6705
+ "eval_recall": 0.412751677852349,
6706
+ "eval_runtime": 53.5058,
6707
+ "eval_samples_per_second": 5.569,
6708
+ "eval_steps_per_second": 0.187,
6709
+ "step": 880
6710
+ },
6711
+ {
6712
+ "epoch": 0.5960757780784844,
6713
+ "grad_norm": 2.9270071983337402,
6714
+ "learning_rate": 8.400669966547925e-06,
6715
+ "loss": 0.1256,
6716
+ "step": 881
6717
+ },
6718
+ {
6719
+ "epoch": 0.5967523680649527,
6720
+ "grad_norm": 2.6574175357818604,
6721
+ "learning_rate": 8.377357505093183e-06,
6722
+ "loss": 0.0761,
6723
+ "step": 882
6724
+ },
6725
+ {
6726
+ "epoch": 0.5974289580514208,
6727
+ "grad_norm": 3.648263692855835,
6728
+ "learning_rate": 8.35405409719266e-06,
6729
+ "loss": 0.124,
6730
+ "step": 883
6731
+ },
6732
+ {
6733
+ "epoch": 0.5981055480378891,
6734
+ "grad_norm": 4.690035820007324,
6735
+ "learning_rate": 8.330759872868022e-06,
6736
+ "loss": 0.182,
6737
+ "step": 884
6738
+ },
6739
+ {
6740
+ "epoch": 0.5987821380243572,
6741
+ "grad_norm": 3.0360960960388184,
6742
+ "learning_rate": 8.307474962089676e-06,
6743
+ "loss": 0.1437,
6744
+ "step": 885
6745
+ },
6746
+ {
6747
+ "epoch": 0.5994587280108254,
6748
+ "grad_norm": 3.8773977756500244,
6749
+ "learning_rate": 8.284199494776083e-06,
6750
+ "loss": 0.0975,
6751
+ "step": 886
6752
+ },
6753
+ {
6754
+ "epoch": 0.6001353179972937,
6755
+ "grad_norm": 3.7407238483428955,
6756
+ "learning_rate": 8.260933600793003e-06,
6757
+ "loss": 0.1422,
6758
+ "step": 887
6759
+ },
6760
+ {
6761
+ "epoch": 0.6008119079837618,
6762
+ "grad_norm": 4.789558410644531,
6763
+ "learning_rate": 8.237677409952784e-06,
6764
+ "loss": 0.1737,
6765
+ "step": 888
6766
+ },
6767
+ {
6768
+ "epoch": 0.6014884979702301,
6769
+ "grad_norm": 5.8444929122924805,
6770
+ "learning_rate": 8.214431052013636e-06,
6771
+ "loss": 0.1658,
6772
+ "step": 889
6773
+ },
6774
+ {
6775
+ "epoch": 0.6021650879566982,
6776
+ "grad_norm": 2.8387224674224854,
6777
+ "learning_rate": 8.191194656678905e-06,
6778
+ "loss": 0.1317,
6779
+ "step": 890
6780
+ },
6781
+ {
6782
+ "epoch": 0.6028416779431665,
6783
+ "grad_norm": 3.5490684509277344,
6784
+ "learning_rate": 8.16796835359635e-06,
6785
+ "loss": 0.0796,
6786
+ "step": 891
6787
+ },
6788
+ {
6789
+ "epoch": 0.6035182679296346,
6790
+ "grad_norm": 3.600038766860962,
6791
+ "learning_rate": 8.144752272357424e-06,
6792
+ "loss": 0.1059,
6793
+ "step": 892
6794
+ },
6795
+ {
6796
+ "epoch": 0.6041948579161028,
6797
+ "grad_norm": 6.19486665725708,
6798
+ "learning_rate": 8.12154654249654e-06,
6799
+ "loss": 0.2211,
6800
+ "step": 893
6801
+ },
6802
+ {
6803
+ "epoch": 0.6048714479025711,
6804
+ "grad_norm": 3.217571973800659,
6805
+ "learning_rate": 8.098351293490365e-06,
6806
+ "loss": 0.0893,
6807
+ "step": 894
6808
+ },
6809
+ {
6810
+ "epoch": 0.6055480378890392,
6811
+ "grad_norm": 3.447753667831421,
6812
+ "learning_rate": 8.07516665475708e-06,
6813
+ "loss": 0.1373,
6814
+ "step": 895
6815
+ },
6816
+ {
6817
+ "epoch": 0.6062246278755075,
6818
+ "grad_norm": 4.001631259918213,
6819
+ "learning_rate": 8.051992755655672e-06,
6820
+ "loss": 0.1635,
6821
+ "step": 896
6822
+ },
6823
+ {
6824
+ "epoch": 0.6069012178619756,
6825
+ "grad_norm": 3.870436191558838,
6826
+ "learning_rate": 8.0288297254852e-06,
6827
+ "loss": 0.1659,
6828
+ "step": 897
6829
+ },
6830
+ {
6831
+ "epoch": 0.6075778078484438,
6832
+ "grad_norm": 7.8299479484558105,
6833
+ "learning_rate": 8.005677693484077e-06,
6834
+ "loss": 0.2432,
6835
+ "step": 898
6836
+ },
6837
+ {
6838
+ "epoch": 0.6082543978349121,
6839
+ "grad_norm": 2.8993029594421387,
6840
+ "learning_rate": 7.98253678882937e-06,
6841
+ "loss": 0.0963,
6842
+ "step": 899
6843
+ },
6844
+ {
6845
+ "epoch": 0.6089309878213802,
6846
+ "grad_norm": 3.3452529907226562,
6847
+ "learning_rate": 7.959407140636034e-06,
6848
+ "loss": 0.1336,
6849
+ "step": 900
6850
+ },
6851
+ {
6852
+ "epoch": 0.6089309878213802,
6853
+ "eval_accuracy": 0.8,
6854
+ "eval_f1": 0.47342995169082125,
6855
+ "eval_loss": 0.42485949397087097,
6856
+ "eval_precision": 0.8448275862068966,
6857
+ "eval_recall": 0.3288590604026846,
6858
+ "eval_runtime": 53.6449,
6859
+ "eval_samples_per_second": 5.555,
6860
+ "eval_steps_per_second": 0.186,
6861
+ "step": 900
6862
  }
6863
  ],
6864
  "logging_steps": 1,
 
6878
  "attributes": {}
6879
  }
6880
  },
6881
+ "total_flos": 2.732359159822418e+17,
6882
  "train_batch_size": 8,
6883
  "trial_name": null,
6884
  "trial_params": null