mtzig commited on
Commit
03df885
·
verified ·
1 Parent(s): 1f15dd5

Training in progress, step 1300, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e285698db4156337898b7507bc447cf892df1b2e2b1f627fbfa7fcf49ead7fe
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b360a4328f640ed51ddaf65beb21759c2322654758d2b7b7f6e00f66a17354f8
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9347512a71b948ad7d0474b073744a28f38ea1b0f4808b47eaeee3bb038ee2a
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20d1395c5a780e12bd9c2d3c0a3a98e6d11c049377ae734be8b4c6bec63af7cd
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61fe8222129691fd1c629440ebc055a5e22b32348d82bc6fb97d18d537ba38e6
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b980d02c86a12c4ddd321afa25558b9bda6ce7377f5a7301fbc73043dd7e72fd
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb287ee7b4f22bfca83b3038b7765964ff726a01edfa1c77cefcecc5baaede6f
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4716794fe32a12753a15aca9b69a92b8ff2a13cc9a1449ccd27487d4a1ca9a7d
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8135e2cfc3f870ad4d1b9488a555f6cbbcb61951312e0f574806197a3d04752
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c424477fe3f3aa933900f713ea30de6e63503f0eb3c14d4b5a3fd7be751453c
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fd0182149b3046646213abcc88b729a39d44a31db12d71321dcf1672762dc92
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:992a85fd0f9141e2a7ce8e4ce2c770b6564f0c5de13f4c613cc4d93bc456ab03
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ecceaf4d23428de4f6eaf8a4db08e58b3b9e512e0fc350f3d39b90547824dde
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0f386445b7a0ecca12a354673d12666bd045fe42bc66c5282186ece7173d4fd
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93904d1910182fe133491da7a6c8bc9c6713b5f0c66d57fd0a846b185647198d
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81c7ec7bbec3615990bf78e011b0f7bc719d60680964d34bbac0633971dd9f36
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f18ad258e576a1beb656290ab7d2a2eb5c1c200ce0d83645abdc17af01ce6b3
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce0c8f8d9638136cb5308b0b5847756c4993f316ede670798b5676d4508282ce
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4328b792cfa04ae062613c520f6291678aade826256d6a52acb864dcba8e97aa
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc378caf9e3227b70a474c0063f96ad82cc21701d0d5fa1f12d57ba19770909f
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8758a5d59dbad9a4b9628b626e50cf69861f409943163aab71d6b7d54040e68
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41e92489ba1b6fe609dc774dd68b88282000969f034d53fc7540c25e859de003
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37d46b3ff156d0196e9a5d0a8efb49f4baca17f2c23d7f5843e853b9795049d4
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:037f3e8e143701c6dab9d7f5db31ada1d1f6e223405cca2ab7ccd4b03d64aac8
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:477a17a25cc7623279d8aa8946f887744ea0510845075294476c6dcaa37cf69c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:230ef6b51382a71e81c933c6e0f89f49737687e37bb89c538f18f98f56a78ee9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9184845005740528,
5
  "eval_steps": 20,
6
- "global_step": 1200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -9139,6 +9139,766 @@
9139
  "eval_samples_per_second": 6.853,
9140
  "eval_steps_per_second": 0.228,
9141
  "step": 1200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9142
  }
9143
  ],
9144
  "logging_steps": 1,
@@ -9158,7 +9918,7 @@
9158
  "attributes": {}
9159
  }
9160
  },
9161
- "total_flos": 1.8435297220388454e+17,
9162
  "train_batch_size": 8,
9163
  "trial_name": null,
9164
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9950248756218906,
5
  "eval_steps": 20,
6
+ "global_step": 1300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
9139
  "eval_samples_per_second": 6.853,
9140
  "eval_steps_per_second": 0.228,
9141
  "step": 1200
9142
+ },
9143
+ {
9144
+ "epoch": 0.9192499043245312,
9145
+ "grad_norm": 4.442579746246338,
9146
+ "learning_rate": 3.914879239610392e-07,
9147
+ "loss": 0.186,
9148
+ "step": 1201
9149
+ },
9150
+ {
9151
+ "epoch": 0.9200153080750095,
9152
+ "grad_norm": 5.45106315612793,
9153
+ "learning_rate": 3.8411440400117685e-07,
9154
+ "loss": 0.1837,
9155
+ "step": 1202
9156
+ },
9157
+ {
9158
+ "epoch": 0.9207807118254879,
9159
+ "grad_norm": 4.747509479522705,
9160
+ "learning_rate": 3.768096245974129e-07,
9161
+ "loss": 0.2562,
9162
+ "step": 1203
9163
+ },
9164
+ {
9165
+ "epoch": 0.9215461155759663,
9166
+ "grad_norm": 6.138671398162842,
9167
+ "learning_rate": 3.69573637969024e-07,
9168
+ "loss": 0.3244,
9169
+ "step": 1204
9170
+ },
9171
+ {
9172
+ "epoch": 0.9223115193264447,
9173
+ "grad_norm": 7.972070217132568,
9174
+ "learning_rate": 3.6240649584351137e-07,
9175
+ "loss": 0.4027,
9176
+ "step": 1205
9177
+ },
9178
+ {
9179
+ "epoch": 0.9230769230769231,
9180
+ "grad_norm": 8.4572172164917,
9181
+ "learning_rate": 3.553082494562354e-07,
9182
+ "loss": 0.4941,
9183
+ "step": 1206
9184
+ },
9185
+ {
9186
+ "epoch": 0.9238423268274014,
9187
+ "grad_norm": 9.352378845214844,
9188
+ "learning_rate": 3.4827894955003825e-07,
9189
+ "loss": 0.448,
9190
+ "step": 1207
9191
+ },
9192
+ {
9193
+ "epoch": 0.9246077305778798,
9194
+ "grad_norm": 7.637875556945801,
9195
+ "learning_rate": 3.413186463748941e-07,
9196
+ "loss": 0.2718,
9197
+ "step": 1208
9198
+ },
9199
+ {
9200
+ "epoch": 0.9253731343283582,
9201
+ "grad_norm": 8.596519470214844,
9202
+ "learning_rate": 3.3442738968754164e-07,
9203
+ "loss": 0.2043,
9204
+ "step": 1209
9205
+ },
9206
+ {
9207
+ "epoch": 0.9261385380788366,
9208
+ "grad_norm": 11.038840293884277,
9209
+ "learning_rate": 3.276052287511333e-07,
9210
+ "loss": 0.2731,
9211
+ "step": 1210
9212
+ },
9213
+ {
9214
+ "epoch": 0.926903941829315,
9215
+ "grad_norm": 6.149134635925293,
9216
+ "learning_rate": 3.2085221233487564e-07,
9217
+ "loss": 0.3046,
9218
+ "step": 1211
9219
+ },
9220
+ {
9221
+ "epoch": 0.9276693455797933,
9222
+ "grad_norm": 5.461088180541992,
9223
+ "learning_rate": 3.1416838871368925e-07,
9224
+ "loss": 0.2553,
9225
+ "step": 1212
9226
+ },
9227
+ {
9228
+ "epoch": 0.9284347493302717,
9229
+ "grad_norm": 8.15916919708252,
9230
+ "learning_rate": 3.0755380566785955e-07,
9231
+ "loss": 0.2793,
9232
+ "step": 1213
9233
+ },
9234
+ {
9235
+ "epoch": 0.9292001530807501,
9236
+ "grad_norm": 6.028532028198242,
9237
+ "learning_rate": 3.010085104826932e-07,
9238
+ "loss": 0.2108,
9239
+ "step": 1214
9240
+ },
9241
+ {
9242
+ "epoch": 0.9299655568312285,
9243
+ "grad_norm": 9.626595497131348,
9244
+ "learning_rate": 2.945325499481855e-07,
9245
+ "loss": 0.2889,
9246
+ "step": 1215
9247
+ },
9248
+ {
9249
+ "epoch": 0.9307309605817069,
9250
+ "grad_norm": 8.43061637878418,
9251
+ "learning_rate": 2.881259703586814e-07,
9252
+ "loss": 0.3819,
9253
+ "step": 1216
9254
+ },
9255
+ {
9256
+ "epoch": 0.9314963643321852,
9257
+ "grad_norm": 9.330650329589844,
9258
+ "learning_rate": 2.817888175125472e-07,
9259
+ "loss": 0.2979,
9260
+ "step": 1217
9261
+ },
9262
+ {
9263
+ "epoch": 0.9322617680826636,
9264
+ "grad_norm": 6.501589775085449,
9265
+ "learning_rate": 2.7552113671184264e-07,
9266
+ "loss": 0.293,
9267
+ "step": 1218
9268
+ },
9269
+ {
9270
+ "epoch": 0.933027171833142,
9271
+ "grad_norm": 6.367552757263184,
9272
+ "learning_rate": 2.693229727619906e-07,
9273
+ "loss": 0.3728,
9274
+ "step": 1219
9275
+ },
9276
+ {
9277
+ "epoch": 0.9337925755836204,
9278
+ "grad_norm": 6.511219501495361,
9279
+ "learning_rate": 2.631943699714712e-07,
9280
+ "loss": 0.2681,
9281
+ "step": 1220
9282
+ },
9283
+ {
9284
+ "epoch": 0.9337925755836204,
9285
+ "eval_accuracy": 0.8898916967509025,
9286
+ "eval_f1": 0.8390501319261213,
9287
+ "eval_loss": 0.2956056296825409,
9288
+ "eval_precision": 0.8932584269662921,
9289
+ "eval_recall": 0.7910447761194029,
9290
+ "eval_runtime": 43.3109,
9291
+ "eval_samples_per_second": 6.95,
9292
+ "eval_steps_per_second": 0.231,
9293
+ "step": 1220
9294
+ },
9295
+ {
9296
+ "epoch": 0.9345579793340988,
9297
+ "grad_norm": 5.723000526428223,
9298
+ "learning_rate": 2.571353721514913e-07,
9299
+ "loss": 0.2749,
9300
+ "step": 1221
9301
+ },
9302
+ {
9303
+ "epoch": 0.9353233830845771,
9304
+ "grad_norm": 8.66303825378418,
9305
+ "learning_rate": 2.51146022615677e-07,
9306
+ "loss": 0.2631,
9307
+ "step": 1222
9308
+ },
9309
+ {
9310
+ "epoch": 0.9360887868350555,
9311
+ "grad_norm": 6.536643981933594,
9312
+ "learning_rate": 2.452263641797659e-07,
9313
+ "loss": 0.2504,
9314
+ "step": 1223
9315
+ },
9316
+ {
9317
+ "epoch": 0.9368541905855339,
9318
+ "grad_norm": 5.747756481170654,
9319
+ "learning_rate": 2.3937643916129404e-07,
9320
+ "loss": 0.2857,
9321
+ "step": 1224
9322
+ },
9323
+ {
9324
+ "epoch": 0.9376195943360123,
9325
+ "grad_norm": 13.398006439208984,
9326
+ "learning_rate": 2.3359628937930422e-07,
9327
+ "loss": 0.4189,
9328
+ "step": 1225
9329
+ },
9330
+ {
9331
+ "epoch": 0.9383849980864907,
9332
+ "grad_norm": 5.998396396636963,
9333
+ "learning_rate": 2.2788595615403475e-07,
9334
+ "loss": 0.3231,
9335
+ "step": 1226
9336
+ },
9337
+ {
9338
+ "epoch": 0.939150401836969,
9339
+ "grad_norm": 6.068146705627441,
9340
+ "learning_rate": 2.222454803066332e-07,
9341
+ "loss": 0.3236,
9342
+ "step": 1227
9343
+ },
9344
+ {
9345
+ "epoch": 0.9399158055874474,
9346
+ "grad_norm": 5.644654750823975,
9347
+ "learning_rate": 2.16674902158861e-07,
9348
+ "loss": 0.3332,
9349
+ "step": 1228
9350
+ },
9351
+ {
9352
+ "epoch": 0.9406812093379258,
9353
+ "grad_norm": 4.82579231262207,
9354
+ "learning_rate": 2.111742615328083e-07,
9355
+ "loss": 0.2132,
9356
+ "step": 1229
9357
+ },
9358
+ {
9359
+ "epoch": 0.9414466130884042,
9360
+ "grad_norm": 4.6144256591796875,
9361
+ "learning_rate": 2.057435977506028e-07,
9362
+ "loss": 0.2308,
9363
+ "step": 1230
9364
+ },
9365
+ {
9366
+ "epoch": 0.9422120168388826,
9367
+ "grad_norm": 10.00190258026123,
9368
+ "learning_rate": 2.0038294963413251e-07,
9369
+ "loss": 0.373,
9370
+ "step": 1231
9371
+ },
9372
+ {
9373
+ "epoch": 0.9429774205893608,
9374
+ "grad_norm": 5.754945755004883,
9375
+ "learning_rate": 1.9509235550477123e-07,
9376
+ "loss": 0.2395,
9377
+ "step": 1232
9378
+ },
9379
+ {
9380
+ "epoch": 0.9437428243398392,
9381
+ "grad_norm": 6.360520362854004,
9382
+ "learning_rate": 1.8987185318310009e-07,
9383
+ "loss": 0.1902,
9384
+ "step": 1233
9385
+ },
9386
+ {
9387
+ "epoch": 0.9445082280903176,
9388
+ "grad_norm": 9.590492248535156,
9389
+ "learning_rate": 1.8472147998863877e-07,
9390
+ "loss": 0.3155,
9391
+ "step": 1234
9392
+ },
9393
+ {
9394
+ "epoch": 0.945273631840796,
9395
+ "grad_norm": 7.996187686920166,
9396
+ "learning_rate": 1.796412727395802e-07,
9397
+ "loss": 0.3433,
9398
+ "step": 1235
9399
+ },
9400
+ {
9401
+ "epoch": 0.9460390355912744,
9402
+ "grad_norm": 4.422671794891357,
9403
+ "learning_rate": 1.7463126775252192e-07,
9404
+ "loss": 0.237,
9405
+ "step": 1236
9406
+ },
9407
+ {
9408
+ "epoch": 0.9468044393417527,
9409
+ "grad_norm": 6.761044979095459,
9410
+ "learning_rate": 1.6969150084221399e-07,
9411
+ "loss": 0.3662,
9412
+ "step": 1237
9413
+ },
9414
+ {
9415
+ "epoch": 0.9475698430922311,
9416
+ "grad_norm": 5.3165411949157715,
9417
+ "learning_rate": 1.6482200732129804e-07,
9418
+ "loss": 0.2149,
9419
+ "step": 1238
9420
+ },
9421
+ {
9422
+ "epoch": 0.9483352468427095,
9423
+ "grad_norm": 8.114785194396973,
9424
+ "learning_rate": 1.600228220000577e-07,
9425
+ "loss": 0.3416,
9426
+ "step": 1239
9427
+ },
9428
+ {
9429
+ "epoch": 0.9491006505931879,
9430
+ "grad_norm": 10.293120384216309,
9431
+ "learning_rate": 1.552939791861663e-07,
9432
+ "loss": 0.3409,
9433
+ "step": 1240
9434
+ },
9435
+ {
9436
+ "epoch": 0.9491006505931879,
9437
+ "eval_accuracy": 0.8880866425992779,
9438
+ "eval_f1": 0.8368421052631579,
9439
+ "eval_loss": 0.29501873254776,
9440
+ "eval_precision": 0.888268156424581,
9441
+ "eval_recall": 0.7910447761194029,
9442
+ "eval_runtime": 43.815,
9443
+ "eval_samples_per_second": 6.87,
9444
+ "eval_steps_per_second": 0.228,
9445
+ "step": 1240
9446
+ },
9447
+ {
9448
+ "epoch": 0.9498660543436663,
9449
+ "grad_norm": 6.4339799880981445,
9450
+ "learning_rate": 1.5063551268444275e-07,
9451
+ "loss": 0.3244,
9452
+ "step": 1241
9453
+ },
9454
+ {
9455
+ "epoch": 0.9506314580941446,
9456
+ "grad_norm": 5.49373722076416,
9457
+ "learning_rate": 1.4604745579661405e-07,
9458
+ "loss": 0.1764,
9459
+ "step": 1242
9460
+ },
9461
+ {
9462
+ "epoch": 0.951396861844623,
9463
+ "grad_norm": 6.4061126708984375,
9464
+ "learning_rate": 1.4152984132106972e-07,
9465
+ "loss": 0.3189,
9466
+ "step": 1243
9467
+ },
9468
+ {
9469
+ "epoch": 0.9521622655951014,
9470
+ "grad_norm": 5.936630725860596,
9471
+ "learning_rate": 1.370827015526355e-07,
9472
+ "loss": 0.3355,
9473
+ "step": 1244
9474
+ },
9475
+ {
9476
+ "epoch": 0.9529276693455798,
9477
+ "grad_norm": 14.100617408752441,
9478
+ "learning_rate": 1.3270606828233668e-07,
9479
+ "loss": 0.5053,
9480
+ "step": 1245
9481
+ },
9482
+ {
9483
+ "epoch": 0.9536930730960582,
9484
+ "grad_norm": 8.441110610961914,
9485
+ "learning_rate": 1.2839997279717075e-07,
9486
+ "loss": 0.274,
9487
+ "step": 1246
9488
+ },
9489
+ {
9490
+ "epoch": 0.9544584768465365,
9491
+ "grad_norm": 6.178558826446533,
9492
+ "learning_rate": 1.241644458798885e-07,
9493
+ "loss": 0.2966,
9494
+ "step": 1247
9495
+ },
9496
+ {
9497
+ "epoch": 0.9552238805970149,
9498
+ "grad_norm": 6.316476345062256,
9499
+ "learning_rate": 1.1999951780876872e-07,
9500
+ "loss": 0.2785,
9501
+ "step": 1248
9502
+ },
9503
+ {
9504
+ "epoch": 0.9559892843474933,
9505
+ "grad_norm": 6.520962238311768,
9506
+ "learning_rate": 1.159052183574072e-07,
9507
+ "loss": 0.2933,
9508
+ "step": 1249
9509
+ },
9510
+ {
9511
+ "epoch": 0.9567546880979717,
9512
+ "grad_norm": 6.651547431945801,
9513
+ "learning_rate": 1.1188157679449585e-07,
9514
+ "loss": 0.2775,
9515
+ "step": 1250
9516
+ },
9517
+ {
9518
+ "epoch": 0.9575200918484501,
9519
+ "grad_norm": 5.902339935302734,
9520
+ "learning_rate": 1.0792862188362396e-07,
9521
+ "loss": 0.2386,
9522
+ "step": 1251
9523
+ },
9524
+ {
9525
+ "epoch": 0.9582854955989284,
9526
+ "grad_norm": 7.483514308929443,
9527
+ "learning_rate": 1.0404638188306504e-07,
9528
+ "loss": 0.2501,
9529
+ "step": 1252
9530
+ },
9531
+ {
9532
+ "epoch": 0.9590508993494068,
9533
+ "grad_norm": 6.495910167694092,
9534
+ "learning_rate": 1.002348845455725e-07,
9535
+ "loss": 0.3872,
9536
+ "step": 1253
9537
+ },
9538
+ {
9539
+ "epoch": 0.9598163030998852,
9540
+ "grad_norm": 6.121851921081543,
9541
+ "learning_rate": 9.64941571181921e-08,
9542
+ "loss": 0.3186,
9543
+ "step": 1254
9544
+ },
9545
+ {
9546
+ "epoch": 0.9605817068503636,
9547
+ "grad_norm": 6.671183109283447,
9548
+ "learning_rate": 9.282422634205645e-08,
9549
+ "loss": 0.2947,
9550
+ "step": 1255
9551
+ },
9552
+ {
9553
+ "epoch": 0.961347110600842,
9554
+ "grad_norm": 5.844105243682861,
9555
+ "learning_rate": 8.922511845219972e-08,
9556
+ "loss": 0.2272,
9557
+ "step": 1256
9558
+ },
9559
+ {
9560
+ "epoch": 0.9621125143513203,
9561
+ "grad_norm": 6.843101501464844,
9562
+ "learning_rate": 8.569685917736659e-08,
9563
+ "loss": 0.2826,
9564
+ "step": 1257
9565
+ },
9566
+ {
9567
+ "epoch": 0.9628779181017987,
9568
+ "grad_norm": 6.810047626495361,
9569
+ "learning_rate": 8.223947373983354e-08,
9570
+ "loss": 0.2737,
9571
+ "step": 1258
9572
+ },
9573
+ {
9574
+ "epoch": 0.9636433218522771,
9575
+ "grad_norm": 6.269131660461426,
9576
+ "learning_rate": 7.885298685522235e-08,
9577
+ "loss": 0.3041,
9578
+ "step": 1259
9579
+ },
9580
+ {
9581
+ "epoch": 0.9644087256027555,
9582
+ "grad_norm": 7.05451774597168,
9583
+ "learning_rate": 7.553742273232578e-08,
9584
+ "loss": 0.3316,
9585
+ "step": 1260
9586
+ },
9587
+ {
9588
+ "epoch": 0.9644087256027555,
9589
+ "eval_accuracy": 0.8898916967509025,
9590
+ "eval_f1": 0.8390501319261213,
9591
+ "eval_loss": 0.2938833236694336,
9592
+ "eval_precision": 0.8932584269662921,
9593
+ "eval_recall": 0.7910447761194029,
9594
+ "eval_runtime": 43.817,
9595
+ "eval_samples_per_second": 6.869,
9596
+ "eval_steps_per_second": 0.228,
9597
+ "step": 1260
9598
+ },
9599
+ {
9600
+ "epoch": 0.9651741293532339,
9601
+ "grad_norm": 7.257000923156738,
9602
+ "learning_rate": 7.229280507293657e-08,
9603
+ "loss": 0.3027,
9604
+ "step": 1261
9605
+ },
9606
+ {
9607
+ "epoch": 0.9659395331037122,
9608
+ "grad_norm": 8.234956741333008,
9609
+ "learning_rate": 6.911915707167538e-08,
9610
+ "loss": 0.3549,
9611
+ "step": 1262
9612
+ },
9613
+ {
9614
+ "epoch": 0.9667049368541906,
9615
+ "grad_norm": 6.89831018447876,
9616
+ "learning_rate": 6.601650141582649e-08,
9617
+ "loss": 0.2276,
9618
+ "step": 1263
9619
+ },
9620
+ {
9621
+ "epoch": 0.967470340604669,
9622
+ "grad_norm": 5.264804840087891,
9623
+ "learning_rate": 6.29848602851768e-08,
9624
+ "loss": 0.2677,
9625
+ "step": 1264
9626
+ },
9627
+ {
9628
+ "epoch": 0.9682357443551474,
9629
+ "grad_norm": 7.13667631149292,
9630
+ "learning_rate": 6.002425535185041e-08,
9631
+ "loss": 0.3305,
9632
+ "step": 1265
9633
+ },
9634
+ {
9635
+ "epoch": 0.9690011481056258,
9636
+ "grad_norm": 5.207520008087158,
9637
+ "learning_rate": 5.713470778016539e-08,
9638
+ "loss": 0.2083,
9639
+ "step": 1266
9640
+ },
9641
+ {
9642
+ "epoch": 0.969766551856104,
9643
+ "grad_norm": 5.961206436157227,
9644
+ "learning_rate": 5.4316238226469476e-08,
9645
+ "loss": 0.2633,
9646
+ "step": 1267
9647
+ },
9648
+ {
9649
+ "epoch": 0.9705319556065825,
9650
+ "grad_norm": 11.930121421813965,
9651
+ "learning_rate": 5.1568866839003525e-08,
9652
+ "loss": 0.3997,
9653
+ "step": 1268
9654
+ },
9655
+ {
9656
+ "epoch": 0.9712973593570609,
9657
+ "grad_norm": 6.59713077545166,
9658
+ "learning_rate": 4.889261325775163e-08,
9659
+ "loss": 0.2437,
9660
+ "step": 1269
9661
+ },
9662
+ {
9663
+ "epoch": 0.9720627631075393,
9664
+ "grad_norm": 7.702863693237305,
9665
+ "learning_rate": 4.628749661430121e-08,
9666
+ "loss": 0.3456,
9667
+ "step": 1270
9668
+ },
9669
+ {
9670
+ "epoch": 0.9728281668580177,
9671
+ "grad_norm": 7.830643177032471,
9672
+ "learning_rate": 4.375353553170647e-08,
9673
+ "loss": 0.3608,
9674
+ "step": 1271
9675
+ },
9676
+ {
9677
+ "epoch": 0.9735935706084959,
9678
+ "grad_norm": 7.027949333190918,
9679
+ "learning_rate": 4.1290748124358513e-08,
9680
+ "loss": 0.2728,
9681
+ "step": 1272
9682
+ },
9683
+ {
9684
+ "epoch": 0.9743589743589743,
9685
+ "grad_norm": 9.216780662536621,
9686
+ "learning_rate": 3.889915199784877e-08,
9687
+ "loss": 0.3055,
9688
+ "step": 1273
9689
+ },
9690
+ {
9691
+ "epoch": 0.9751243781094527,
9692
+ "grad_norm": 5.373678684234619,
9693
+ "learning_rate": 3.657876424885243e-08,
9694
+ "loss": 0.2806,
9695
+ "step": 1274
9696
+ },
9697
+ {
9698
+ "epoch": 0.9758897818599311,
9699
+ "grad_norm": 6.474977970123291,
9700
+ "learning_rate": 3.432960146499631e-08,
9701
+ "loss": 0.3257,
9702
+ "step": 1275
9703
+ },
9704
+ {
9705
+ "epoch": 0.9766551856104095,
9706
+ "grad_norm": 8.3179292678833,
9707
+ "learning_rate": 3.2151679724748974e-08,
9708
+ "loss": 0.3389,
9709
+ "step": 1276
9710
+ },
9711
+ {
9712
+ "epoch": 0.9774205893608878,
9713
+ "grad_norm": 5.711795806884766,
9714
+ "learning_rate": 3.0045014597299695e-08,
9715
+ "loss": 0.2503,
9716
+ "step": 1277
9717
+ },
9718
+ {
9719
+ "epoch": 0.9781859931113662,
9720
+ "grad_norm": 5.385677337646484,
9721
+ "learning_rate": 2.800962114245076e-08,
9722
+ "loss": 0.2485,
9723
+ "step": 1278
9724
+ },
9725
+ {
9726
+ "epoch": 0.9789513968618446,
9727
+ "grad_norm": 3.9317917823791504,
9728
+ "learning_rate": 2.6045513910509802e-08,
9729
+ "loss": 0.212,
9730
+ "step": 1279
9731
+ },
9732
+ {
9733
+ "epoch": 0.979716800612323,
9734
+ "grad_norm": 4.621948719024658,
9735
+ "learning_rate": 2.415270694217986e-08,
9736
+ "loss": 0.1957,
9737
+ "step": 1280
9738
+ },
9739
+ {
9740
+ "epoch": 0.979716800612323,
9741
+ "eval_accuracy": 0.8898916967509025,
9742
+ "eval_f1": 0.8390501319261213,
9743
+ "eval_loss": 0.2945975959300995,
9744
+ "eval_precision": 0.8932584269662921,
9745
+ "eval_recall": 0.7910447761194029,
9746
+ "eval_runtime": 42.919,
9747
+ "eval_samples_per_second": 7.013,
9748
+ "eval_steps_per_second": 0.233,
9749
+ "step": 1280
9750
+ },
9751
+ {
9752
+ "epoch": 0.9804822043628014,
9753
+ "grad_norm": 6.141805648803711,
9754
+ "learning_rate": 2.2331213768468363e-08,
9755
+ "loss": 0.2438,
9756
+ "step": 1281
9757
+ },
9758
+ {
9759
+ "epoch": 0.9812476081132797,
9760
+ "grad_norm": 5.874077320098877,
9761
+ "learning_rate": 2.0581047410583865e-08,
9762
+ "loss": 0.343,
9763
+ "step": 1282
9764
+ },
9765
+ {
9766
+ "epoch": 0.9820130118637581,
9767
+ "grad_norm": 9.686785697937012,
9768
+ "learning_rate": 1.8902220379846125e-08,
9769
+ "loss": 0.4448,
9770
+ "step": 1283
9771
+ },
9772
+ {
9773
+ "epoch": 0.9827784156142365,
9774
+ "grad_norm": 6.589422225952148,
9775
+ "learning_rate": 1.7294744677591733e-08,
9776
+ "loss": 0.3774,
9777
+ "step": 1284
9778
+ },
9779
+ {
9780
+ "epoch": 0.9835438193647149,
9781
+ "grad_norm": 7.531107425689697,
9782
+ "learning_rate": 1.57586317950964e-08,
9783
+ "loss": 0.2591,
9784
+ "step": 1285
9785
+ },
9786
+ {
9787
+ "epoch": 0.9843092231151933,
9788
+ "grad_norm": 6.169864654541016,
9789
+ "learning_rate": 1.4293892713486135e-08,
9790
+ "loss": 0.3366,
9791
+ "step": 1286
9792
+ },
9793
+ {
9794
+ "epoch": 0.9850746268656716,
9795
+ "grad_norm": 7.703701496124268,
9796
+ "learning_rate": 1.2900537903660637e-08,
9797
+ "loss": 0.2595,
9798
+ "step": 1287
9799
+ },
9800
+ {
9801
+ "epoch": 0.98584003061615,
9802
+ "grad_norm": 5.90448522567749,
9803
+ "learning_rate": 1.157857732622003e-08,
9804
+ "loss": 0.2492,
9805
+ "step": 1288
9806
+ },
9807
+ {
9808
+ "epoch": 0.9866054343666284,
9809
+ "grad_norm": 5.025811672210693,
9810
+ "learning_rate": 1.0328020431391583e-08,
9811
+ "loss": 0.2422,
9812
+ "step": 1289
9813
+ },
9814
+ {
9815
+ "epoch": 0.9873708381171068,
9816
+ "grad_norm": 5.388332843780518,
9817
+ "learning_rate": 9.148876158961983e-09,
9818
+ "loss": 0.2482,
9819
+ "step": 1290
9820
+ },
9821
+ {
9822
+ "epoch": 0.9881362418675852,
9823
+ "grad_norm": 4.219669342041016,
9824
+ "learning_rate": 8.041152938216278e-09,
9825
+ "loss": 0.2682,
9826
+ "step": 1291
9827
+ },
9828
+ {
9829
+ "epoch": 0.9889016456180635,
9830
+ "grad_norm": 7.032052516937256,
9831
+ "learning_rate": 7.004858687874594e-09,
9832
+ "loss": 0.2261,
9833
+ "step": 1292
9834
+ },
9835
+ {
9836
+ "epoch": 0.9896670493685419,
9837
+ "grad_norm": 5.230202674865723,
9838
+ "learning_rate": 6.040000816037728e-09,
9839
+ "loss": 0.2749,
9840
+ "step": 1293
9841
+ },
9842
+ {
9843
+ "epoch": 0.9904324531190203,
9844
+ "grad_norm": 6.469751358032227,
9845
+ "learning_rate": 5.146586220131644e-09,
9846
+ "loss": 0.1947,
9847
+ "step": 1294
9848
+ },
9849
+ {
9850
+ "epoch": 0.9911978568694987,
9851
+ "grad_norm": 4.652950286865234,
9852
+ "learning_rate": 4.324621286861952e-09,
9853
+ "loss": 0.1941,
9854
+ "step": 1295
9855
+ },
9856
+ {
9857
+ "epoch": 0.9919632606199771,
9858
+ "grad_norm": 9.259235382080078,
9859
+ "learning_rate": 3.5741118921628346e-09,
9860
+ "loss": 0.2713,
9861
+ "step": 1296
9862
+ },
9863
+ {
9864
+ "epoch": 0.9927286643704554,
9865
+ "grad_norm": 6.85486364364624,
9866
+ "learning_rate": 2.895063401160414e-09,
9867
+ "loss": 0.3251,
9868
+ "step": 1297
9869
+ },
9870
+ {
9871
+ "epoch": 0.9934940681209338,
9872
+ "grad_norm": 9.239498138427734,
9873
+ "learning_rate": 2.2874806681305593e-09,
9874
+ "loss": 0.2696,
9875
+ "step": 1298
9876
+ },
9877
+ {
9878
+ "epoch": 0.9942594718714122,
9879
+ "grad_norm": 4.937226295471191,
9880
+ "learning_rate": 1.7513680364689145e-09,
9881
+ "loss": 0.2714,
9882
+ "step": 1299
9883
+ },
9884
+ {
9885
+ "epoch": 0.9950248756218906,
9886
+ "grad_norm": 8.691539764404297,
9887
+ "learning_rate": 1.2867293386531476e-09,
9888
+ "loss": 0.2439,
9889
+ "step": 1300
9890
+ },
9891
+ {
9892
+ "epoch": 0.9950248756218906,
9893
+ "eval_accuracy": 0.8898916967509025,
9894
+ "eval_f1": 0.8390501319261213,
9895
+ "eval_loss": 0.2946934700012207,
9896
+ "eval_precision": 0.8932584269662921,
9897
+ "eval_recall": 0.7910447761194029,
9898
+ "eval_runtime": 43.3576,
9899
+ "eval_samples_per_second": 6.942,
9900
+ "eval_steps_per_second": 0.231,
9901
+ "step": 1300
9902
  }
9903
  ],
9904
  "logging_steps": 1,
 
9918
  "attributes": {}
9919
  }
9920
  },
9921
+ "total_flos": 1.9972530726187827e+17,
9922
  "train_batch_size": 8,
9923
  "trial_name": null,
9924
  "trial_params": null