mtzig commited on
Commit
e0631a2
·
verified ·
1 Parent(s): fdd872f

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4963b2cd50e57f6b515f56f974dfe45d992517004fc520f6955d81d176f790a7
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec0b23fb29b3168d711126e63c390cefd28562954b8b8ef4840f478aa2aec88c
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c68fdb607ac4683a8dca2ee6fa979f122c8b9e4f5a8be05c199df9ffdf462632
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:719e2f980220aec49e7260281d780c4b705e08e44da393275851a1c0ada4a677
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8f1c5f0040695754e95cea31a92ba95b979e25d690f9d64c169d66dc419f228
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9348a9304f5360f4c21ad45a44bbc0168ae2b80712245674a9a769f6eb7aa152
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8e9491dc652893fb060e965d69962558957f031ecac809634247164f574e608
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e3bcd9ed5fbe41d6c66f117fd9b6d9de2950c64c39151e84cf8161db4402f2
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d035efe305175a2f8bfbab2f8d3e62cee6f2ced0a908d7eb6c1e495846d0737b
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b902873a146629a1ece23ec06ea7f89258be805f622edd3f56bb4e27b370d1f
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc84a14f60fa5fc54638a6ef9c91a376a58f7ab98ea47d3c31d76ce89c680414
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90be759819f75fd58133ca2a31ffb7e3abdb5e55026b34e76783d4cbdb7645cc
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac4073bdc83a079a4b6896bc4a6496aeea9ed2771c055d341f2925ac3e15b804
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bac79d5ff2444d45cca1c402876704174077427c6b7d2902ab84bdd3aeb6a4c1
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a66dd53cdd3497de414d53e774c65997b79be629a8b45594ad103171f71988a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36fa53207112e96cf1d931008a46af86708bec88e31fc02618c631b73b238844
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30f7efb14f830613ce1feb32c656507e2b2715abb7e03351d4ab24fdac833de9
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:002fb55933219d3afc15cf13593cee3b4cc68a24a920a24f43ed82f5a081cc35
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b570acbde5d360a8111669f646b87b723b1561b932314383883cb8d7631cf6a5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7218e38300ea7b9c89377dc67299aac317cf89fbd5fed41930f1224bfa1fab89
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce2f6e01a41ead5e1f0563b520db5e55e49adbcab1ba2767d62506fd2c2fd350
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dd6521f2fc3283d23bd25a27f2810cab0424e95a40073d1ed6cfb13ed15dc2f
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b05228164f2365d0c73a013ea3f0d6ff1002ba4a518c68a6670e0c417be0df02
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b47a7144b9ff045949aa91aea787001a456200397c8e845704484a0656bbbacc
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:feec47af43281c02d149007021796dc5997bea57643d860b75184d48d617927b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af32edbfbf253ca5324b65d305f359aaf2d7238a6c9110be03e0839d25660469
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2706359945872801,
5
  "eval_steps": 20,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3059,6 +3059,766 @@
3059
  "eval_samples_per_second": 5.737,
3060
  "eval_steps_per_second": 0.193,
3061
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3062
  }
3063
  ],
3064
  "logging_steps": 1,
@@ -3078,7 +3838,7 @@
3078
  "attributes": {}
3079
  }
3080
  },
3081
- "total_flos": 1.2098518370798797e+17,
3082
  "train_batch_size": 8,
3083
  "trial_name": null,
3084
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3382949932341001,
5
  "eval_steps": 20,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3059
  "eval_samples_per_second": 5.737,
3060
  "eval_steps_per_second": 0.193,
3061
  "step": 400
3062
+ },
3063
+ {
3064
+ "epoch": 0.2713125845737483,
3065
+ "grad_norm": 3.803469181060791,
3066
+ "learning_rate": 1.8266820575458908e-05,
3067
+ "loss": 0.1116,
3068
+ "step": 401
3069
+ },
3070
+ {
3071
+ "epoch": 0.2719891745602165,
3072
+ "grad_norm": 4.727139949798584,
3073
+ "learning_rate": 1.8253506714608176e-05,
3074
+ "loss": 0.1973,
3075
+ "step": 402
3076
+ },
3077
+ {
3078
+ "epoch": 0.27266576454668473,
3079
+ "grad_norm": 4.788311004638672,
3080
+ "learning_rate": 1.8240146803212854e-05,
3081
+ "loss": 0.2294,
3082
+ "step": 403
3083
+ },
3084
+ {
3085
+ "epoch": 0.2733423545331529,
3086
+ "grad_norm": 9.147326469421387,
3087
+ "learning_rate": 1.822674091581474e-05,
3088
+ "loss": 0.1194,
3089
+ "step": 404
3090
+ },
3091
+ {
3092
+ "epoch": 0.2740189445196211,
3093
+ "grad_norm": 3.2791059017181396,
3094
+ "learning_rate": 1.8213289127212152e-05,
3095
+ "loss": 0.0931,
3096
+ "step": 405
3097
+ },
3098
+ {
3099
+ "epoch": 0.2746955345060893,
3100
+ "grad_norm": 4.282406330108643,
3101
+ "learning_rate": 1.8199791512459507e-05,
3102
+ "loss": 0.1893,
3103
+ "step": 406
3104
+ },
3105
+ {
3106
+ "epoch": 0.2753721244925575,
3107
+ "grad_norm": 5.307563781738281,
3108
+ "learning_rate": 1.8186248146866928e-05,
3109
+ "loss": 0.1627,
3110
+ "step": 407
3111
+ },
3112
+ {
3113
+ "epoch": 0.27604871447902574,
3114
+ "grad_norm": 3.126235246658325,
3115
+ "learning_rate": 1.817265910599978e-05,
3116
+ "loss": 0.1254,
3117
+ "step": 408
3118
+ },
3119
+ {
3120
+ "epoch": 0.2767253044654939,
3121
+ "grad_norm": 3.751150369644165,
3122
+ "learning_rate": 1.81590244656783e-05,
3123
+ "loss": 0.1884,
3124
+ "step": 409
3125
+ },
3126
+ {
3127
+ "epoch": 0.2774018944519621,
3128
+ "grad_norm": 4.908536434173584,
3129
+ "learning_rate": 1.8145344301977126e-05,
3130
+ "loss": 0.1522,
3131
+ "step": 410
3132
+ },
3133
+ {
3134
+ "epoch": 0.2780784844384303,
3135
+ "grad_norm": 3.765190601348877,
3136
+ "learning_rate": 1.8131618691224916e-05,
3137
+ "loss": 0.1509,
3138
+ "step": 411
3139
+ },
3140
+ {
3141
+ "epoch": 0.2787550744248985,
3142
+ "grad_norm": 4.558242321014404,
3143
+ "learning_rate": 1.811784771000387e-05,
3144
+ "loss": 0.1151,
3145
+ "step": 412
3146
+ },
3147
+ {
3148
+ "epoch": 0.2794316644113667,
3149
+ "grad_norm": 3.2288334369659424,
3150
+ "learning_rate": 1.8104031435149366e-05,
3151
+ "loss": 0.1238,
3152
+ "step": 413
3153
+ },
3154
+ {
3155
+ "epoch": 0.28010825439783493,
3156
+ "grad_norm": 3.91261625289917,
3157
+ "learning_rate": 1.8090169943749477e-05,
3158
+ "loss": 0.1979,
3159
+ "step": 414
3160
+ },
3161
+ {
3162
+ "epoch": 0.2807848443843031,
3163
+ "grad_norm": 5.789203643798828,
3164
+ "learning_rate": 1.8076263313144568e-05,
3165
+ "loss": 0.1015,
3166
+ "step": 415
3167
+ },
3168
+ {
3169
+ "epoch": 0.2814614343707713,
3170
+ "grad_norm": 3.4970619678497314,
3171
+ "learning_rate": 1.806231162092686e-05,
3172
+ "loss": 0.1626,
3173
+ "step": 416
3174
+ },
3175
+ {
3176
+ "epoch": 0.2821380243572395,
3177
+ "grad_norm": 2.941303253173828,
3178
+ "learning_rate": 1.804831494494e-05,
3179
+ "loss": 0.1262,
3180
+ "step": 417
3181
+ },
3182
+ {
3183
+ "epoch": 0.2828146143437077,
3184
+ "grad_norm": 3.1387312412261963,
3185
+ "learning_rate": 1.8034273363278615e-05,
3186
+ "loss": 0.112,
3187
+ "step": 418
3188
+ },
3189
+ {
3190
+ "epoch": 0.28349120433017594,
3191
+ "grad_norm": 3.1637914180755615,
3192
+ "learning_rate": 1.8020186954287883e-05,
3193
+ "loss": 0.1387,
3194
+ "step": 419
3195
+ },
3196
+ {
3197
+ "epoch": 0.28416779431664413,
3198
+ "grad_norm": 7.6281538009643555,
3199
+ "learning_rate": 1.8006055796563103e-05,
3200
+ "loss": 0.1498,
3201
+ "step": 420
3202
+ },
3203
+ {
3204
+ "epoch": 0.28416779431664413,
3205
+ "eval_accuracy": 0.7880733944954128,
3206
+ "eval_f1": 0.43795620437956206,
3207
+ "eval_loss": 0.4631403982639313,
3208
+ "eval_precision": 0.7964601769911505,
3209
+ "eval_recall": 0.30201342281879195,
3210
+ "eval_runtime": 53.6246,
3211
+ "eval_samples_per_second": 5.557,
3212
+ "eval_steps_per_second": 0.186,
3213
+ "step": 420
3214
+ },
3215
+ {
3216
+ "epoch": 0.2848443843031123,
3217
+ "grad_norm": 7.874175071716309,
3218
+ "learning_rate": 1.7991879968949248e-05,
3219
+ "loss": 0.1542,
3220
+ "step": 421
3221
+ },
3222
+ {
3223
+ "epoch": 0.2855209742895805,
3224
+ "grad_norm": 2.5916264057159424,
3225
+ "learning_rate": 1.797765955054053e-05,
3226
+ "loss": 0.1319,
3227
+ "step": 422
3228
+ },
3229
+ {
3230
+ "epoch": 0.2861975642760487,
3231
+ "grad_norm": 9.027409553527832,
3232
+ "learning_rate": 1.7963394620679945e-05,
3233
+ "loss": 0.2224,
3234
+ "step": 423
3235
+ },
3236
+ {
3237
+ "epoch": 0.2868741542625169,
3238
+ "grad_norm": 2.5223119258880615,
3239
+ "learning_rate": 1.7949085258958853e-05,
3240
+ "loss": 0.1183,
3241
+ "step": 424
3242
+ },
3243
+ {
3244
+ "epoch": 0.28755074424898514,
3245
+ "grad_norm": 1.9095633029937744,
3246
+ "learning_rate": 1.7934731545216515e-05,
3247
+ "loss": 0.1178,
3248
+ "step": 425
3249
+ },
3250
+ {
3251
+ "epoch": 0.2882273342354533,
3252
+ "grad_norm": 3.547039031982422,
3253
+ "learning_rate": 1.792033355953966e-05,
3254
+ "loss": 0.1246,
3255
+ "step": 426
3256
+ },
3257
+ {
3258
+ "epoch": 0.2889039242219215,
3259
+ "grad_norm": 3.945955991744995,
3260
+ "learning_rate": 1.790589138226203e-05,
3261
+ "loss": 0.2155,
3262
+ "step": 427
3263
+ },
3264
+ {
3265
+ "epoch": 0.2895805142083897,
3266
+ "grad_norm": 1.659956932067871,
3267
+ "learning_rate": 1.789140509396394e-05,
3268
+ "loss": 0.0721,
3269
+ "step": 428
3270
+ },
3271
+ {
3272
+ "epoch": 0.2902571041948579,
3273
+ "grad_norm": 3.547576427459717,
3274
+ "learning_rate": 1.7876874775471806e-05,
3275
+ "loss": 0.1895,
3276
+ "step": 429
3277
+ },
3278
+ {
3279
+ "epoch": 0.29093369418132614,
3280
+ "grad_norm": 5.191123008728027,
3281
+ "learning_rate": 1.7862300507857733e-05,
3282
+ "loss": 0.2105,
3283
+ "step": 430
3284
+ },
3285
+ {
3286
+ "epoch": 0.29161028416779433,
3287
+ "grad_norm": 4.68615198135376,
3288
+ "learning_rate": 1.7847682372439024e-05,
3289
+ "loss": 0.2427,
3290
+ "step": 431
3291
+ },
3292
+ {
3293
+ "epoch": 0.2922868741542625,
3294
+ "grad_norm": 7.467837333679199,
3295
+ "learning_rate": 1.7833020450777756e-05,
3296
+ "loss": 0.255,
3297
+ "step": 432
3298
+ },
3299
+ {
3300
+ "epoch": 0.2929634641407307,
3301
+ "grad_norm": 4.769316673278809,
3302
+ "learning_rate": 1.78183148246803e-05,
3303
+ "loss": 0.2349,
3304
+ "step": 433
3305
+ },
3306
+ {
3307
+ "epoch": 0.2936400541271989,
3308
+ "grad_norm": 2.3752694129943848,
3309
+ "learning_rate": 1.7803565576196884e-05,
3310
+ "loss": 0.1347,
3311
+ "step": 434
3312
+ },
3313
+ {
3314
+ "epoch": 0.2943166441136671,
3315
+ "grad_norm": 2.9256367683410645,
3316
+ "learning_rate": 1.7788772787621126e-05,
3317
+ "loss": 0.19,
3318
+ "step": 435
3319
+ },
3320
+ {
3321
+ "epoch": 0.29499323410013534,
3322
+ "grad_norm": 2.6127521991729736,
3323
+ "learning_rate": 1.7773936541489577e-05,
3324
+ "loss": 0.1579,
3325
+ "step": 436
3326
+ },
3327
+ {
3328
+ "epoch": 0.2956698240866035,
3329
+ "grad_norm": 1.9983330965042114,
3330
+ "learning_rate": 1.7759056920581256e-05,
3331
+ "loss": 0.1109,
3332
+ "step": 437
3333
+ },
3334
+ {
3335
+ "epoch": 0.2963464140730717,
3336
+ "grad_norm": 2.2543447017669678,
3337
+ "learning_rate": 1.7744134007917195e-05,
3338
+ "loss": 0.1244,
3339
+ "step": 438
3340
+ },
3341
+ {
3342
+ "epoch": 0.2970230040595399,
3343
+ "grad_norm": 3.1791696548461914,
3344
+ "learning_rate": 1.7729167886759974e-05,
3345
+ "loss": 0.1867,
3346
+ "step": 439
3347
+ },
3348
+ {
3349
+ "epoch": 0.2976995940460081,
3350
+ "grad_norm": 2.7958037853240967,
3351
+ "learning_rate": 1.771415864061326e-05,
3352
+ "loss": 0.1344,
3353
+ "step": 440
3354
+ },
3355
+ {
3356
+ "epoch": 0.2976995940460081,
3357
+ "eval_accuracy": 0.7963302752293578,
3358
+ "eval_f1": 0.47641509433962265,
3359
+ "eval_loss": 0.42485642433166504,
3360
+ "eval_precision": 0.8015873015873016,
3361
+ "eval_recall": 0.3389261744966443,
3362
+ "eval_runtime": 52.4984,
3363
+ "eval_samples_per_second": 5.676,
3364
+ "eval_steps_per_second": 0.19,
3365
+ "step": 440
3366
+ },
3367
+ {
3368
+ "epoch": 0.29837618403247634,
3369
+ "grad_norm": 2.20705509185791,
3370
+ "learning_rate": 1.7699106353221322e-05,
3371
+ "loss": 0.1233,
3372
+ "step": 441
3373
+ },
3374
+ {
3375
+ "epoch": 0.29905277401894453,
3376
+ "grad_norm": 2.328334331512451,
3377
+ "learning_rate": 1.7684011108568593e-05,
3378
+ "loss": 0.1142,
3379
+ "step": 442
3380
+ },
3381
+ {
3382
+ "epoch": 0.2997293640054127,
3383
+ "grad_norm": 3.256822109222412,
3384
+ "learning_rate": 1.7668872990879175e-05,
3385
+ "loss": 0.1556,
3386
+ "step": 443
3387
+ },
3388
+ {
3389
+ "epoch": 0.3004059539918809,
3390
+ "grad_norm": 2.4061648845672607,
3391
+ "learning_rate": 1.765369208461639e-05,
3392
+ "loss": 0.0828,
3393
+ "step": 444
3394
+ },
3395
+ {
3396
+ "epoch": 0.3010825439783491,
3397
+ "grad_norm": 5.99202299118042,
3398
+ "learning_rate": 1.7638468474482297e-05,
3399
+ "loss": 0.157,
3400
+ "step": 445
3401
+ },
3402
+ {
3403
+ "epoch": 0.3017591339648173,
3404
+ "grad_norm": 3.7360379695892334,
3405
+ "learning_rate": 1.762320224541722e-05,
3406
+ "loss": 0.1257,
3407
+ "step": 446
3408
+ },
3409
+ {
3410
+ "epoch": 0.30243572395128554,
3411
+ "grad_norm": 3.2651238441467285,
3412
+ "learning_rate": 1.760789348259927e-05,
3413
+ "loss": 0.1732,
3414
+ "step": 447
3415
+ },
3416
+ {
3417
+ "epoch": 0.30311231393775373,
3418
+ "grad_norm": 3.5508763790130615,
3419
+ "learning_rate": 1.7592542271443888e-05,
3420
+ "loss": 0.1644,
3421
+ "step": 448
3422
+ },
3423
+ {
3424
+ "epoch": 0.3037889039242219,
3425
+ "grad_norm": 4.703643798828125,
3426
+ "learning_rate": 1.757714869760335e-05,
3427
+ "loss": 0.2103,
3428
+ "step": 449
3429
+ },
3430
+ {
3431
+ "epoch": 0.3044654939106901,
3432
+ "grad_norm": 5.588313102722168,
3433
+ "learning_rate": 1.756171284696629e-05,
3434
+ "loss": 0.187,
3435
+ "step": 450
3436
+ },
3437
+ {
3438
+ "epoch": 0.3051420838971583,
3439
+ "grad_norm": 1.9803051948547363,
3440
+ "learning_rate": 1.7546234805657235e-05,
3441
+ "loss": 0.0944,
3442
+ "step": 451
3443
+ },
3444
+ {
3445
+ "epoch": 0.30581867388362655,
3446
+ "grad_norm": 3.1527740955352783,
3447
+ "learning_rate": 1.7530714660036112e-05,
3448
+ "loss": 0.1105,
3449
+ "step": 452
3450
+ },
3451
+ {
3452
+ "epoch": 0.30649526387009474,
3453
+ "grad_norm": 4.078627109527588,
3454
+ "learning_rate": 1.7515152496697765e-05,
3455
+ "loss": 0.161,
3456
+ "step": 453
3457
+ },
3458
+ {
3459
+ "epoch": 0.3071718538565629,
3460
+ "grad_norm": 4.430943489074707,
3461
+ "learning_rate": 1.749954840247148e-05,
3462
+ "loss": 0.1883,
3463
+ "step": 454
3464
+ },
3465
+ {
3466
+ "epoch": 0.3078484438430311,
3467
+ "grad_norm": 3.115837335586548,
3468
+ "learning_rate": 1.7483902464420507e-05,
3469
+ "loss": 0.1122,
3470
+ "step": 455
3471
+ },
3472
+ {
3473
+ "epoch": 0.3085250338294993,
3474
+ "grad_norm": 3.008695602416992,
3475
+ "learning_rate": 1.7468214769841542e-05,
3476
+ "loss": 0.1034,
3477
+ "step": 456
3478
+ },
3479
+ {
3480
+ "epoch": 0.3092016238159675,
3481
+ "grad_norm": 6.273781776428223,
3482
+ "learning_rate": 1.7452485406264278e-05,
3483
+ "loss": 0.1709,
3484
+ "step": 457
3485
+ },
3486
+ {
3487
+ "epoch": 0.30987821380243574,
3488
+ "grad_norm": 4.796054363250732,
3489
+ "learning_rate": 1.74367144614509e-05,
3490
+ "loss": 0.1932,
3491
+ "step": 458
3492
+ },
3493
+ {
3494
+ "epoch": 0.31055480378890393,
3495
+ "grad_norm": 10.217569351196289,
3496
+ "learning_rate": 1.742090202339559e-05,
3497
+ "loss": 0.19,
3498
+ "step": 459
3499
+ },
3500
+ {
3501
+ "epoch": 0.3112313937753721,
3502
+ "grad_norm": 3.481541395187378,
3503
+ "learning_rate": 1.7405048180324046e-05,
3504
+ "loss": 0.161,
3505
+ "step": 460
3506
+ },
3507
+ {
3508
+ "epoch": 0.3112313937753721,
3509
+ "eval_accuracy": 0.8091743119266055,
3510
+ "eval_f1": 0.5336322869955157,
3511
+ "eval_loss": 0.42575448751449585,
3512
+ "eval_precision": 0.8040540540540541,
3513
+ "eval_recall": 0.39932885906040266,
3514
+ "eval_runtime": 53.3596,
3515
+ "eval_samples_per_second": 5.585,
3516
+ "eval_steps_per_second": 0.187,
3517
+ "step": 460
3518
+ },
3519
+ {
3520
+ "epoch": 0.3119079837618403,
3521
+ "grad_norm": 3.4007368087768555,
3522
+ "learning_rate": 1.7389153020692985e-05,
3523
+ "loss": 0.1502,
3524
+ "step": 461
3525
+ },
3526
+ {
3527
+ "epoch": 0.3125845737483085,
3528
+ "grad_norm": 3.0644993782043457,
3529
+ "learning_rate": 1.7373216633189653e-05,
3530
+ "loss": 0.1749,
3531
+ "step": 462
3532
+ },
3533
+ {
3534
+ "epoch": 0.31326116373477675,
3535
+ "grad_norm": 4.407646179199219,
3536
+ "learning_rate": 1.735723910673132e-05,
3537
+ "loss": 0.1703,
3538
+ "step": 463
3539
+ },
3540
+ {
3541
+ "epoch": 0.31393775372124494,
3542
+ "grad_norm": 3.6031856536865234,
3543
+ "learning_rate": 1.7341220530464796e-05,
3544
+ "loss": 0.1745,
3545
+ "step": 464
3546
+ },
3547
+ {
3548
+ "epoch": 0.31461434370771313,
3549
+ "grad_norm": 5.204887390136719,
3550
+ "learning_rate": 1.7325160993765934e-05,
3551
+ "loss": 0.0987,
3552
+ "step": 465
3553
+ },
3554
+ {
3555
+ "epoch": 0.3152909336941813,
3556
+ "grad_norm": 2.864173173904419,
3557
+ "learning_rate": 1.7309060586239117e-05,
3558
+ "loss": 0.1985,
3559
+ "step": 466
3560
+ },
3561
+ {
3562
+ "epoch": 0.3159675236806495,
3563
+ "grad_norm": 3.125213861465454,
3564
+ "learning_rate": 1.7292919397716772e-05,
3565
+ "loss": 0.1482,
3566
+ "step": 467
3567
+ },
3568
+ {
3569
+ "epoch": 0.3166441136671177,
3570
+ "grad_norm": 5.636457920074463,
3571
+ "learning_rate": 1.7276737518258865e-05,
3572
+ "loss": 0.1882,
3573
+ "step": 468
3574
+ },
3575
+ {
3576
+ "epoch": 0.31732070365358594,
3577
+ "grad_norm": 4.034516334533691,
3578
+ "learning_rate": 1.7260515038152393e-05,
3579
+ "loss": 0.2319,
3580
+ "step": 469
3581
+ },
3582
+ {
3583
+ "epoch": 0.31799729364005414,
3584
+ "grad_norm": 2.29288911819458,
3585
+ "learning_rate": 1.7244252047910893e-05,
3586
+ "loss": 0.0806,
3587
+ "step": 470
3588
+ },
3589
+ {
3590
+ "epoch": 0.3186738836265223,
3591
+ "grad_norm": 4.686462879180908,
3592
+ "learning_rate": 1.7227948638273918e-05,
3593
+ "loss": 0.192,
3594
+ "step": 471
3595
+ },
3596
+ {
3597
+ "epoch": 0.3193504736129905,
3598
+ "grad_norm": 3.879487991333008,
3599
+ "learning_rate": 1.7211604900206552e-05,
3600
+ "loss": 0.1904,
3601
+ "step": 472
3602
+ },
3603
+ {
3604
+ "epoch": 0.3200270635994587,
3605
+ "grad_norm": 4.023051738739014,
3606
+ "learning_rate": 1.7195220924898883e-05,
3607
+ "loss": 0.2034,
3608
+ "step": 473
3609
+ },
3610
+ {
3611
+ "epoch": 0.32070365358592695,
3612
+ "grad_norm": 4.105659008026123,
3613
+ "learning_rate": 1.717879680376551e-05,
3614
+ "loss": 0.1803,
3615
+ "step": 474
3616
+ },
3617
+ {
3618
+ "epoch": 0.32138024357239514,
3619
+ "grad_norm": 5.522044658660889,
3620
+ "learning_rate": 1.7162332628445024e-05,
3621
+ "loss": 0.2052,
3622
+ "step": 475
3623
+ },
3624
+ {
3625
+ "epoch": 0.32205683355886333,
3626
+ "grad_norm": 4.441620349884033,
3627
+ "learning_rate": 1.7145828490799497e-05,
3628
+ "loss": 0.1982,
3629
+ "step": 476
3630
+ },
3631
+ {
3632
+ "epoch": 0.3227334235453315,
3633
+ "grad_norm": 2.258070707321167,
3634
+ "learning_rate": 1.7129284482913973e-05,
3635
+ "loss": 0.1493,
3636
+ "step": 477
3637
+ },
3638
+ {
3639
+ "epoch": 0.3234100135317997,
3640
+ "grad_norm": 4.115694522857666,
3641
+ "learning_rate": 1.7112700697095955e-05,
3642
+ "loss": 0.1957,
3643
+ "step": 478
3644
+ },
3645
+ {
3646
+ "epoch": 0.32408660351826796,
3647
+ "grad_norm": 4.366945743560791,
3648
+ "learning_rate": 1.709607722587488e-05,
3649
+ "loss": 0.2066,
3650
+ "step": 479
3651
+ },
3652
+ {
3653
+ "epoch": 0.32476319350473615,
3654
+ "grad_norm": 3.625458240509033,
3655
+ "learning_rate": 1.7079414162001617e-05,
3656
+ "loss": 0.2087,
3657
+ "step": 480
3658
+ },
3659
+ {
3660
+ "epoch": 0.32476319350473615,
3661
+ "eval_accuracy": 0.810091743119266,
3662
+ "eval_f1": 0.5152224824355972,
3663
+ "eval_loss": 0.4003700017929077,
3664
+ "eval_precision": 0.8527131782945736,
3665
+ "eval_recall": 0.3691275167785235,
3666
+ "eval_runtime": 52.4305,
3667
+ "eval_samples_per_second": 5.684,
3668
+ "eval_steps_per_second": 0.191,
3669
+ "step": 480
3670
+ },
3671
+ {
3672
+ "epoch": 0.32543978349120434,
3673
+ "grad_norm": 4.504916667938232,
3674
+ "learning_rate": 1.7062711598447936e-05,
3675
+ "loss": 0.168,
3676
+ "step": 481
3677
+ },
3678
+ {
3679
+ "epoch": 0.3261163734776725,
3680
+ "grad_norm": 3.6613380908966064,
3681
+ "learning_rate": 1.7045969628406013e-05,
3682
+ "loss": 0.1761,
3683
+ "step": 482
3684
+ },
3685
+ {
3686
+ "epoch": 0.3267929634641407,
3687
+ "grad_norm": 5.603222846984863,
3688
+ "learning_rate": 1.7029188345287868e-05,
3689
+ "loss": 0.1618,
3690
+ "step": 483
3691
+ },
3692
+ {
3693
+ "epoch": 0.3274695534506089,
3694
+ "grad_norm": 5.035453796386719,
3695
+ "learning_rate": 1.7012367842724887e-05,
3696
+ "loss": 0.1485,
3697
+ "step": 484
3698
+ },
3699
+ {
3700
+ "epoch": 0.32814614343707715,
3701
+ "grad_norm": 3.6518185138702393,
3702
+ "learning_rate": 1.6995508214567275e-05,
3703
+ "loss": 0.1497,
3704
+ "step": 485
3705
+ },
3706
+ {
3707
+ "epoch": 0.32882273342354534,
3708
+ "grad_norm": 3.421865463256836,
3709
+ "learning_rate": 1.6978609554883544e-05,
3710
+ "loss": 0.1269,
3711
+ "step": 486
3712
+ },
3713
+ {
3714
+ "epoch": 0.32949932341001353,
3715
+ "grad_norm": 1.6450062990188599,
3716
+ "learning_rate": 1.6961671957959967e-05,
3717
+ "loss": 0.0814,
3718
+ "step": 487
3719
+ },
3720
+ {
3721
+ "epoch": 0.3301759133964817,
3722
+ "grad_norm": 3.649200201034546,
3723
+ "learning_rate": 1.6944695518300087e-05,
3724
+ "loss": 0.0999,
3725
+ "step": 488
3726
+ },
3727
+ {
3728
+ "epoch": 0.3308525033829499,
3729
+ "grad_norm": 5.043969631195068,
3730
+ "learning_rate": 1.6927680330624165e-05,
3731
+ "loss": 0.2421,
3732
+ "step": 489
3733
+ },
3734
+ {
3735
+ "epoch": 0.33152909336941816,
3736
+ "grad_norm": 2.5395421981811523,
3737
+ "learning_rate": 1.691062648986865e-05,
3738
+ "loss": 0.1194,
3739
+ "step": 490
3740
+ },
3741
+ {
3742
+ "epoch": 0.33220568335588635,
3743
+ "grad_norm": 3.3575992584228516,
3744
+ "learning_rate": 1.6893534091185658e-05,
3745
+ "loss": 0.1217,
3746
+ "step": 491
3747
+ },
3748
+ {
3749
+ "epoch": 0.33288227334235454,
3750
+ "grad_norm": 3.968233823776245,
3751
+ "learning_rate": 1.6876403229942453e-05,
3752
+ "loss": 0.1636,
3753
+ "step": 492
3754
+ },
3755
+ {
3756
+ "epoch": 0.33355886332882273,
3757
+ "grad_norm": 4.133052825927734,
3758
+ "learning_rate": 1.6859234001720882e-05,
3759
+ "loss": 0.0986,
3760
+ "step": 493
3761
+ },
3762
+ {
3763
+ "epoch": 0.3342354533152909,
3764
+ "grad_norm": 6.324604034423828,
3765
+ "learning_rate": 1.6842026502316874e-05,
3766
+ "loss": 0.1145,
3767
+ "step": 494
3768
+ },
3769
+ {
3770
+ "epoch": 0.3349120433017591,
3771
+ "grad_norm": 4.619051933288574,
3772
+ "learning_rate": 1.682478082773989e-05,
3773
+ "loss": 0.2143,
3774
+ "step": 495
3775
+ },
3776
+ {
3777
+ "epoch": 0.33558863328822736,
3778
+ "grad_norm": 3.0241997241973877,
3779
+ "learning_rate": 1.680749707421238e-05,
3780
+ "loss": 0.1057,
3781
+ "step": 496
3782
+ },
3783
+ {
3784
+ "epoch": 0.33626522327469555,
3785
+ "grad_norm": 1.7503517866134644,
3786
+ "learning_rate": 1.6790175338169277e-05,
3787
+ "loss": 0.0634,
3788
+ "step": 497
3789
+ },
3790
+ {
3791
+ "epoch": 0.33694181326116374,
3792
+ "grad_norm": 4.2864990234375,
3793
+ "learning_rate": 1.6772815716257414e-05,
3794
+ "loss": 0.1524,
3795
+ "step": 498
3796
+ },
3797
+ {
3798
+ "epoch": 0.3376184032476319,
3799
+ "grad_norm": 4.454866886138916,
3800
+ "learning_rate": 1.6755418305335026e-05,
3801
+ "loss": 0.1908,
3802
+ "step": 499
3803
+ },
3804
+ {
3805
+ "epoch": 0.3382949932341001,
3806
+ "grad_norm": 5.34849739074707,
3807
+ "learning_rate": 1.673798320247118e-05,
3808
+ "loss": 0.1857,
3809
+ "step": 500
3810
+ },
3811
+ {
3812
+ "epoch": 0.3382949932341001,
3813
+ "eval_accuracy": 0.7862385321100918,
3814
+ "eval_f1": 0.391644908616188,
3815
+ "eval_loss": 0.5401991605758667,
3816
+ "eval_precision": 0.8823529411764706,
3817
+ "eval_recall": 0.2516778523489933,
3818
+ "eval_runtime": 52.9654,
3819
+ "eval_samples_per_second": 5.626,
3820
+ "eval_steps_per_second": 0.189,
3821
+ "step": 500
3822
  }
3823
  ],
3824
  "logging_steps": 1,
 
3838
  "attributes": {}
3839
  }
3840
  },
3841
+ "total_flos": 1.513912347435991e+17,
3842
  "train_batch_size": 8,
3843
  "trial_name": null,
3844
  "trial_params": null