mtzig commited on
Commit
67de92a
·
verified ·
1 Parent(s): 67e957b

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1b6729e5ec442a789c5534de429bbdd6ad74a54d5ab148d14bedb4c23a8a93c
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f4e1f5f1b463bbf7a2ef4d4af3eb26c728852abec7c6787de609ac33d09a95c
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3904e43332132ff406e596c21aff0aea66547b88a808d0b93bd70992806b298f
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a4eef445695e00c0448191c471e20fc90fac55088f64ebc374faeb67378993
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f709b6b535d116a4c507da712164b2a513083e086ddbad328209690c3c4d840f
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c69c54a9cccf4257b34f1b979d5d539b31b5218794be3960611b7d2d897e994a
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8558d307d1f88b1327b3b2f17ea6c2f66defc8bd2c42208a49d3340fa5c0ff3d
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ed25b5e3653278282ac873c3af5310841446a9e13773d544889afce31024bcb
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5bbd86757f6d90709e53c4bee410c64f46e1e00a7b69516495436b3b19fa4bc
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39de24dbe6f2f3a1b4e34daf6b8e7473a3a38ea40a91769099a82e8f4ebd1d0b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:927e5715a176e36583234bf60605473a6e6ee212a6ae9002c13b90bbdbfa2413
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98fb28188ad6ed7eb480ef9e981c73e6e5e156423f75a203ab35ebe4c0ee7122
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:803c82bb739590694cf28b3c107b771168530366a9df3d0d35a186cd19f75ac8
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8f0f9b2d5716e1b93fa3c48662c835a10bad645dcd88050a14008f0e7777a56
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d742b4fad568eaa03fc2dc329d88c7eed505e69160db904dc9c312d372f1582
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a1a3d5f6e07161b2ee73578e4b8d161f40891058e3f83f813289b5c369f350a
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b534588f047199495c247b01b639a4ec3dd05f1cd5d046f973172fb1baf268d
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c0996004e3280ba2b8c5308142e245e93b9a3d5870de383914360145085a647
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a128bd654cc64f4756f3ca659f2e594694372fb259352d497e7af42c47bdf834
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b99dae60d08ae089466b878474ef297a0b281547cd1097ea214ecee77244b16
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa7e301223140cc7be084a97964d668a32053ebf50ebcb22fe5f6447769a9865
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c3acfb47638e30fe1106672a6fd0db74c9187c94c19467e9d22bd366fbb5472
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f114cbbab4db7dbcae3790f915ff92c38c1bdffe548e2426dc312a0c4340440f
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9723827a668573edbd596a65e0f225b208491adf853284b8da3f11b792077fdc
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a88a4ccf03a33f28d34c318afb486939b571f6edba4a67537bf973b4e8c4ed91
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a124d1e9d8a7b4a76d7294be394802bfec19da05b0209e12c8dc6b8ab250293
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5154639175257731,
5
  "eval_steps": 20,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3059,6 +3059,766 @@
3059
  "eval_samples_per_second": 5.398,
3060
  "eval_steps_per_second": 0.178,
3061
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3062
  }
3063
  ],
3064
  "logging_steps": 1,
@@ -3078,7 +3838,7 @@
3078
  "attributes": {}
3079
  }
3080
  },
3081
- "total_flos": 1.336068185962578e+17,
3082
  "train_batch_size": 8,
3083
  "trial_name": null,
3084
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6443298969072165,
5
  "eval_steps": 20,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3059
  "eval_samples_per_second": 5.398,
3060
  "eval_steps_per_second": 0.178,
3061
  "step": 400
3062
+ },
3063
+ {
3064
+ "epoch": 0.5167525773195877,
3065
+ "grad_norm": 1.0079818964004517,
3066
+ "learning_rate": 1.1167551721113435e-05,
3067
+ "loss": 0.0049,
3068
+ "step": 401
3069
+ },
3070
+ {
3071
+ "epoch": 0.5180412371134021,
3072
+ "grad_norm": 0.5026824474334717,
3073
+ "learning_rate": 1.1122839380745738e-05,
3074
+ "loss": 0.0034,
3075
+ "step": 402
3076
+ },
3077
+ {
3078
+ "epoch": 0.5193298969072165,
3079
+ "grad_norm": 0.251522421836853,
3080
+ "learning_rate": 1.1078104294337806e-05,
3081
+ "loss": 0.0016,
3082
+ "step": 403
3083
+ },
3084
+ {
3085
+ "epoch": 0.520618556701031,
3086
+ "grad_norm": 0.2365703135728836,
3087
+ "learning_rate": 1.1033347368115494e-05,
3088
+ "loss": 0.0008,
3089
+ "step": 404
3090
+ },
3091
+ {
3092
+ "epoch": 0.5219072164948454,
3093
+ "grad_norm": 0.3773232400417328,
3094
+ "learning_rate": 1.0988569508747075e-05,
3095
+ "loss": 0.0018,
3096
+ "step": 405
3097
+ },
3098
+ {
3099
+ "epoch": 0.5231958762886598,
3100
+ "grad_norm": 1.486518383026123,
3101
+ "learning_rate": 1.0943771623324884e-05,
3102
+ "loss": 0.0276,
3103
+ "step": 406
3104
+ },
3105
+ {
3106
+ "epoch": 0.5244845360824743,
3107
+ "grad_norm": 0.07446420192718506,
3108
+ "learning_rate": 1.0898954619346924e-05,
3109
+ "loss": 0.0008,
3110
+ "step": 407
3111
+ },
3112
+ {
3113
+ "epoch": 0.5257731958762887,
3114
+ "grad_norm": 0.12031542509794235,
3115
+ "learning_rate": 1.085411940469851e-05,
3116
+ "loss": 0.0012,
3117
+ "step": 408
3118
+ },
3119
+ {
3120
+ "epoch": 0.5270618556701031,
3121
+ "grad_norm": 0.2832430601119995,
3122
+ "learning_rate": 1.0809266887633849e-05,
3123
+ "loss": 0.0009,
3124
+ "step": 409
3125
+ },
3126
+ {
3127
+ "epoch": 0.5283505154639175,
3128
+ "grad_norm": 0.21104812622070312,
3129
+ "learning_rate": 1.0764397976757658e-05,
3130
+ "loss": 0.0028,
3131
+ "step": 410
3132
+ },
3133
+ {
3134
+ "epoch": 0.529639175257732,
3135
+ "grad_norm": 2.558060884475708,
3136
+ "learning_rate": 1.0719513581006751e-05,
3137
+ "loss": 0.018,
3138
+ "step": 411
3139
+ },
3140
+ {
3141
+ "epoch": 0.5309278350515464,
3142
+ "grad_norm": 0.1067991852760315,
3143
+ "learning_rate": 1.0674614609631634e-05,
3144
+ "loss": 0.0004,
3145
+ "step": 412
3146
+ },
3147
+ {
3148
+ "epoch": 0.5322164948453608,
3149
+ "grad_norm": 1.812943935394287,
3150
+ "learning_rate": 1.062970197217808e-05,
3151
+ "loss": 0.0152,
3152
+ "step": 413
3153
+ },
3154
+ {
3155
+ "epoch": 0.5335051546391752,
3156
+ "grad_norm": 1.7201879024505615,
3157
+ "learning_rate": 1.0584776578468698e-05,
3158
+ "loss": 0.0071,
3159
+ "step": 414
3160
+ },
3161
+ {
3162
+ "epoch": 0.5347938144329897,
3163
+ "grad_norm": 0.17433598637580872,
3164
+ "learning_rate": 1.0539839338584509e-05,
3165
+ "loss": 0.0017,
3166
+ "step": 415
3167
+ },
3168
+ {
3169
+ "epoch": 0.5360824742268041,
3170
+ "grad_norm": 1.8511940240859985,
3171
+ "learning_rate": 1.0494891162846515e-05,
3172
+ "loss": 0.0036,
3173
+ "step": 416
3174
+ },
3175
+ {
3176
+ "epoch": 0.5373711340206185,
3177
+ "grad_norm": 0.22043751180171967,
3178
+ "learning_rate": 1.0449932961797249e-05,
3179
+ "loss": 0.0014,
3180
+ "step": 417
3181
+ },
3182
+ {
3183
+ "epoch": 0.538659793814433,
3184
+ "grad_norm": 4.1709370613098145,
3185
+ "learning_rate": 1.040496564618233e-05,
3186
+ "loss": 0.0426,
3187
+ "step": 418
3188
+ },
3189
+ {
3190
+ "epoch": 0.5399484536082474,
3191
+ "grad_norm": 2.242624282836914,
3192
+ "learning_rate": 1.0359990126932022e-05,
3193
+ "loss": 0.0141,
3194
+ "step": 419
3195
+ },
3196
+ {
3197
+ "epoch": 0.5412371134020618,
3198
+ "grad_norm": 0.7744148969650269,
3199
+ "learning_rate": 1.0315007315142772e-05,
3200
+ "loss": 0.0028,
3201
+ "step": 420
3202
+ },
3203
+ {
3204
+ "epoch": 0.5412371134020618,
3205
+ "eval_accuracy": 0.9965243296921549,
3206
+ "eval_f1": 0.9391304347826087,
3207
+ "eval_loss": 0.01408898364752531,
3208
+ "eval_precision": 0.9310344827586207,
3209
+ "eval_recall": 0.9473684210526315,
3210
+ "eval_runtime": 83.4124,
3211
+ "eval_samples_per_second": 5.455,
3212
+ "eval_steps_per_second": 0.18,
3213
+ "step": 420
3214
+ },
3215
+ {
3216
+ "epoch": 0.5425257731958762,
3217
+ "grad_norm": 2.3163211345672607,
3218
+ "learning_rate": 1.0270018122058753e-05,
3219
+ "loss": 0.0163,
3220
+ "step": 421
3221
+ },
3222
+ {
3223
+ "epoch": 0.5438144329896907,
3224
+ "grad_norm": 0.7905238270759583,
3225
+ "learning_rate": 1.0225023459053416e-05,
3226
+ "loss": 0.0029,
3227
+ "step": 422
3228
+ },
3229
+ {
3230
+ "epoch": 0.5451030927835051,
3231
+ "grad_norm": 1.9856590032577515,
3232
+ "learning_rate": 1.018002423761101e-05,
3233
+ "loss": 0.0499,
3234
+ "step": 423
3235
+ },
3236
+ {
3237
+ "epoch": 0.5463917525773195,
3238
+ "grad_norm": 0.17007720470428467,
3239
+ "learning_rate": 1.0135021369308138e-05,
3240
+ "loss": 0.0009,
3241
+ "step": 424
3242
+ },
3243
+ {
3244
+ "epoch": 0.5476804123711341,
3245
+ "grad_norm": 0.5280592441558838,
3246
+ "learning_rate": 1.0090015765795265e-05,
3247
+ "loss": 0.0031,
3248
+ "step": 425
3249
+ },
3250
+ {
3251
+ "epoch": 0.5489690721649485,
3252
+ "grad_norm": 1.507529616355896,
3253
+ "learning_rate": 1.004500833877828e-05,
3254
+ "loss": 0.0333,
3255
+ "step": 426
3256
+ },
3257
+ {
3258
+ "epoch": 0.5502577319587629,
3259
+ "grad_norm": 3.355781316757202,
3260
+ "learning_rate": 1e-05,
3261
+ "loss": 0.0136,
3262
+ "step": 427
3263
+ },
3264
+ {
3265
+ "epoch": 0.5515463917525774,
3266
+ "grad_norm": 2.624476909637451,
3267
+ "learning_rate": 9.954991661221724e-06,
3268
+ "loss": 0.0115,
3269
+ "step": 428
3270
+ },
3271
+ {
3272
+ "epoch": 0.5528350515463918,
3273
+ "grad_norm": 1.1731654405593872,
3274
+ "learning_rate": 9.909984234204738e-06,
3275
+ "loss": 0.0029,
3276
+ "step": 429
3277
+ },
3278
+ {
3279
+ "epoch": 0.5541237113402062,
3280
+ "grad_norm": 0.35924506187438965,
3281
+ "learning_rate": 9.864978630691865e-06,
3282
+ "loss": 0.0023,
3283
+ "step": 430
3284
+ },
3285
+ {
3286
+ "epoch": 0.5554123711340206,
3287
+ "grad_norm": 0.4166584610939026,
3288
+ "learning_rate": 9.819975762388993e-06,
3289
+ "loss": 0.0023,
3290
+ "step": 431
3291
+ },
3292
+ {
3293
+ "epoch": 0.5567010309278351,
3294
+ "grad_norm": 1.3430147171020508,
3295
+ "learning_rate": 9.774976540946589e-06,
3296
+ "loss": 0.022,
3297
+ "step": 432
3298
+ },
3299
+ {
3300
+ "epoch": 0.5579896907216495,
3301
+ "grad_norm": 1.9612890481948853,
3302
+ "learning_rate": 9.729981877941249e-06,
3303
+ "loss": 0.0268,
3304
+ "step": 433
3305
+ },
3306
+ {
3307
+ "epoch": 0.5592783505154639,
3308
+ "grad_norm": 1.5633459091186523,
3309
+ "learning_rate": 9.684992684857232e-06,
3310
+ "loss": 0.0058,
3311
+ "step": 434
3312
+ },
3313
+ {
3314
+ "epoch": 0.5605670103092784,
3315
+ "grad_norm": 1.8824856281280518,
3316
+ "learning_rate": 9.640009873067981e-06,
3317
+ "loss": 0.0161,
3318
+ "step": 435
3319
+ },
3320
+ {
3321
+ "epoch": 0.5618556701030928,
3322
+ "grad_norm": 0.3390294015407562,
3323
+ "learning_rate": 9.595034353817673e-06,
3324
+ "loss": 0.0011,
3325
+ "step": 436
3326
+ },
3327
+ {
3328
+ "epoch": 0.5631443298969072,
3329
+ "grad_norm": 1.143394947052002,
3330
+ "learning_rate": 9.550067038202756e-06,
3331
+ "loss": 0.0032,
3332
+ "step": 437
3333
+ },
3334
+ {
3335
+ "epoch": 0.5644329896907216,
3336
+ "grad_norm": 1.6857935190200806,
3337
+ "learning_rate": 9.505108837153489e-06,
3338
+ "loss": 0.0161,
3339
+ "step": 438
3340
+ },
3341
+ {
3342
+ "epoch": 0.5657216494845361,
3343
+ "grad_norm": 0.44908684492111206,
3344
+ "learning_rate": 9.460160661415496e-06,
3345
+ "loss": 0.0027,
3346
+ "step": 439
3347
+ },
3348
+ {
3349
+ "epoch": 0.5670103092783505,
3350
+ "grad_norm": 0.46384885907173157,
3351
+ "learning_rate": 9.415223421531308e-06,
3352
+ "loss": 0.0035,
3353
+ "step": 440
3354
+ },
3355
+ {
3356
+ "epoch": 0.5670103092783505,
3357
+ "eval_accuracy": 0.9960278053624627,
3358
+ "eval_f1": 0.9272727272727272,
3359
+ "eval_loss": 0.014668312855064869,
3360
+ "eval_precision": 0.9622641509433962,
3361
+ "eval_recall": 0.8947368421052632,
3362
+ "eval_runtime": 85.42,
3363
+ "eval_samples_per_second": 5.327,
3364
+ "eval_steps_per_second": 0.176,
3365
+ "step": 440
3366
+ },
3367
+ {
3368
+ "epoch": 0.5682989690721649,
3369
+ "grad_norm": 0.303989052772522,
3370
+ "learning_rate": 9.370298027821924e-06,
3371
+ "loss": 0.0011,
3372
+ "step": 441
3373
+ },
3374
+ {
3375
+ "epoch": 0.5695876288659794,
3376
+ "grad_norm": 1.179250717163086,
3377
+ "learning_rate": 9.325385390368367e-06,
3378
+ "loss": 0.0108,
3379
+ "step": 442
3380
+ },
3381
+ {
3382
+ "epoch": 0.5708762886597938,
3383
+ "grad_norm": 0.0955902561545372,
3384
+ "learning_rate": 9.280486418993254e-06,
3385
+ "loss": 0.0008,
3386
+ "step": 443
3387
+ },
3388
+ {
3389
+ "epoch": 0.5721649484536082,
3390
+ "grad_norm": 4.412278175354004,
3391
+ "learning_rate": 9.23560202324235e-06,
3392
+ "loss": 0.0331,
3393
+ "step": 444
3394
+ },
3395
+ {
3396
+ "epoch": 0.5734536082474226,
3397
+ "grad_norm": 0.4738692045211792,
3398
+ "learning_rate": 9.190733112366158e-06,
3399
+ "loss": 0.0024,
3400
+ "step": 445
3401
+ },
3402
+ {
3403
+ "epoch": 0.5747422680412371,
3404
+ "grad_norm": 1.4232240915298462,
3405
+ "learning_rate": 9.145880595301495e-06,
3406
+ "loss": 0.0095,
3407
+ "step": 446
3408
+ },
3409
+ {
3410
+ "epoch": 0.5760309278350515,
3411
+ "grad_norm": 1.06927490234375,
3412
+ "learning_rate": 9.101045380653076e-06,
3413
+ "loss": 0.0038,
3414
+ "step": 447
3415
+ },
3416
+ {
3417
+ "epoch": 0.5773195876288659,
3418
+ "grad_norm": 1.7943761348724365,
3419
+ "learning_rate": 9.056228376675118e-06,
3420
+ "loss": 0.0181,
3421
+ "step": 448
3422
+ },
3423
+ {
3424
+ "epoch": 0.5786082474226805,
3425
+ "grad_norm": 0.47978296875953674,
3426
+ "learning_rate": 9.011430491252924e-06,
3427
+ "loss": 0.0019,
3428
+ "step": 449
3429
+ },
3430
+ {
3431
+ "epoch": 0.5798969072164949,
3432
+ "grad_norm": 2.006948947906494,
3433
+ "learning_rate": 8.966652631884506e-06,
3434
+ "loss": 0.0152,
3435
+ "step": 450
3436
+ },
3437
+ {
3438
+ "epoch": 0.5811855670103093,
3439
+ "grad_norm": 0.14350372552871704,
3440
+ "learning_rate": 8.921895705662194e-06,
3441
+ "loss": 0.0009,
3442
+ "step": 451
3443
+ },
3444
+ {
3445
+ "epoch": 0.5824742268041238,
3446
+ "grad_norm": 1.1150081157684326,
3447
+ "learning_rate": 8.877160619254264e-06,
3448
+ "loss": 0.0036,
3449
+ "step": 452
3450
+ },
3451
+ {
3452
+ "epoch": 0.5837628865979382,
3453
+ "grad_norm": 0.7745972275733948,
3454
+ "learning_rate": 8.832448278886567e-06,
3455
+ "loss": 0.0061,
3456
+ "step": 453
3457
+ },
3458
+ {
3459
+ "epoch": 0.5850515463917526,
3460
+ "grad_norm": 0.15023173391819,
3461
+ "learning_rate": 8.787759590324177e-06,
3462
+ "loss": 0.0008,
3463
+ "step": 454
3464
+ },
3465
+ {
3466
+ "epoch": 0.586340206185567,
3467
+ "grad_norm": 0.15638376772403717,
3468
+ "learning_rate": 8.743095458853034e-06,
3469
+ "loss": 0.0015,
3470
+ "step": 455
3471
+ },
3472
+ {
3473
+ "epoch": 0.5876288659793815,
3474
+ "grad_norm": 0.8829141855239868,
3475
+ "learning_rate": 8.698456789261617e-06,
3476
+ "loss": 0.0047,
3477
+ "step": 456
3478
+ },
3479
+ {
3480
+ "epoch": 0.5889175257731959,
3481
+ "grad_norm": 0.13397619128227234,
3482
+ "learning_rate": 8.653844485822603e-06,
3483
+ "loss": 0.0008,
3484
+ "step": 457
3485
+ },
3486
+ {
3487
+ "epoch": 0.5902061855670103,
3488
+ "grad_norm": 0.54257732629776,
3489
+ "learning_rate": 8.609259452274559e-06,
3490
+ "loss": 0.0015,
3491
+ "step": 458
3492
+ },
3493
+ {
3494
+ "epoch": 0.5914948453608248,
3495
+ "grad_norm": 3.111884832382202,
3496
+ "learning_rate": 8.56470259180362e-06,
3497
+ "loss": 0.0354,
3498
+ "step": 459
3499
+ },
3500
+ {
3501
+ "epoch": 0.5927835051546392,
3502
+ "grad_norm": 0.41738268733024597,
3503
+ "learning_rate": 8.52017480702521e-06,
3504
+ "loss": 0.0016,
3505
+ "step": 460
3506
+ },
3507
+ {
3508
+ "epoch": 0.5927835051546392,
3509
+ "eval_accuracy": 0.9965243296921549,
3510
+ "eval_f1": 0.9357798165137615,
3511
+ "eval_loss": 0.01587463542819023,
3512
+ "eval_precision": 0.9807692307692307,
3513
+ "eval_recall": 0.8947368421052632,
3514
+ "eval_runtime": 84.0121,
3515
+ "eval_samples_per_second": 5.416,
3516
+ "eval_steps_per_second": 0.179,
3517
+ "step": 460
3518
+ },
3519
+ {
3520
+ "epoch": 0.5940721649484536,
3521
+ "grad_norm": 1.6076654195785522,
3522
+ "learning_rate": 8.475676999965747e-06,
3523
+ "loss": 0.0089,
3524
+ "step": 461
3525
+ },
3526
+ {
3527
+ "epoch": 0.595360824742268,
3528
+ "grad_norm": 0.16697372496128082,
3529
+ "learning_rate": 8.431210072044371e-06,
3530
+ "loss": 0.0007,
3531
+ "step": 462
3532
+ },
3533
+ {
3534
+ "epoch": 0.5966494845360825,
3535
+ "grad_norm": 1.6268433332443237,
3536
+ "learning_rate": 8.386774924054686e-06,
3537
+ "loss": 0.0048,
3538
+ "step": 463
3539
+ },
3540
+ {
3541
+ "epoch": 0.5979381443298969,
3542
+ "grad_norm": 1.5314583778381348,
3543
+ "learning_rate": 8.342372456146512e-06,
3544
+ "loss": 0.0074,
3545
+ "step": 464
3546
+ },
3547
+ {
3548
+ "epoch": 0.5992268041237113,
3549
+ "grad_norm": 2.308037519454956,
3550
+ "learning_rate": 8.29800356780764e-06,
3551
+ "loss": 0.0105,
3552
+ "step": 465
3553
+ },
3554
+ {
3555
+ "epoch": 0.6005154639175257,
3556
+ "grad_norm": 0.43206554651260376,
3557
+ "learning_rate": 8.253669157845632e-06,
3558
+ "loss": 0.0025,
3559
+ "step": 466
3560
+ },
3561
+ {
3562
+ "epoch": 0.6018041237113402,
3563
+ "grad_norm": 2.582341194152832,
3564
+ "learning_rate": 8.20937012436959e-06,
3565
+ "loss": 0.0118,
3566
+ "step": 467
3567
+ },
3568
+ {
3569
+ "epoch": 0.6030927835051546,
3570
+ "grad_norm": 0.5174412727355957,
3571
+ "learning_rate": 8.165107364771979e-06,
3572
+ "loss": 0.0015,
3573
+ "step": 468
3574
+ },
3575
+ {
3576
+ "epoch": 0.604381443298969,
3577
+ "grad_norm": 1.751904845237732,
3578
+ "learning_rate": 8.12088177571044e-06,
3579
+ "loss": 0.0129,
3580
+ "step": 469
3581
+ },
3582
+ {
3583
+ "epoch": 0.6056701030927835,
3584
+ "grad_norm": 0.601997435092926,
3585
+ "learning_rate": 8.076694253089632e-06,
3586
+ "loss": 0.0025,
3587
+ "step": 470
3588
+ },
3589
+ {
3590
+ "epoch": 0.6069587628865979,
3591
+ "grad_norm": 0.21518899500370026,
3592
+ "learning_rate": 8.032545692043068e-06,
3593
+ "loss": 0.0007,
3594
+ "step": 471
3595
+ },
3596
+ {
3597
+ "epoch": 0.6082474226804123,
3598
+ "grad_norm": 0.23358668386936188,
3599
+ "learning_rate": 7.988436986915005e-06,
3600
+ "loss": 0.0016,
3601
+ "step": 472
3602
+ },
3603
+ {
3604
+ "epoch": 0.6095360824742269,
3605
+ "grad_norm": 1.7518439292907715,
3606
+ "learning_rate": 7.944369031242307e-06,
3607
+ "loss": 0.0363,
3608
+ "step": 473
3609
+ },
3610
+ {
3611
+ "epoch": 0.6108247422680413,
3612
+ "grad_norm": 0.2232217937707901,
3613
+ "learning_rate": 7.900342717736354e-06,
3614
+ "loss": 0.0022,
3615
+ "step": 474
3616
+ },
3617
+ {
3618
+ "epoch": 0.6121134020618557,
3619
+ "grad_norm": 0.07207631319761276,
3620
+ "learning_rate": 7.856358938264953e-06,
3621
+ "loss": 0.0004,
3622
+ "step": 475
3623
+ },
3624
+ {
3625
+ "epoch": 0.6134020618556701,
3626
+ "grad_norm": 3.338632822036743,
3627
+ "learning_rate": 7.812418583834282e-06,
3628
+ "loss": 0.0105,
3629
+ "step": 476
3630
+ },
3631
+ {
3632
+ "epoch": 0.6146907216494846,
3633
+ "grad_norm": 0.6013636589050293,
3634
+ "learning_rate": 7.768522544570818e-06,
3635
+ "loss": 0.0021,
3636
+ "step": 477
3637
+ },
3638
+ {
3639
+ "epoch": 0.615979381443299,
3640
+ "grad_norm": 1.2868963479995728,
3641
+ "learning_rate": 7.724671709703328e-06,
3642
+ "loss": 0.0131,
3643
+ "step": 478
3644
+ },
3645
+ {
3646
+ "epoch": 0.6172680412371134,
3647
+ "grad_norm": 1.118053913116455,
3648
+ "learning_rate": 7.680866967544841e-06,
3649
+ "loss": 0.0023,
3650
+ "step": 479
3651
+ },
3652
+ {
3653
+ "epoch": 0.6185567010309279,
3654
+ "grad_norm": 3.8270020484924316,
3655
+ "learning_rate": 7.637109205474665e-06,
3656
+ "loss": 0.0262,
3657
+ "step": 480
3658
+ },
3659
+ {
3660
+ "epoch": 0.6185567010309279,
3661
+ "eval_accuracy": 0.997020854021847,
3662
+ "eval_f1": 0.9473684210526315,
3663
+ "eval_loss": 0.01405768096446991,
3664
+ "eval_precision": 0.9473684210526315,
3665
+ "eval_recall": 0.9473684210526315,
3666
+ "eval_runtime": 83.4682,
3667
+ "eval_samples_per_second": 5.451,
3668
+ "eval_steps_per_second": 0.18,
3669
+ "step": 480
3670
+ },
3671
+ {
3672
+ "epoch": 0.6198453608247423,
3673
+ "grad_norm": 4.576198577880859,
3674
+ "learning_rate": 7.5933993099203936e-06,
3675
+ "loss": 0.0198,
3676
+ "step": 481
3677
+ },
3678
+ {
3679
+ "epoch": 0.6211340206185567,
3680
+ "grad_norm": 1.4137742519378662,
3681
+ "learning_rate": 7.5497381663399716e-06,
3682
+ "loss": 0.012,
3683
+ "step": 482
3684
+ },
3685
+ {
3686
+ "epoch": 0.6224226804123711,
3687
+ "grad_norm": 0.7309709787368774,
3688
+ "learning_rate": 7.506126659203733e-06,
3689
+ "loss": 0.0052,
3690
+ "step": 483
3691
+ },
3692
+ {
3693
+ "epoch": 0.6237113402061856,
3694
+ "grad_norm": 1.414675235748291,
3695
+ "learning_rate": 7.462565671976504e-06,
3696
+ "loss": 0.005,
3697
+ "step": 484
3698
+ },
3699
+ {
3700
+ "epoch": 0.625,
3701
+ "grad_norm": 0.5044420957565308,
3702
+ "learning_rate": 7.419056087099695e-06,
3703
+ "loss": 0.0034,
3704
+ "step": 485
3705
+ },
3706
+ {
3707
+ "epoch": 0.6262886597938144,
3708
+ "grad_norm": 0.4570430815219879,
3709
+ "learning_rate": 7.375598785973429e-06,
3710
+ "loss": 0.0022,
3711
+ "step": 486
3712
+ },
3713
+ {
3714
+ "epoch": 0.6275773195876289,
3715
+ "grad_norm": 1.932029128074646,
3716
+ "learning_rate": 7.332194648938689e-06,
3717
+ "loss": 0.0102,
3718
+ "step": 487
3719
+ },
3720
+ {
3721
+ "epoch": 0.6288659793814433,
3722
+ "grad_norm": 1.0421007871627808,
3723
+ "learning_rate": 7.288844555259471e-06,
3724
+ "loss": 0.0039,
3725
+ "step": 488
3726
+ },
3727
+ {
3728
+ "epoch": 0.6301546391752577,
3729
+ "grad_norm": 0.5654887557029724,
3730
+ "learning_rate": 7.245549383104993e-06,
3731
+ "loss": 0.0022,
3732
+ "step": 489
3733
+ },
3734
+ {
3735
+ "epoch": 0.6314432989690721,
3736
+ "grad_norm": 0.5495522022247314,
3737
+ "learning_rate": 7.202310009531886e-06,
3738
+ "loss": 0.0036,
3739
+ "step": 490
3740
+ },
3741
+ {
3742
+ "epoch": 0.6327319587628866,
3743
+ "grad_norm": 2.5807251930236816,
3744
+ "learning_rate": 7.159127310466441e-06,
3745
+ "loss": 0.012,
3746
+ "step": 491
3747
+ },
3748
+ {
3749
+ "epoch": 0.634020618556701,
3750
+ "grad_norm": 0.829201877117157,
3751
+ "learning_rate": 7.116002160686851e-06,
3752
+ "loss": 0.006,
3753
+ "step": 492
3754
+ },
3755
+ {
3756
+ "epoch": 0.6353092783505154,
3757
+ "grad_norm": 0.9482012391090393,
3758
+ "learning_rate": 7.072935433805508e-06,
3759
+ "loss": 0.0089,
3760
+ "step": 493
3761
+ },
3762
+ {
3763
+ "epoch": 0.6365979381443299,
3764
+ "grad_norm": 1.9564292430877686,
3765
+ "learning_rate": 7.0299280022512875e-06,
3766
+ "loss": 0.0109,
3767
+ "step": 494
3768
+ },
3769
+ {
3770
+ "epoch": 0.6378865979381443,
3771
+ "grad_norm": 0.7540721297264099,
3772
+ "learning_rate": 6.986980737251889e-06,
3773
+ "loss": 0.0022,
3774
+ "step": 495
3775
+ },
3776
+ {
3777
+ "epoch": 0.6391752577319587,
3778
+ "grad_norm": 0.34916985034942627,
3779
+ "learning_rate": 6.944094508816182e-06,
3780
+ "loss": 0.004,
3781
+ "step": 496
3782
+ },
3783
+ {
3784
+ "epoch": 0.6404639175257731,
3785
+ "grad_norm": 0.06619101762771606,
3786
+ "learning_rate": 6.9012701857165755e-06,
3787
+ "loss": 0.0002,
3788
+ "step": 497
3789
+ },
3790
+ {
3791
+ "epoch": 0.6417525773195877,
3792
+ "grad_norm": 1.7265164852142334,
3793
+ "learning_rate": 6.858508635471428e-06,
3794
+ "loss": 0.0038,
3795
+ "step": 498
3796
+ },
3797
+ {
3798
+ "epoch": 0.6430412371134021,
3799
+ "grad_norm": 0.7744361758232117,
3800
+ "learning_rate": 6.815810724327469e-06,
3801
+ "loss": 0.0032,
3802
+ "step": 499
3803
+ },
3804
+ {
3805
+ "epoch": 0.6443298969072165,
3806
+ "grad_norm": 3.491551637649536,
3807
+ "learning_rate": 6.773177317242257e-06,
3808
+ "loss": 0.0294,
3809
+ "step": 500
3810
+ },
3811
+ {
3812
+ "epoch": 0.6443298969072165,
3813
+ "eval_accuracy": 0.997020854021847,
3814
+ "eval_f1": 0.9454545454545454,
3815
+ "eval_loss": 0.016488004475831985,
3816
+ "eval_precision": 0.9811320754716981,
3817
+ "eval_recall": 0.9122807017543859,
3818
+ "eval_runtime": 83.472,
3819
+ "eval_samples_per_second": 5.451,
3820
+ "eval_steps_per_second": 0.18,
3821
+ "step": 500
3822
  }
3823
  ],
3824
  "logging_steps": 1,
 
3838
  "attributes": {}
3839
  }
3840
  },
3841
+ "total_flos": 1.6687462625574912e+17,
3842
  "train_batch_size": 8,
3843
  "trial_name": null,
3844
  "trial_params": null