antonpolishko commited on
Commit
c6d8dd1
·
verified ·
1 Parent(s): 50d51bb

Training in progress, epoch 3, checkpoint

Browse files
last-checkpoint/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c060b8e42388903f369391791eb7945956545d305e3f2ec2bd4aee7d43c9627
3
  size 4903351912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44918dfe39ee4b6d085b2fd0ede145b5142b915f183672ae9f4f3a923ec9ace2
3
  size 4903351912
last-checkpoint/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:230396faf4f924420f20c75b719bae88e31df325fc35e94f9fa673dc6bdd9774
3
  size 4947570872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1906b644e20f7418e0ff8861b4c47c96c8329cbafa385d6221bb549bd49078a6
3
  size 4947570872
last-checkpoint/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b28cd530d6ef53c146ea079e7092af33e6f1bc2e392efdc66ae16e2eae76a988
3
  size 4962221464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f784163add06741c32f71bfd53242b55e4499fad506d9059b5bffcf9bf06714a
3
  size 4962221464
last-checkpoint/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac1e152be2252f6499925b2928cee4c632ff7777a037b95b6e31767da1531e4a
3
  size 3670322200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c8c9b8ecede7d332419cd9140fecb55d03b3de4fdcc2b353e9acb041598999f
3
  size 3670322200
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0117f33fed181aa8ea76fa63db6fac08f4883c66b863ad19ae2b40826aef632b
3
- size 36967230034
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69d47d5b8f28bba9ccfb52093efcff4d4e0d7768a82f2cf566cfc4e6be74db24
3
+ size 33781260608
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c461c9d337dfc684e9352ec72bfa344e2f5d377f7cfc4475de9acae294dca89
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69ec6e3926fa071bede113523efa3dc6e630c3c7958c54a9ca321cf4d62ed145
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6127ee4f0c13500ec5038fce65af8f7beec63c137c7d4b7c157aa6303cf5879
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da01d1c5eb2cc3a323f97c1f590d13ccfac2a4c5b1479bd378b4e643304f5a4f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49a3f04d76c0d3acc7d3dd95a04215f368f35a451ae8cba8a2fdba38cda9ca0a
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df7d2c9825dba80cb544920f8cc0c72122f96514e6cd259052a8765b034393e2
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a20a42d44ff48cc162224010190e898fe28598ddad8cd1896d330a3bb1d8ec3
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18ac0dc4f09f25179860561fcea7c5c8f997aabdc46a170665f9dc5a72bc27c6
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a16fcb5411ff961b47eff7378d85105fe9837e0492d19ea5ce3b7c4b77aa3b6
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82d7549e2049af0aa601c2cb8b24328e3b8070b31576a769b0483a09a01779f5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edc9a50bea580864bcd8cfa624a3000d833ef59c0757429ea149c48330d1c567
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.0,
5
  "eval_steps": 300,
6
- "global_step": 732,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1052,6 +1052,525 @@
1052
  "learning_rate": 5.066342480105459e-07,
1053
  "loss": 1.5082,
1054
  "step": 730
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1055
  }
1056
  ],
1057
  "logging_steps": 5,
@@ -1066,12 +1585,12 @@
1066
  "should_evaluate": false,
1067
  "should_log": false,
1068
  "should_save": true,
1069
- "should_training_stop": false
1070
  },
1071
  "attributes": {}
1072
  }
1073
  },
1074
- "total_flos": 4.791978551408591e+18,
1075
  "train_batch_size": 8,
1076
  "trial_name": null,
1077
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 300,
6
+ "global_step": 1098,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1052
  "learning_rate": 5.066342480105459e-07,
1053
  "loss": 1.5082,
1054
  "step": 730
1055
+ },
1056
+ {
1057
+ "epoch": 2.0081967213114753,
1058
+ "grad_norm": 1.6484375,
1059
+ "learning_rate": 4.942189840051287e-07,
1060
+ "loss": 1.5054,
1061
+ "step": 735
1062
+ },
1063
+ {
1064
+ "epoch": 2.021857923497268,
1065
+ "grad_norm": 1.84375,
1066
+ "learning_rate": 4.819076099973152e-07,
1067
+ "loss": 1.4999,
1068
+ "step": 740
1069
+ },
1070
+ {
1071
+ "epoch": 2.0355191256830603,
1072
+ "grad_norm": 1.6171875,
1073
+ "learning_rate": 4.697026548060528e-07,
1074
+ "loss": 1.5094,
1075
+ "step": 745
1076
+ },
1077
+ {
1078
+ "epoch": 2.0491803278688523,
1079
+ "grad_norm": 1.703125,
1080
+ "learning_rate": 4.5760662539132077e-07,
1081
+ "loss": 1.5065,
1082
+ "step": 750
1083
+ },
1084
+ {
1085
+ "epoch": 2.0628415300546448,
1086
+ "grad_norm": 1.921875,
1087
+ "learning_rate": 4.4562200633918943e-07,
1088
+ "loss": 1.5013,
1089
+ "step": 755
1090
+ },
1091
+ {
1092
+ "epoch": 2.0765027322404372,
1093
+ "grad_norm": 1.625,
1094
+ "learning_rate": 4.337512593514728e-07,
1095
+ "loss": 1.5041,
1096
+ "step": 760
1097
+ },
1098
+ {
1099
+ "epoch": 2.0901639344262297,
1100
+ "grad_norm": 1.6640625,
1101
+ "learning_rate": 4.2199682274008255e-07,
1102
+ "loss": 1.5083,
1103
+ "step": 765
1104
+ },
1105
+ {
1106
+ "epoch": 2.1038251366120218,
1107
+ "grad_norm": 1.6171875,
1108
+ "learning_rate": 4.103611109261872e-07,
1109
+ "loss": 1.4982,
1110
+ "step": 770
1111
+ },
1112
+ {
1113
+ "epoch": 2.1174863387978142,
1114
+ "grad_norm": 1.6328125,
1115
+ "learning_rate": 3.9884651394427625e-07,
1116
+ "loss": 1.5051,
1117
+ "step": 775
1118
+ },
1119
+ {
1120
+ "epoch": 2.1311475409836067,
1121
+ "grad_norm": 1.671875,
1122
+ "learning_rate": 3.8745539695123577e-07,
1123
+ "loss": 1.4924,
1124
+ "step": 780
1125
+ },
1126
+ {
1127
+ "epoch": 2.1448087431693987,
1128
+ "grad_norm": 1.65625,
1129
+ "learning_rate": 3.761900997405332e-07,
1130
+ "loss": 1.5113,
1131
+ "step": 785
1132
+ },
1133
+ {
1134
+ "epoch": 2.158469945355191,
1135
+ "grad_norm": 1.6484375,
1136
+ "learning_rate": 3.6505293626161127e-07,
1137
+ "loss": 1.5058,
1138
+ "step": 790
1139
+ },
1140
+ {
1141
+ "epoch": 2.1721311475409837,
1142
+ "grad_norm": 1.625,
1143
+ "learning_rate": 3.5404619414459147e-07,
1144
+ "loss": 1.5019,
1145
+ "step": 795
1146
+ },
1147
+ {
1148
+ "epoch": 2.185792349726776,
1149
+ "grad_norm": 1.640625,
1150
+ "learning_rate": 3.4317213423038384e-07,
1151
+ "loss": 1.4948,
1152
+ "step": 800
1153
+ },
1154
+ {
1155
+ "epoch": 2.199453551912568,
1156
+ "grad_norm": 1.625,
1157
+ "learning_rate": 3.32432990106297e-07,
1158
+ "loss": 1.5092,
1159
+ "step": 805
1160
+ },
1161
+ {
1162
+ "epoch": 2.2131147540983607,
1163
+ "grad_norm": 1.734375,
1164
+ "learning_rate": 3.2183096764724914e-07,
1165
+ "loss": 1.5048,
1166
+ "step": 810
1167
+ },
1168
+ {
1169
+ "epoch": 2.226775956284153,
1170
+ "grad_norm": 1.8984375,
1171
+ "learning_rate": 3.1136824456267006e-07,
1172
+ "loss": 1.5005,
1173
+ "step": 815
1174
+ },
1175
+ {
1176
+ "epoch": 2.240437158469945,
1177
+ "grad_norm": 1.7421875,
1178
+ "learning_rate": 3.01046969949188e-07,
1179
+ "loss": 1.5097,
1180
+ "step": 820
1181
+ },
1182
+ {
1183
+ "epoch": 2.2540983606557377,
1184
+ "grad_norm": 1.6640625,
1185
+ "learning_rate": 2.908692638491945e-07,
1186
+ "loss": 1.5054,
1187
+ "step": 825
1188
+ },
1189
+ {
1190
+ "epoch": 2.26775956284153,
1191
+ "grad_norm": 1.65625,
1192
+ "learning_rate": 2.80837216815378e-07,
1193
+ "loss": 1.5131,
1194
+ "step": 830
1195
+ },
1196
+ {
1197
+ "epoch": 2.281420765027322,
1198
+ "grad_norm": 1.671875,
1199
+ "learning_rate": 2.7095288948131114e-07,
1200
+ "loss": 1.4999,
1201
+ "step": 835
1202
+ },
1203
+ {
1204
+ "epoch": 2.2950819672131146,
1205
+ "grad_norm": 1.6328125,
1206
+ "learning_rate": 2.6121831213818826e-07,
1207
+ "loss": 1.4989,
1208
+ "step": 840
1209
+ },
1210
+ {
1211
+ "epoch": 2.308743169398907,
1212
+ "grad_norm": 1.6953125,
1213
+ "learning_rate": 2.51635484317793e-07,
1214
+ "loss": 1.495,
1215
+ "step": 845
1216
+ },
1217
+ {
1218
+ "epoch": 2.3224043715846996,
1219
+ "grad_norm": 1.703125,
1220
+ "learning_rate": 2.4220637438178313e-07,
1221
+ "loss": 1.5125,
1222
+ "step": 850
1223
+ },
1224
+ {
1225
+ "epoch": 2.3360655737704916,
1226
+ "grad_norm": 1.703125,
1227
+ "learning_rate": 2.3293291911738078e-07,
1228
+ "loss": 1.5092,
1229
+ "step": 855
1230
+ },
1231
+ {
1232
+ "epoch": 2.349726775956284,
1233
+ "grad_norm": 1.6640625,
1234
+ "learning_rate": 2.2381702333954433e-07,
1235
+ "loss": 1.4965,
1236
+ "step": 860
1237
+ },
1238
+ {
1239
+ "epoch": 2.3633879781420766,
1240
+ "grad_norm": 1.6328125,
1241
+ "learning_rate": 2.148605594997115e-07,
1242
+ "loss": 1.497,
1243
+ "step": 865
1244
+ },
1245
+ {
1246
+ "epoch": 2.3770491803278686,
1247
+ "grad_norm": 1.765625,
1248
+ "learning_rate": 2.0606536730118763e-07,
1249
+ "loss": 1.5081,
1250
+ "step": 870
1251
+ },
1252
+ {
1253
+ "epoch": 2.390710382513661,
1254
+ "grad_norm": 1.625,
1255
+ "learning_rate": 1.9743325332126105e-07,
1256
+ "loss": 1.5091,
1257
+ "step": 875
1258
+ },
1259
+ {
1260
+ "epoch": 2.4043715846994536,
1261
+ "grad_norm": 1.6171875,
1262
+ "learning_rate": 1.8896599064012298e-07,
1263
+ "loss": 1.5045,
1264
+ "step": 880
1265
+ },
1266
+ {
1267
+ "epoch": 2.418032786885246,
1268
+ "grad_norm": 1.640625,
1269
+ "learning_rate": 1.8066531847666888e-07,
1270
+ "loss": 1.5008,
1271
+ "step": 885
1272
+ },
1273
+ {
1274
+ "epoch": 2.431693989071038,
1275
+ "grad_norm": 1.65625,
1276
+ "learning_rate": 1.7253294183125222e-07,
1277
+ "loss": 1.511,
1278
+ "step": 890
1279
+ },
1280
+ {
1281
+ "epoch": 2.4453551912568305,
1282
+ "grad_norm": 1.6328125,
1283
+ "learning_rate": 1.645705311354697e-07,
1284
+ "loss": 1.4998,
1285
+ "step": 895
1286
+ },
1287
+ {
1288
+ "epoch": 2.459016393442623,
1289
+ "grad_norm": 1.65625,
1290
+ "learning_rate": 1.5677972190904621e-07,
1291
+ "loss": 1.5021,
1292
+ "step": 900
1293
+ },
1294
+ {
1295
+ "epoch": 2.459016393442623,
1296
+ "eval_loss": 1.5163270235061646,
1297
+ "eval_runtime": 19.6134,
1298
+ "eval_samples_per_second": 36.098,
1299
+ "eval_steps_per_second": 1.173,
1300
+ "step": 900
1301
+ },
1302
+ {
1303
+ "epoch": 2.4726775956284155,
1304
+ "grad_norm": 1.6328125,
1305
+ "learning_rate": 1.4916211442389048e-07,
1306
+ "loss": 1.502,
1307
+ "step": 905
1308
+ },
1309
+ {
1310
+ "epoch": 2.4863387978142075,
1311
+ "grad_norm": 1.625,
1312
+ "learning_rate": 1.4171927337539104e-07,
1313
+ "loss": 1.5012,
1314
+ "step": 910
1315
+ },
1316
+ {
1317
+ "epoch": 2.5,
1318
+ "grad_norm": 1.640625,
1319
+ "learning_rate": 1.344527275610202e-07,
1320
+ "loss": 1.5019,
1321
+ "step": 915
1322
+ },
1323
+ {
1324
+ "epoch": 2.5136612021857925,
1325
+ "grad_norm": 1.640625,
1326
+ "learning_rate": 1.273639695663108e-07,
1327
+ "loss": 1.5085,
1328
+ "step": 920
1329
+ },
1330
+ {
1331
+ "epoch": 2.527322404371585,
1332
+ "grad_norm": 1.75,
1333
+ "learning_rate": 1.204544554582716e-07,
1334
+ "loss": 1.4973,
1335
+ "step": 925
1336
+ },
1337
+ {
1338
+ "epoch": 2.540983606557377,
1339
+ "grad_norm": 1.671875,
1340
+ "learning_rate": 1.1372560448630375e-07,
1341
+ "loss": 1.5037,
1342
+ "step": 930
1343
+ },
1344
+ {
1345
+ "epoch": 2.5546448087431695,
1346
+ "grad_norm": 1.65625,
1347
+ "learning_rate": 1.0717879879068004e-07,
1348
+ "loss": 1.5005,
1349
+ "step": 935
1350
+ },
1351
+ {
1352
+ "epoch": 2.5683060109289615,
1353
+ "grad_norm": 1.6796875,
1354
+ "learning_rate": 1.0081538311864568e-07,
1355
+ "loss": 1.5024,
1356
+ "step": 940
1357
+ },
1358
+ {
1359
+ "epoch": 2.581967213114754,
1360
+ "grad_norm": 1.625,
1361
+ "learning_rate": 9.463666454820118e-08,
1362
+ "loss": 1.4988,
1363
+ "step": 945
1364
+ },
1365
+ {
1366
+ "epoch": 2.5956284153005464,
1367
+ "grad_norm": 1.625,
1368
+ "learning_rate": 8.864391221962064e-08,
1369
+ "loss": 1.5053,
1370
+ "step": 950
1371
+ },
1372
+ {
1373
+ "epoch": 2.609289617486339,
1374
+ "grad_norm": 1.6328125,
1375
+ "learning_rate": 8.28383570747655e-08,
1376
+ "loss": 1.5044,
1377
+ "step": 955
1378
+ },
1379
+ {
1380
+ "epoch": 2.6229508196721314,
1381
+ "grad_norm": 1.609375,
1382
+ "learning_rate": 7.722119160424112e-08,
1383
+ "loss": 1.4995,
1384
+ "step": 960
1385
+ },
1386
+ {
1387
+ "epoch": 2.6366120218579234,
1388
+ "grad_norm": 1.640625,
1389
+ "learning_rate": 7.179356960245409e-08,
1390
+ "loss": 1.5122,
1391
+ "step": 965
1392
+ },
1393
+ {
1394
+ "epoch": 2.650273224043716,
1395
+ "grad_norm": 1.7890625,
1396
+ "learning_rate": 6.655660593061718e-08,
1397
+ "loss": 1.5054,
1398
+ "step": 970
1399
+ },
1400
+ {
1401
+ "epoch": 2.663934426229508,
1402
+ "grad_norm": 1.6484375,
1403
+ "learning_rate": 6.151137628775049e-08,
1404
+ "loss": 1.5108,
1405
+ "step": 975
1406
+ },
1407
+ {
1408
+ "epoch": 2.6775956284153004,
1409
+ "grad_norm": 1.609375,
1410
+ "learning_rate": 5.665891698972769e-08,
1411
+ "loss": 1.5003,
1412
+ "step": 980
1413
+ },
1414
+ {
1415
+ "epoch": 2.691256830601093,
1416
+ "grad_norm": 1.640625,
1417
+ "learning_rate": 5.200022475641153e-08,
1418
+ "loss": 1.5015,
1419
+ "step": 985
1420
+ },
1421
+ {
1422
+ "epoch": 2.7049180327868854,
1423
+ "grad_norm": 1.609375,
1424
+ "learning_rate": 4.75362565069225e-08,
1425
+ "loss": 1.5002,
1426
+ "step": 990
1427
+ },
1428
+ {
1429
+ "epoch": 2.718579234972678,
1430
+ "grad_norm": 1.640625,
1431
+ "learning_rate": 4.326792916308242e-08,
1432
+ "loss": 1.5029,
1433
+ "step": 995
1434
+ },
1435
+ {
1436
+ "epoch": 2.73224043715847,
1437
+ "grad_norm": 1.7421875,
1438
+ "learning_rate": 3.919611946107493e-08,
1439
+ "loss": 1.5068,
1440
+ "step": 1000
1441
+ },
1442
+ {
1443
+ "epoch": 2.7459016393442623,
1444
+ "grad_norm": 1.921875,
1445
+ "learning_rate": 3.532166377135814e-08,
1446
+ "loss": 1.4961,
1447
+ "step": 1005
1448
+ },
1449
+ {
1450
+ "epoch": 2.7595628415300544,
1451
+ "grad_norm": 1.6953125,
1452
+ "learning_rate": 3.164535792687095e-08,
1453
+ "loss": 1.5,
1454
+ "step": 1010
1455
+ },
1456
+ {
1457
+ "epoch": 2.773224043715847,
1458
+ "grad_norm": 1.6171875,
1459
+ "learning_rate": 2.8167957059564095e-08,
1460
+ "loss": 1.5035,
1461
+ "step": 1015
1462
+ },
1463
+ {
1464
+ "epoch": 2.7868852459016393,
1465
+ "grad_norm": 1.6484375,
1466
+ "learning_rate": 2.4890175445293147e-08,
1467
+ "loss": 1.5007,
1468
+ "step": 1020
1469
+ },
1470
+ {
1471
+ "epoch": 2.800546448087432,
1472
+ "grad_norm": 1.6328125,
1473
+ "learning_rate": 2.1812686357101428e-08,
1474
+ "loss": 1.505,
1475
+ "step": 1025
1476
+ },
1477
+ {
1478
+ "epoch": 2.8142076502732243,
1479
+ "grad_norm": 1.6484375,
1480
+ "learning_rate": 1.8936121926927507e-08,
1481
+ "loss": 1.5066,
1482
+ "step": 1030
1483
+ },
1484
+ {
1485
+ "epoch": 2.8278688524590163,
1486
+ "grad_norm": 1.640625,
1487
+ "learning_rate": 1.6261073015761072e-08,
1488
+ "loss": 1.502,
1489
+ "step": 1035
1490
+ },
1491
+ {
1492
+ "epoch": 2.841530054644809,
1493
+ "grad_norm": 1.75,
1494
+ "learning_rate": 1.3788089092277688e-08,
1495
+ "loss": 1.5016,
1496
+ "step": 1040
1497
+ },
1498
+ {
1499
+ "epoch": 2.855191256830601,
1500
+ "grad_norm": 1.65625,
1501
+ "learning_rate": 1.1517678119975061e-08,
1502
+ "loss": 1.5024,
1503
+ "step": 1045
1504
+ },
1505
+ {
1506
+ "epoch": 2.8688524590163933,
1507
+ "grad_norm": 1.609375,
1508
+ "learning_rate": 9.450306452834178e-09,
1509
+ "loss": 1.5,
1510
+ "step": 1050
1511
+ },
1512
+ {
1513
+ "epoch": 2.8825136612021858,
1514
+ "grad_norm": 1.734375,
1515
+ "learning_rate": 7.586398739528932e-09,
1516
+ "loss": 1.4989,
1517
+ "step": 1055
1518
+ },
1519
+ {
1520
+ "epoch": 2.8961748633879782,
1521
+ "grad_norm": 1.7890625,
1522
+ "learning_rate": 5.926337836199891e-09,
1523
+ "loss": 1.507,
1524
+ "step": 1060
1525
+ },
1526
+ {
1527
+ "epoch": 2.9098360655737707,
1528
+ "grad_norm": 1.7734375,
1529
+ "learning_rate": 4.470464727814538e-09,
1530
+ "loss": 1.5119,
1531
+ "step": 1065
1532
+ },
1533
+ {
1534
+ "epoch": 2.9234972677595628,
1535
+ "grad_norm": 1.7734375,
1536
+ "learning_rate": 3.219078458127078e-09,
1537
+ "loss": 1.5072,
1538
+ "step": 1070
1539
+ },
1540
+ {
1541
+ "epoch": 2.9371584699453552,
1542
+ "grad_norm": 1.6328125,
1543
+ "learning_rate": 2.172436068252792e-09,
1544
+ "loss": 1.5074,
1545
+ "step": 1075
1546
+ },
1547
+ {
1548
+ "epoch": 2.9508196721311473,
1549
+ "grad_norm": 1.6171875,
1550
+ "learning_rate": 1.330752543871161e-09,
1551
+ "loss": 1.4947,
1552
+ "step": 1080
1553
+ },
1554
+ {
1555
+ "epoch": 2.9644808743169397,
1556
+ "grad_norm": 1.671875,
1557
+ "learning_rate": 6.942007710665221e-10,
1558
+ "loss": 1.5025,
1559
+ "step": 1085
1560
+ },
1561
+ {
1562
+ "epoch": 2.978142076502732,
1563
+ "grad_norm": 1.6640625,
1564
+ "learning_rate": 2.6291150081603207e-10,
1565
+ "loss": 1.5014,
1566
+ "step": 1090
1567
+ },
1568
+ {
1569
+ "epoch": 2.9918032786885247,
1570
+ "grad_norm": 1.609375,
1571
+ "learning_rate": 3.697332213348225e-11,
1572
+ "loss": 1.4964,
1573
+ "step": 1095
1574
  }
1575
  ],
1576
  "logging_steps": 5,
 
1585
  "should_evaluate": false,
1586
  "should_log": false,
1587
  "should_save": true,
1588
+ "should_training_stop": true
1589
  },
1590
  "attributes": {}
1591
  }
1592
  },
1593
+ "total_flos": 7.187967826039144e+18,
1594
  "train_batch_size": 8,
1595
  "trial_name": null,
1596
  "trial_params": null