tyzhu committed on
Commit
564bdd6
·
verified ·
1 Parent(s): daf0df1

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -2
  2. all_results.json +10 -10
  3. eval_results.json +6 -6
  4. train_results.json +4 -4
  5. trainer_state.json +260 -260
README.md CHANGED
@@ -3,11 +3,23 @@ license: other
3
  base_model: Qwen/Qwen1.5-4B
4
  tags:
5
  - generated_from_trainer
 
 
6
  metrics:
7
  - accuracy
8
  model-index:
9
  - name: lmind_hotpot_train8000_eval7405_v1_doc_qa_Qwen_Qwen1.5-4B_lora2
10
- results: []
 
 
 
 
 
 
 
 
 
 
11
  library_name: peft
12
  ---
13
 
@@ -16,7 +28,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # lmind_hotpot_train8000_eval7405_v1_doc_qa_Qwen_Qwen1.5-4B_lora2
18
 
19
- This model is a fine-tuned version of [Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 3.6298
22
  - Accuracy: 0.5108
 
3
  base_model: Qwen/Qwen1.5-4B
4
  tags:
5
  - generated_from_trainer
6
+ datasets:
7
+ - tyzhu/lmind_hotpot_train8000_eval7405_v1_doc_qa
8
  metrics:
9
  - accuracy
10
  model-index:
11
  - name: lmind_hotpot_train8000_eval7405_v1_doc_qa_Qwen_Qwen1.5-4B_lora2
12
+ results:
13
+ - task:
14
+ name: Causal Language Modeling
15
+ type: text-generation
16
+ dataset:
17
+ name: tyzhu/lmind_hotpot_train8000_eval7405_v1_doc_qa
18
+ type: tyzhu/lmind_hotpot_train8000_eval7405_v1_doc_qa
19
+ metrics:
20
+ - name: Accuracy
21
+ type: accuracy
22
+ value: 0.5108253968253968
23
  library_name: peft
24
  ---
25
 
 
28
 
29
  # lmind_hotpot_train8000_eval7405_v1_doc_qa_Qwen_Qwen1.5-4B_lora2
30
 
31
+ This model is a fine-tuned version of [Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B) on the tyzhu/lmind_hotpot_train8000_eval7405_v1_doc_qa dataset.
32
  It achieves the following results on the evaluation set:
33
  - Loss: 3.6298
34
  - Accuracy: 0.5108
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 19.99770484278173,
3
- "eval_accuracy": 0.5088571428571429,
4
- "eval_loss": 3.6292061805725098,
5
- "eval_runtime": 5.7944,
6
  "eval_samples": 500,
7
- "eval_samples_per_second": 86.29,
8
- "eval_steps_per_second": 10.873,
9
- "perplexity": 37.68289132989432,
10
  "total_flos": 1.5027132150442885e+18,
11
- "train_loss": 0.24546462313859588,
12
- "train_runtime": 24136.6058,
13
  "train_samples": 34854,
14
- "train_samples_per_second": 28.881,
15
- "train_steps_per_second": 0.902
16
  }
 
1
  {
2
  "epoch": 19.99770484278173,
3
+ "eval_accuracy": 0.5108253968253968,
4
+ "eval_loss": 3.6298398971557617,
5
+ "eval_runtime": 6.2865,
6
  "eval_samples": 500,
7
+ "eval_samples_per_second": 79.535,
8
+ "eval_steps_per_second": 10.021,
9
+ "perplexity": 37.70677917129613,
10
  "total_flos": 1.5027132150442885e+18,
11
+ "train_loss": 0.2454464098857673,
12
+ "train_runtime": 25406.3112,
13
  "train_samples": 34854,
14
+ "train_samples_per_second": 27.437,
15
+ "train_steps_per_second": 0.857
16
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 19.99770484278173,
3
- "eval_accuracy": 0.5088571428571429,
4
- "eval_loss": 3.6292061805725098,
5
- "eval_runtime": 5.7944,
6
  "eval_samples": 500,
7
- "eval_samples_per_second": 86.29,
8
- "eval_steps_per_second": 10.873,
9
- "perplexity": 37.68289132989432
10
  }
 
1
  {
2
  "epoch": 19.99770484278173,
3
+ "eval_accuracy": 0.5108253968253968,
4
+ "eval_loss": 3.6298398971557617,
5
+ "eval_runtime": 6.2865,
6
  "eval_samples": 500,
7
+ "eval_samples_per_second": 79.535,
8
+ "eval_steps_per_second": 10.021,
9
+ "perplexity": 37.70677917129613
10
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 19.99770484278173,
3
  "total_flos": 1.5027132150442885e+18,
4
- "train_loss": 0.24546462313859588,
5
- "train_runtime": 24136.6058,
6
  "train_samples": 34854,
7
- "train_samples_per_second": 28.881,
8
- "train_steps_per_second": 0.902
9
  }
 
1
  {
2
  "epoch": 19.99770484278173,
3
  "total_flos": 1.5027132150442885e+18,
4
+ "train_loss": 0.2454464098857673,
5
+ "train_runtime": 25406.3112,
6
  "train_samples": 34854,
7
+ "train_samples_per_second": 27.437,
8
+ "train_steps_per_second": 0.857
9
  }
trainer_state.json CHANGED
@@ -863,858 +863,858 @@
863
  },
864
  {
865
  "epoch": 10.100986917603857,
866
- "grad_norm": 1.4361774921417236,
867
  "learning_rate": 0.0001,
868
  "loss": 0.6334,
869
  "step": 11000
870
  },
871
  {
872
  "epoch": 10.192793206334635,
873
- "grad_norm": 1.2038971185684204,
874
  "learning_rate": 0.0001,
875
- "loss": 0.6482,
876
  "step": 11100
877
  },
878
  {
879
  "epoch": 10.284599495065413,
880
- "grad_norm": 1.1092814207077026,
881
  "learning_rate": 0.0001,
882
  "loss": 0.6724,
883
  "step": 11200
884
  },
885
  {
886
  "epoch": 10.37640578379619,
887
- "grad_norm": 1.1717180013656616,
888
  "learning_rate": 0.0001,
889
  "loss": 0.6647,
890
  "step": 11300
891
  },
892
  {
893
  "epoch": 10.468212072526969,
894
- "grad_norm": 1.4970725774765015,
895
  "learning_rate": 0.0001,
896
  "loss": 0.6738,
897
  "step": 11400
898
  },
899
  {
900
  "epoch": 10.560018361257747,
901
- "grad_norm": 1.2861392498016357,
902
  "learning_rate": 0.0001,
903
- "loss": 0.677,
904
  "step": 11500
905
  },
906
  {
907
  "epoch": 10.651824649988525,
908
- "grad_norm": 1.4063390493392944,
909
  "learning_rate": 0.0001,
910
- "loss": 0.6828,
911
  "step": 11600
912
  },
913
  {
914
  "epoch": 10.743630938719303,
915
- "grad_norm": 1.2314549684524536,
916
  "learning_rate": 0.0001,
917
- "loss": 0.6785,
918
  "step": 11700
919
  },
920
  {
921
  "epoch": 10.83543722745008,
922
- "grad_norm": 1.128456473350525,
923
  "learning_rate": 0.0001,
924
- "loss": 0.7077,
925
  "step": 11800
926
  },
927
  {
928
  "epoch": 10.927243516180859,
929
- "grad_norm": 1.0211490392684937,
930
  "learning_rate": 0.0001,
931
  "loss": 0.6899,
932
  "step": 11900
933
  },
934
  {
935
  "epoch": 10.999770484278173,
936
- "eval_accuracy": 0.5075238095238095,
937
- "eval_loss": 3.3183186054229736,
938
- "eval_runtime": 5.3999,
939
- "eval_samples_per_second": 92.595,
940
- "eval_steps_per_second": 11.667,
941
  "step": 11979
942
  },
943
  {
944
  "epoch": 11.019049804911637,
945
- "grad_norm": 1.0713156461715698,
946
  "learning_rate": 0.0001,
947
- "loss": 0.6723,
948
  "step": 12000
949
  },
950
  {
951
  "epoch": 11.110856093642415,
952
- "grad_norm": 1.1805211305618286,
953
  "learning_rate": 0.0001,
954
- "loss": 0.5788,
955
  "step": 12100
956
  },
957
  {
958
  "epoch": 11.202662382373193,
959
- "grad_norm": 1.2431073188781738,
960
  "learning_rate": 0.0001,
961
- "loss": 0.5996,
962
  "step": 12200
963
  },
964
  {
965
  "epoch": 11.29446867110397,
966
- "grad_norm": 1.3710986375808716,
967
  "learning_rate": 0.0001,
968
- "loss": 0.5991,
969
  "step": 12300
970
  },
971
  {
972
  "epoch": 11.386274959834749,
973
- "grad_norm": 1.3145737648010254,
974
  "learning_rate": 0.0001,
975
- "loss": 0.6217,
976
  "step": 12400
977
  },
978
  {
979
  "epoch": 11.478081248565527,
980
- "grad_norm": 1.4527884721755981,
981
  "learning_rate": 0.0001,
982
- "loss": 0.6101,
983
  "step": 12500
984
  },
985
  {
986
  "epoch": 11.569887537296305,
987
- "grad_norm": 1.8169633150100708,
988
  "learning_rate": 0.0001,
989
- "loss": 0.6158,
990
  "step": 12600
991
  },
992
  {
993
  "epoch": 11.661693826027083,
994
- "grad_norm": 1.3203802108764648,
995
  "learning_rate": 0.0001,
996
- "loss": 0.6283,
997
  "step": 12700
998
  },
999
  {
1000
  "epoch": 11.75350011475786,
1001
- "grad_norm": 1.4981048107147217,
1002
  "learning_rate": 0.0001,
1003
- "loss": 0.6286,
1004
  "step": 12800
1005
  },
1006
  {
1007
  "epoch": 11.845306403488639,
1008
- "grad_norm": 1.4082999229431152,
1009
  "learning_rate": 0.0001,
1010
- "loss": 0.6398,
1011
  "step": 12900
1012
  },
1013
  {
1014
  "epoch": 11.937112692219417,
1015
- "grad_norm": 1.158374309539795,
1016
  "learning_rate": 0.0001,
1017
- "loss": 0.6435,
1018
  "step": 13000
1019
  },
1020
  {
1021
  "epoch": 11.999540968556346,
1022
- "eval_accuracy": 0.5103174603174603,
1023
- "eval_loss": 3.3756327629089355,
1024
- "eval_runtime": 5.7365,
1025
- "eval_samples_per_second": 87.161,
1026
- "eval_steps_per_second": 10.982,
1027
  "step": 13068
1028
  },
1029
  {
1030
  "epoch": 12.028918980950195,
1031
- "grad_norm": 1.4804699420928955,
1032
  "learning_rate": 0.0001,
1033
- "loss": 0.6188,
1034
  "step": 13100
1035
  },
1036
  {
1037
  "epoch": 12.120725269680973,
1038
- "grad_norm": 1.203269362449646,
1039
  "learning_rate": 0.0001,
1040
- "loss": 0.5278,
1041
  "step": 13200
1042
  },
1043
  {
1044
  "epoch": 12.21253155841175,
1045
- "grad_norm": 1.3879435062408447,
1046
  "learning_rate": 0.0001,
1047
- "loss": 0.5469,
1048
  "step": 13300
1049
  },
1050
  {
1051
  "epoch": 12.304337847142529,
1052
- "grad_norm": 1.1464476585388184,
1053
  "learning_rate": 0.0001,
1054
- "loss": 0.5551,
1055
  "step": 13400
1056
  },
1057
  {
1058
  "epoch": 12.396144135873307,
1059
- "grad_norm": 1.2823903560638428,
1060
  "learning_rate": 0.0001,
1061
  "loss": 0.5665,
1062
  "step": 13500
1063
  },
1064
  {
1065
  "epoch": 12.487950424604085,
1066
- "grad_norm": 1.3198871612548828,
1067
  "learning_rate": 0.0001,
1068
- "loss": 0.5655,
1069
  "step": 13600
1070
  },
1071
  {
1072
  "epoch": 12.579756713334863,
1073
- "grad_norm": 1.573085904121399,
1074
  "learning_rate": 0.0001,
1075
- "loss": 0.5756,
1076
  "step": 13700
1077
  },
1078
  {
1079
  "epoch": 12.671563002065641,
1080
- "grad_norm": 1.3534272909164429,
1081
  "learning_rate": 0.0001,
1082
- "loss": 0.5762,
1083
  "step": 13800
1084
  },
1085
  {
1086
  "epoch": 12.763369290796419,
1087
- "grad_norm": 1.3139435052871704,
1088
  "learning_rate": 0.0001,
1089
- "loss": 0.5905,
1090
  "step": 13900
1091
  },
1092
  {
1093
  "epoch": 12.855175579527197,
1094
- "grad_norm": 1.867145299911499,
1095
  "learning_rate": 0.0001,
1096
- "loss": 0.5823,
1097
  "step": 14000
1098
  },
1099
  {
1100
  "epoch": 12.946981868257975,
1101
- "grad_norm": 1.517114281654358,
1102
  "learning_rate": 0.0001,
1103
- "loss": 0.6043,
1104
  "step": 14100
1105
  },
1106
  {
1107
  "epoch": 12.999311452834519,
1108
- "eval_accuracy": 0.5099047619047619,
1109
- "eval_loss": 3.3886983394622803,
1110
- "eval_runtime": 6.1125,
1111
- "eval_samples_per_second": 81.799,
1112
- "eval_steps_per_second": 10.307,
1113
  "step": 14157
1114
  },
1115
  {
1116
  "epoch": 13.038788156988753,
1117
- "grad_norm": 1.178884506225586,
1118
  "learning_rate": 0.0001,
1119
- "loss": 0.5569,
1120
  "step": 14200
1121
  },
1122
  {
1123
  "epoch": 13.130594445719531,
1124
- "grad_norm": 1.2415622472763062,
1125
  "learning_rate": 0.0001,
1126
  "loss": 0.4959,
1127
  "step": 14300
1128
  },
1129
  {
1130
  "epoch": 13.22240073445031,
1131
- "grad_norm": 1.6084754467010498,
1132
  "learning_rate": 0.0001,
1133
  "loss": 0.5088,
1134
  "step": 14400
1135
  },
1136
  {
1137
  "epoch": 13.314207023181089,
1138
- "grad_norm": 1.3151100873947144,
1139
  "learning_rate": 0.0001,
1140
- "loss": 0.5134,
1141
  "step": 14500
1142
  },
1143
  {
1144
  "epoch": 13.406013311911867,
1145
- "grad_norm": 1.323893427848816,
1146
  "learning_rate": 0.0001,
1147
- "loss": 0.5156,
1148
  "step": 14600
1149
  },
1150
  {
1151
  "epoch": 13.497819600642645,
1152
- "grad_norm": 1.1941570043563843,
1153
  "learning_rate": 0.0001,
1154
- "loss": 0.5291,
1155
  "step": 14700
1156
  },
1157
  {
1158
  "epoch": 13.589625889373423,
1159
- "grad_norm": 1.3182090520858765,
1160
  "learning_rate": 0.0001,
1161
- "loss": 0.5307,
1162
  "step": 14800
1163
  },
1164
  {
1165
  "epoch": 13.6814321781042,
1166
- "grad_norm": 1.2716799974441528,
1167
  "learning_rate": 0.0001,
1168
- "loss": 0.5365,
1169
  "step": 14900
1170
  },
1171
  {
1172
  "epoch": 13.773238466834979,
1173
- "grad_norm": 1.4822237491607666,
1174
  "learning_rate": 0.0001,
1175
- "loss": 0.5402,
1176
  "step": 15000
1177
  },
1178
  {
1179
  "epoch": 13.865044755565757,
1180
- "grad_norm": 1.7227895259857178,
1181
  "learning_rate": 0.0001,
1182
- "loss": 0.536,
1183
  "step": 15100
1184
  },
1185
  {
1186
  "epoch": 13.956851044296535,
1187
- "grad_norm": 2.5236029624938965,
1188
  "learning_rate": 0.0001,
1189
- "loss": 0.5504,
1190
  "step": 15200
1191
  },
1192
  {
1193
  "epoch": 14.0,
1194
- "eval_accuracy": 0.5090793650793651,
1195
- "eval_loss": 3.4403011798858643,
1196
- "eval_runtime": 5.8756,
1197
- "eval_samples_per_second": 85.098,
1198
- "eval_steps_per_second": 10.722,
1199
  "step": 15247
1200
  },
1201
  {
1202
  "epoch": 14.048657333027313,
1203
- "grad_norm": 2.555696725845337,
1204
  "learning_rate": 0.0001,
1205
- "loss": 0.5034,
1206
  "step": 15300
1207
  },
1208
  {
1209
  "epoch": 14.14046362175809,
1210
- "grad_norm": 1.1738946437835693,
1211
  "learning_rate": 0.0001,
1212
  "loss": 0.4616,
1213
  "step": 15400
1214
  },
1215
  {
1216
  "epoch": 14.232269910488869,
1217
- "grad_norm": 1.3358529806137085,
1218
  "learning_rate": 0.0001,
1219
- "loss": 0.4724,
1220
  "step": 15500
1221
  },
1222
  {
1223
  "epoch": 14.324076199219647,
1224
- "grad_norm": 1.310691237449646,
1225
  "learning_rate": 0.0001,
1226
- "loss": 0.4738,
1227
  "step": 15600
1228
  },
1229
  {
1230
  "epoch": 14.415882487950425,
1231
- "grad_norm": 1.6306800842285156,
1232
  "learning_rate": 0.0001,
1233
  "loss": 0.4854,
1234
  "step": 15700
1235
  },
1236
  {
1237
  "epoch": 14.507688776681203,
1238
- "grad_norm": 1.1620906591415405,
1239
  "learning_rate": 0.0001,
1240
- "loss": 0.4915,
1241
  "step": 15800
1242
  },
1243
  {
1244
  "epoch": 14.59949506541198,
1245
- "grad_norm": 1.4679458141326904,
1246
  "learning_rate": 0.0001,
1247
- "loss": 0.4908,
1248
  "step": 15900
1249
  },
1250
  {
1251
  "epoch": 14.691301354142759,
1252
- "grad_norm": 1.4514496326446533,
1253
  "learning_rate": 0.0001,
1254
- "loss": 0.4924,
1255
  "step": 16000
1256
  },
1257
  {
1258
  "epoch": 14.783107642873537,
1259
- "grad_norm": 1.4150872230529785,
1260
  "learning_rate": 0.0001,
1261
- "loss": 0.5083,
1262
  "step": 16100
1263
  },
1264
  {
1265
  "epoch": 14.874913931604315,
1266
- "grad_norm": 1.4393386840820312,
1267
  "learning_rate": 0.0001,
1268
- "loss": 0.5076,
1269
  "step": 16200
1270
  },
1271
  {
1272
  "epoch": 14.966720220335093,
1273
- "grad_norm": 1.4583277702331543,
1274
  "learning_rate": 0.0001,
1275
- "loss": 0.5091,
1276
  "step": 16300
1277
  },
1278
  {
1279
  "epoch": 14.999770484278173,
1280
- "eval_accuracy": 0.5091111111111111,
1281
- "eval_loss": 3.473123550415039,
1282
- "eval_runtime": 5.6077,
1283
- "eval_samples_per_second": 89.163,
1284
- "eval_steps_per_second": 11.235,
1285
  "step": 16336
1286
  },
1287
  {
1288
  "epoch": 15.05852650906587,
1289
- "grad_norm": 1.6890257596969604,
1290
  "learning_rate": 0.0001,
1291
- "loss": 0.4488,
1292
  "step": 16400
1293
  },
1294
  {
1295
  "epoch": 15.150332797796649,
1296
- "grad_norm": 1.2938202619552612,
1297
  "learning_rate": 0.0001,
1298
- "loss": 0.4334,
1299
  "step": 16500
1300
  },
1301
  {
1302
  "epoch": 15.242139086527427,
1303
- "grad_norm": 1.405485987663269,
1304
  "learning_rate": 0.0001,
1305
- "loss": 0.4258,
1306
  "step": 16600
1307
  },
1308
  {
1309
  "epoch": 15.333945375258205,
1310
- "grad_norm": 1.4180231094360352,
1311
  "learning_rate": 0.0001,
1312
- "loss": 0.4419,
1313
  "step": 16700
1314
  },
1315
  {
1316
  "epoch": 15.425751663988983,
1317
- "grad_norm": 1.2333263158798218,
1318
  "learning_rate": 0.0001,
1319
- "loss": 0.4518,
1320
  "step": 16800
1321
  },
1322
  {
1323
  "epoch": 15.517557952719761,
1324
- "grad_norm": 1.5479395389556885,
1325
  "learning_rate": 0.0001,
1326
- "loss": 0.4591,
1327
  "step": 16900
1328
  },
1329
  {
1330
  "epoch": 15.609364241450539,
1331
- "grad_norm": 1.4094911813735962,
1332
  "learning_rate": 0.0001,
1333
- "loss": 0.4608,
1334
  "step": 17000
1335
  },
1336
  {
1337
  "epoch": 15.701170530181317,
1338
- "grad_norm": 1.4307762384414673,
1339
  "learning_rate": 0.0001,
1340
- "loss": 0.4679,
1341
  "step": 17100
1342
  },
1343
  {
1344
  "epoch": 15.792976818912095,
1345
- "grad_norm": 1.5752439498901367,
1346
  "learning_rate": 0.0001,
1347
- "loss": 0.4781,
1348
  "step": 17200
1349
  },
1350
  {
1351
  "epoch": 15.884783107642873,
1352
- "grad_norm": 1.438144564628601,
1353
  "learning_rate": 0.0001,
1354
- "loss": 0.4766,
1355
  "step": 17300
1356
  },
1357
  {
1358
  "epoch": 15.976589396373651,
1359
- "grad_norm": 1.3546433448791504,
1360
  "learning_rate": 0.0001,
1361
- "loss": 0.4794,
1362
  "step": 17400
1363
  },
1364
  {
1365
  "epoch": 15.999540968556346,
1366
- "eval_accuracy": 0.5089206349206349,
1367
- "eval_loss": 3.517951250076294,
1368
- "eval_runtime": 6.4046,
1369
- "eval_samples_per_second": 78.069,
1370
- "eval_steps_per_second": 9.837,
1371
  "step": 17425
1372
  },
1373
  {
1374
  "epoch": 16.06839568510443,
1375
- "grad_norm": 1.139876127243042,
1376
  "learning_rate": 0.0001,
1377
- "loss": 0.4243,
1378
  "step": 17500
1379
  },
1380
  {
1381
  "epoch": 16.160201973835207,
1382
- "grad_norm": 1.5056086778640747,
1383
  "learning_rate": 0.0001,
1384
- "loss": 0.4015,
1385
  "step": 17600
1386
  },
1387
  {
1388
  "epoch": 16.252008262565987,
1389
- "grad_norm": 1.6973472833633423,
1390
  "learning_rate": 0.0001,
1391
- "loss": 0.4123,
1392
  "step": 17700
1393
  },
1394
  {
1395
  "epoch": 16.343814551296763,
1396
- "grad_norm": 1.3595877885818481,
1397
  "learning_rate": 0.0001,
1398
- "loss": 0.4129,
1399
  "step": 17800
1400
  },
1401
  {
1402
  "epoch": 16.435620840027543,
1403
- "grad_norm": 1.4490883350372314,
1404
  "learning_rate": 0.0001,
1405
- "loss": 0.4197,
1406
  "step": 17900
1407
  },
1408
  {
1409
  "epoch": 16.52742712875832,
1410
- "grad_norm": 1.9509937763214111,
1411
  "learning_rate": 0.0001,
1412
- "loss": 0.4274,
1413
  "step": 18000
1414
  },
1415
  {
1416
  "epoch": 16.6192334174891,
1417
- "grad_norm": 1.2110815048217773,
1418
  "learning_rate": 0.0001,
1419
- "loss": 0.4351,
1420
  "step": 18100
1421
  },
1422
  {
1423
  "epoch": 16.711039706219875,
1424
- "grad_norm": 1.4993358850479126,
1425
  "learning_rate": 0.0001,
1426
- "loss": 0.4369,
1427
  "step": 18200
1428
  },
1429
  {
1430
  "epoch": 16.802845994950655,
1431
- "grad_norm": 1.802101492881775,
1432
  "learning_rate": 0.0001,
1433
- "loss": 0.4432,
1434
  "step": 18300
1435
  },
1436
  {
1437
  "epoch": 16.89465228368143,
1438
- "grad_norm": 1.3281980752944946,
1439
  "learning_rate": 0.0001,
1440
- "loss": 0.4466,
1441
  "step": 18400
1442
  },
1443
  {
1444
  "epoch": 16.98645857241221,
1445
- "grad_norm": 1.52168869972229,
1446
  "learning_rate": 0.0001,
1447
- "loss": 0.4553,
1448
  "step": 18500
1449
  },
1450
  {
1451
  "epoch": 16.99931145283452,
1452
- "eval_accuracy": 0.5087619047619047,
1453
- "eval_loss": 3.555225133895874,
1454
- "eval_runtime": 6.0339,
1455
- "eval_samples_per_second": 82.865,
1456
- "eval_steps_per_second": 10.441,
1457
  "step": 18514
1458
  },
1459
  {
1460
  "epoch": 17.078264861142987,
1461
- "grad_norm": 1.1832448244094849,
1462
  "learning_rate": 0.0001,
1463
- "loss": 0.3783,
1464
  "step": 18600
1465
  },
1466
  {
1467
  "epoch": 17.170071149873767,
1468
- "grad_norm": 1.4022966623306274,
1469
  "learning_rate": 0.0001,
1470
- "loss": 0.3821,
1471
  "step": 18700
1472
  },
1473
  {
1474
  "epoch": 17.261877438604543,
1475
- "grad_norm": 1.237662672996521,
1476
  "learning_rate": 0.0001,
1477
- "loss": 0.3843,
1478
  "step": 18800
1479
  },
1480
  {
1481
  "epoch": 17.353683727335323,
1482
- "grad_norm": 1.3307769298553467,
1483
  "learning_rate": 0.0001,
1484
- "loss": 0.3988,
1485
  "step": 18900
1486
  },
1487
  {
1488
  "epoch": 17.4454900160661,
1489
- "grad_norm": 1.3363937139511108,
1490
  "learning_rate": 0.0001,
1491
- "loss": 0.4045,
1492
  "step": 19000
1493
  },
1494
  {
1495
  "epoch": 17.53729630479688,
1496
- "grad_norm": 1.421823501586914,
1497
  "learning_rate": 0.0001,
1498
- "loss": 0.4061,
1499
  "step": 19100
1500
  },
1501
  {
1502
  "epoch": 17.629102593527655,
1503
- "grad_norm": 1.4331218004226685,
1504
  "learning_rate": 0.0001,
1505
- "loss": 0.4049,
1506
  "step": 19200
1507
  },
1508
  {
1509
  "epoch": 17.720908882258435,
1510
- "grad_norm": 1.7339308261871338,
1511
  "learning_rate": 0.0001,
1512
- "loss": 0.4198,
1513
  "step": 19300
1514
  },
1515
  {
1516
  "epoch": 17.81271517098921,
1517
- "grad_norm": 1.3503917455673218,
1518
  "learning_rate": 0.0001,
1519
- "loss": 0.4187,
1520
  "step": 19400
1521
  },
1522
  {
1523
  "epoch": 17.90452145971999,
1524
- "grad_norm": 1.6054885387420654,
1525
  "learning_rate": 0.0001,
1526
- "loss": 0.4176,
1527
  "step": 19500
1528
  },
1529
  {
1530
  "epoch": 17.996327748450767,
1531
- "grad_norm": 1.571459412574768,
1532
  "learning_rate": 0.0001,
1533
- "loss": 0.4275,
1534
  "step": 19600
1535
  },
1536
  {
1537
  "epoch": 18.0,
1538
- "eval_accuracy": 0.5086349206349207,
1539
- "eval_loss": 3.6265830993652344,
1540
- "eval_runtime": 5.5465,
1541
- "eval_samples_per_second": 90.148,
1542
- "eval_steps_per_second": 11.359,
1543
  "step": 19604
1544
  },
1545
  {
1546
  "epoch": 18.088134037181547,
1547
- "grad_norm": 1.34700345993042,
1548
  "learning_rate": 0.0001,
1549
- "loss": 0.3492,
1550
  "step": 19700
1551
  },
1552
  {
1553
  "epoch": 18.179940325912327,
1554
- "grad_norm": 1.2837024927139282,
1555
  "learning_rate": 0.0001,
1556
- "loss": 0.3601,
1557
  "step": 19800
1558
  },
1559
  {
1560
  "epoch": 18.271746614643103,
1561
- "grad_norm": 1.8572251796722412,
1562
  "learning_rate": 0.0001,
1563
- "loss": 0.3727,
1564
  "step": 19900
1565
  },
1566
  {
1567
  "epoch": 18.363552903373883,
1568
- "grad_norm": 1.632940411567688,
1569
  "learning_rate": 0.0001,
1570
- "loss": 0.3704,
1571
  "step": 20000
1572
  },
1573
  {
1574
  "epoch": 18.45535919210466,
1575
- "grad_norm": 1.6444798707962036,
1576
  "learning_rate": 0.0001,
1577
- "loss": 0.3777,
1578
  "step": 20100
1579
  },
1580
  {
1581
  "epoch": 18.54716548083544,
1582
- "grad_norm": 1.6007230281829834,
1583
  "learning_rate": 0.0001,
1584
- "loss": 0.3798,
1585
  "step": 20200
1586
  },
1587
  {
1588
  "epoch": 18.638971769566215,
1589
- "grad_norm": 1.6421847343444824,
1590
  "learning_rate": 0.0001,
1591
- "loss": 0.387,
1592
  "step": 20300
1593
  },
1594
  {
1595
  "epoch": 18.730778058296995,
1596
- "grad_norm": 1.7184878587722778,
1597
  "learning_rate": 0.0001,
1598
- "loss": 0.3923,
1599
  "step": 20400
1600
  },
1601
  {
1602
  "epoch": 18.82258434702777,
1603
- "grad_norm": 1.5293742418289185,
1604
  "learning_rate": 0.0001,
1605
- "loss": 0.3974,
1606
  "step": 20500
1607
  },
1608
  {
1609
  "epoch": 18.91439063575855,
1610
- "grad_norm": 1.554432988166809,
1611
  "learning_rate": 0.0001,
1612
- "loss": 0.4086,
1613
  "step": 20600
1614
  },
1615
  {
1616
  "epoch": 18.999770484278173,
1617
- "eval_accuracy": 0.5072698412698413,
1618
- "eval_loss": 3.62077260017395,
1619
- "eval_runtime": 5.3313,
1620
- "eval_samples_per_second": 93.785,
1621
- "eval_steps_per_second": 11.817,
1622
  "step": 20693
1623
  },
1624
  {
1625
  "epoch": 19.006196924489327,
1626
- "grad_norm": 1.1894158124923706,
1627
  "learning_rate": 0.0001,
1628
- "loss": 0.3996,
1629
  "step": 20700
1630
  },
1631
  {
1632
  "epoch": 19.098003213220107,
1633
- "grad_norm": 1.4055882692337036,
1634
  "learning_rate": 0.0001,
1635
- "loss": 0.3328,
1636
  "step": 20800
1637
  },
1638
  {
1639
  "epoch": 19.189809501950883,
1640
- "grad_norm": 1.1423193216323853,
1641
  "learning_rate": 0.0001,
1642
- "loss": 0.3398,
1643
  "step": 20900
1644
  },
1645
  {
1646
  "epoch": 19.281615790681663,
1647
- "grad_norm": 1.5121031999588013,
1648
  "learning_rate": 0.0001,
1649
- "loss": 0.3494,
1650
  "step": 21000
1651
  },
1652
  {
1653
  "epoch": 19.37342207941244,
1654
- "grad_norm": 1.3979634046554565,
1655
  "learning_rate": 0.0001,
1656
- "loss": 0.3577,
1657
  "step": 21100
1658
  },
1659
  {
1660
  "epoch": 19.46522836814322,
1661
- "grad_norm": 1.574471354484558,
1662
  "learning_rate": 0.0001,
1663
- "loss": 0.3626,
1664
  "step": 21200
1665
  },
1666
  {
1667
  "epoch": 19.557034656873995,
1668
- "grad_norm": 1.3021608591079712,
1669
  "learning_rate": 0.0001,
1670
- "loss": 0.3629,
1671
  "step": 21300
1672
  },
1673
  {
1674
  "epoch": 19.648840945604775,
1675
- "grad_norm": 1.6208659410476685,
1676
  "learning_rate": 0.0001,
1677
- "loss": 0.3667,
1678
  "step": 21400
1679
  },
1680
  {
1681
  "epoch": 19.74064723433555,
1682
- "grad_norm": 1.689060091972351,
1683
  "learning_rate": 0.0001,
1684
- "loss": 0.3773,
1685
  "step": 21500
1686
  },
1687
  {
1688
  "epoch": 19.83245352306633,
1689
- "grad_norm": 1.8210320472717285,
1690
  "learning_rate": 0.0001,
1691
- "loss": 0.3891,
1692
  "step": 21600
1693
  },
1694
  {
1695
  "epoch": 19.924259811797107,
1696
- "grad_norm": 1.6032166481018066,
1697
  "learning_rate": 0.0001,
1698
- "loss": 0.3824,
1699
  "step": 21700
1700
  },
1701
  {
1702
  "epoch": 19.99770484278173,
1703
- "eval_accuracy": 0.5088571428571429,
1704
- "eval_loss": 3.6292061805725098,
1705
- "eval_runtime": 5.3221,
1706
- "eval_samples_per_second": 93.949,
1707
- "eval_steps_per_second": 11.838,
1708
  "step": 21780
1709
  },
1710
  {
1711
  "epoch": 19.99770484278173,
1712
  "step": 21780,
1713
  "total_flos": 1.5027132150442885e+18,
1714
- "train_loss": 0.24546462313859588,
1715
- "train_runtime": 24136.6058,
1716
- "train_samples_per_second": 28.881,
1717
- "train_steps_per_second": 0.902
1718
  }
1719
  ],
1720
  "logging_steps": 100,
 
863
  },
864
  {
865
  "epoch": 10.100986917603857,
866
+ "grad_norm": 1.4835282564163208,
867
  "learning_rate": 0.0001,
868
  "loss": 0.6334,
869
  "step": 11000
870
  },
871
  {
872
  "epoch": 10.192793206334635,
873
+ "grad_norm": 1.2020999193191528,
874
  "learning_rate": 0.0001,
875
+ "loss": 0.6483,
876
  "step": 11100
877
  },
878
  {
879
  "epoch": 10.284599495065413,
880
+ "grad_norm": 1.110913872718811,
881
  "learning_rate": 0.0001,
882
  "loss": 0.6724,
883
  "step": 11200
884
  },
885
  {
886
  "epoch": 10.37640578379619,
887
+ "grad_norm": 1.1754395961761475,
888
  "learning_rate": 0.0001,
889
  "loss": 0.6647,
890
  "step": 11300
891
  },
892
  {
893
  "epoch": 10.468212072526969,
894
+ "grad_norm": 1.4345954656600952,
895
  "learning_rate": 0.0001,
896
  "loss": 0.6738,
897
  "step": 11400
898
  },
899
  {
900
  "epoch": 10.560018361257747,
901
+ "grad_norm": 1.3362641334533691,
902
  "learning_rate": 0.0001,
903
+ "loss": 0.6769,
904
  "step": 11500
905
  },
906
  {
907
  "epoch": 10.651824649988525,
908
+ "grad_norm": 1.369678258895874,
909
  "learning_rate": 0.0001,
910
+ "loss": 0.6831,
911
  "step": 11600
912
  },
913
  {
914
  "epoch": 10.743630938719303,
915
+ "grad_norm": 1.195913553237915,
916
  "learning_rate": 0.0001,
917
+ "loss": 0.6787,
918
  "step": 11700
919
  },
920
  {
921
  "epoch": 10.83543722745008,
922
+ "grad_norm": 1.1389861106872559,
923
  "learning_rate": 0.0001,
924
+ "loss": 0.7081,
925
  "step": 11800
926
  },
927
  {
928
  "epoch": 10.927243516180859,
929
+ "grad_norm": 1.0441659688949585,
930
  "learning_rate": 0.0001,
931
  "loss": 0.6899,
932
  "step": 11900
933
  },
934
  {
935
  "epoch": 10.999770484278173,
936
+ "eval_accuracy": 0.5072063492063492,
937
+ "eval_loss": 3.3147079944610596,
938
+ "eval_runtime": 5.952,
939
+ "eval_samples_per_second": 84.006,
940
+ "eval_steps_per_second": 10.585,
941
  "step": 11979
942
  },
943
  {
944
  "epoch": 11.019049804911637,
945
+ "grad_norm": 1.0717581510543823,
946
  "learning_rate": 0.0001,
947
+ "loss": 0.6725,
948
  "step": 12000
949
  },
950
  {
951
  "epoch": 11.110856093642415,
952
+ "grad_norm": 1.109116554260254,
953
  "learning_rate": 0.0001,
954
+ "loss": 0.5792,
955
  "step": 12100
956
  },
957
  {
958
  "epoch": 11.202662382373193,
959
+ "grad_norm": 1.1636066436767578,
960
  "learning_rate": 0.0001,
961
+ "loss": 0.6003,
962
  "step": 12200
963
  },
964
  {
965
  "epoch": 11.29446867110397,
966
+ "grad_norm": 1.293394923210144,
967
  "learning_rate": 0.0001,
968
+ "loss": 0.5985,
969
  "step": 12300
970
  },
971
  {
972
  "epoch": 11.386274959834749,
973
+ "grad_norm": 1.3064810037612915,
974
  "learning_rate": 0.0001,
975
+ "loss": 0.6213,
976
  "step": 12400
977
  },
978
  {
979
  "epoch": 11.478081248565527,
980
+ "grad_norm": 1.4149938821792603,
981
  "learning_rate": 0.0001,
982
+ "loss": 0.6099,
983
  "step": 12500
984
  },
985
  {
986
  "epoch": 11.569887537296305,
987
+ "grad_norm": 1.7333831787109375,
988
  "learning_rate": 0.0001,
989
+ "loss": 0.6153,
990
  "step": 12600
991
  },
992
  {
993
  "epoch": 11.661693826027083,
994
+ "grad_norm": 1.2699064016342163,
995
  "learning_rate": 0.0001,
996
+ "loss": 0.6276,
997
  "step": 12700
998
  },
999
  {
1000
  "epoch": 11.75350011475786,
1001
+ "grad_norm": 1.5278961658477783,
1002
  "learning_rate": 0.0001,
1003
+ "loss": 0.6276,
1004
  "step": 12800
1005
  },
1006
  {
1007
  "epoch": 11.845306403488639,
1008
+ "grad_norm": 1.4876160621643066,
1009
  "learning_rate": 0.0001,
1010
+ "loss": 0.6402,
1011
  "step": 12900
1012
  },
1013
  {
1014
  "epoch": 11.937112692219417,
1015
+ "grad_norm": 1.173690676689148,
1016
  "learning_rate": 0.0001,
1017
+ "loss": 0.6427,
1018
  "step": 13000
1019
  },
1020
  {
1021
  "epoch": 11.999540968556346,
1022
+ "eval_accuracy": 0.5101269841269841,
1023
+ "eval_loss": 3.4025278091430664,
1024
+ "eval_runtime": 6.1807,
1025
+ "eval_samples_per_second": 80.897,
1026
+ "eval_steps_per_second": 10.193,
1027
  "step": 13068
1028
  },
1029
  {
1030
  "epoch": 12.028918980950195,
1031
+ "grad_norm": 1.4262442588806152,
1032
  "learning_rate": 0.0001,
1033
+ "loss": 0.6183,
1034
  "step": 13100
1035
  },
1036
  {
1037
  "epoch": 12.120725269680973,
1038
+ "grad_norm": 1.3088960647583008,
1039
  "learning_rate": 0.0001,
1040
+ "loss": 0.5263,
1041
  "step": 13200
1042
  },
1043
  {
1044
  "epoch": 12.21253155841175,
1045
+ "grad_norm": 1.416728138923645,
1046
  "learning_rate": 0.0001,
1047
+ "loss": 0.5468,
1048
  "step": 13300
1049
  },
1050
  {
1051
  "epoch": 12.304337847142529,
1052
+ "grad_norm": 1.216723918914795,
1053
  "learning_rate": 0.0001,
1054
+ "loss": 0.5546,
1055
  "step": 13400
1056
  },
1057
  {
1058
  "epoch": 12.396144135873307,
1059
+ "grad_norm": 1.2964662313461304,
1060
  "learning_rate": 0.0001,
1061
  "loss": 0.5665,
1062
  "step": 13500
1063
  },
1064
  {
1065
  "epoch": 12.487950424604085,
1066
+ "grad_norm": 1.2485017776489258,
1067
  "learning_rate": 0.0001,
1068
+ "loss": 0.5656,
1069
  "step": 13600
1070
  },
1071
  {
1072
  "epoch": 12.579756713334863,
1073
+ "grad_norm": 1.4807502031326294,
1074
  "learning_rate": 0.0001,
1075
+ "loss": 0.5751,
1076
  "step": 13700
1077
  },
1078
  {
1079
  "epoch": 12.671563002065641,
1080
+ "grad_norm": 1.2348891496658325,
1081
  "learning_rate": 0.0001,
1082
+ "loss": 0.5744,
1083
  "step": 13800
1084
  },
1085
  {
1086
  "epoch": 12.763369290796419,
1087
+ "grad_norm": 1.3481100797653198,
1088
  "learning_rate": 0.0001,
1089
+ "loss": 0.5904,
1090
  "step": 13900
1091
  },
1092
  {
1093
  "epoch": 12.855175579527197,
1094
+ "grad_norm": 1.5321805477142334,
1095
  "learning_rate": 0.0001,
1096
+ "loss": 0.5818,
1097
  "step": 14000
1098
  },
1099
  {
1100
  "epoch": 12.946981868257975,
1101
+ "grad_norm": 1.6983797550201416,
1102
  "learning_rate": 0.0001,
1103
+ "loss": 0.604,
1104
  "step": 14100
1105
  },
1106
  {
1107
  "epoch": 12.999311452834519,
1108
+ "eval_accuracy": 0.5102857142857142,
1109
+ "eval_loss": 3.390490770339966,
1110
+ "eval_runtime": 6.2076,
1111
+ "eval_samples_per_second": 80.546,
1112
+ "eval_steps_per_second": 10.149,
1113
  "step": 14157
1114
  },
1115
  {
1116
  "epoch": 13.038788156988753,
1117
+ "grad_norm": 1.336112141609192,
1118
  "learning_rate": 0.0001,
1119
+ "loss": 0.556,
1120
  "step": 14200
1121
  },
1122
  {
1123
  "epoch": 13.130594445719531,
1124
+ "grad_norm": 1.420715570449829,
1125
  "learning_rate": 0.0001,
1126
  "loss": 0.4959,
1127
  "step": 14300
1128
  },
1129
  {
1130
  "epoch": 13.22240073445031,
1131
+ "grad_norm": 1.3724342584609985,
1132
  "learning_rate": 0.0001,
1133
  "loss": 0.5088,
1134
  "step": 14400
1135
  },
1136
  {
1137
  "epoch": 13.314207023181089,
1138
+ "grad_norm": 1.2071592807769775,
1139
  "learning_rate": 0.0001,
1140
+ "loss": 0.5128,
1141
  "step": 14500
1142
  },
1143
  {
1144
  "epoch": 13.406013311911867,
1145
+ "grad_norm": 1.604084849357605,
1146
  "learning_rate": 0.0001,
1147
+ "loss": 0.5153,
1148
  "step": 14600
1149
  },
1150
  {
1151
  "epoch": 13.497819600642645,
1152
+ "grad_norm": 1.2753998041152954,
1153
  "learning_rate": 0.0001,
1154
+ "loss": 0.5299,
1155
  "step": 14700
1156
  },
1157
  {
1158
  "epoch": 13.589625889373423,
1159
+ "grad_norm": 1.4272007942199707,
1160
  "learning_rate": 0.0001,
1161
+ "loss": 0.5322,
1162
  "step": 14800
1163
  },
1164
  {
1165
  "epoch": 13.6814321781042,
1166
+ "grad_norm": 1.4577889442443848,
1167
  "learning_rate": 0.0001,
1168
+ "loss": 0.5368,
1169
  "step": 14900
1170
  },
1171
  {
1172
  "epoch": 13.773238466834979,
1173
+ "grad_norm": 2.467128038406372,
1174
  "learning_rate": 0.0001,
1175
+ "loss": 0.5413,
1176
  "step": 15000
1177
  },
1178
  {
1179
  "epoch": 13.865044755565757,
1180
+ "grad_norm": 1.6474366188049316,
1181
  "learning_rate": 0.0001,
1182
+ "loss": 0.5347,
1183
  "step": 15100
1184
  },
1185
  {
1186
  "epoch": 13.956851044296535,
1187
+ "grad_norm": 1.3380658626556396,
1188
  "learning_rate": 0.0001,
1189
+ "loss": 0.5507,
1190
  "step": 15200
1191
  },
1192
  {
1193
  "epoch": 14.0,
1194
+ "eval_accuracy": 0.5087936507936508,
1195
+ "eval_loss": 3.47397780418396,
1196
+ "eval_runtime": 5.9214,
1197
+ "eval_samples_per_second": 84.439,
1198
+ "eval_steps_per_second": 10.639,
1199
  "step": 15247
1200
  },
1201
  {
1202
  "epoch": 14.048657333027313,
1203
+ "grad_norm": 1.5243284702301025,
1204
  "learning_rate": 0.0001,
1205
+ "loss": 0.5031,
1206
  "step": 15300
1207
  },
1208
  {
1209
  "epoch": 14.14046362175809,
1210
+ "grad_norm": 1.3172513246536255,
1211
  "learning_rate": 0.0001,
1212
  "loss": 0.4616,
1213
  "step": 15400
1214
  },
1215
  {
1216
  "epoch": 14.232269910488869,
1217
+ "grad_norm": 1.3125884532928467,
1218
  "learning_rate": 0.0001,
1219
+ "loss": 0.4734,
1220
  "step": 15500
1221
  },
1222
  {
1223
  "epoch": 14.324076199219647,
1224
+ "grad_norm": 1.212815284729004,
1225
  "learning_rate": 0.0001,
1226
+ "loss": 0.4732,
1227
  "step": 15600
1228
  },
1229
  {
1230
  "epoch": 14.415882487950425,
1231
+ "grad_norm": 1.2093795537948608,
1232
  "learning_rate": 0.0001,
1233
  "loss": 0.4854,
1234
  "step": 15700
1235
  },
1236
  {
1237
  "epoch": 14.507688776681203,
1238
+ "grad_norm": 1.4243745803833008,
1239
  "learning_rate": 0.0001,
1240
+ "loss": 0.4928,
1241
  "step": 15800
1242
  },
1243
  {
1244
  "epoch": 14.59949506541198,
1245
+ "grad_norm": 1.4822219610214233,
1246
  "learning_rate": 0.0001,
1247
+ "loss": 0.4881,
1248
  "step": 15900
1249
  },
1250
  {
1251
  "epoch": 14.691301354142759,
1252
+ "grad_norm": 1.7908687591552734,
1253
  "learning_rate": 0.0001,
1254
+ "loss": 0.4939,
1255
  "step": 16000
1256
  },
1257
  {
1258
  "epoch": 14.783107642873537,
1259
+ "grad_norm": 1.3300451040267944,
1260
  "learning_rate": 0.0001,
1261
+ "loss": 0.5078,
1262
  "step": 16100
1263
  },
1264
  {
1265
  "epoch": 14.874913931604315,
1266
+ "grad_norm": 1.8892784118652344,
1267
  "learning_rate": 0.0001,
1268
+ "loss": 0.5084,
1269
  "step": 16200
1270
  },
1271
  {
1272
  "epoch": 14.966720220335093,
1273
+ "grad_norm": 1.3664839267730713,
1274
  "learning_rate": 0.0001,
1275
+ "loss": 0.5099,
1276
  "step": 16300
1277
  },
1278
  {
1279
  "epoch": 14.999770484278173,
1280
+ "eval_accuracy": 0.5085079365079365,
1281
+ "eval_loss": 3.477241277694702,
1282
+ "eval_runtime": 5.8872,
1283
+ "eval_samples_per_second": 84.93,
1284
+ "eval_steps_per_second": 10.701,
1285
  "step": 16336
1286
  },
1287
  {
1288
  "epoch": 15.05852650906587,
1289
+ "grad_norm": 1.5635462999343872,
1290
  "learning_rate": 0.0001,
1291
+ "loss": 0.4503,
1292
  "step": 16400
1293
  },
1294
  {
1295
  "epoch": 15.150332797796649,
1296
+ "grad_norm": 1.3071101903915405,
1297
  "learning_rate": 0.0001,
1298
+ "loss": 0.432,
1299
  "step": 16500
1300
  },
1301
  {
1302
  "epoch": 15.242139086527427,
1303
+ "grad_norm": 1.3490887880325317,
1304
  "learning_rate": 0.0001,
1305
+ "loss": 0.4278,
1306
  "step": 16600
1307
  },
1308
  {
1309
  "epoch": 15.333945375258205,
1310
+ "grad_norm": 1.2852087020874023,
1311
  "learning_rate": 0.0001,
1312
+ "loss": 0.4425,
1313
  "step": 16700
1314
  },
1315
  {
1316
  "epoch": 15.425751663988983,
1317
+ "grad_norm": 1.3694721460342407,
1318
  "learning_rate": 0.0001,
1319
+ "loss": 0.4509,
1320
  "step": 16800
1321
  },
1322
  {
1323
  "epoch": 15.517557952719761,
1324
+ "grad_norm": 1.6945849657058716,
1325
  "learning_rate": 0.0001,
1326
+ "loss": 0.459,
1327
  "step": 16900
1328
  },
1329
  {
1330
  "epoch": 15.609364241450539,
1331
+ "grad_norm": 1.1728637218475342,
1332
  "learning_rate": 0.0001,
1333
+ "loss": 0.4627,
1334
  "step": 17000
1335
  },
1336
  {
1337
  "epoch": 15.701170530181317,
1338
+ "grad_norm": 1.4016026258468628,
1339
  "learning_rate": 0.0001,
1340
+ "loss": 0.469,
1341
  "step": 17100
1342
  },
1343
  {
1344
  "epoch": 15.792976818912095,
1345
+ "grad_norm": 1.5266894102096558,
1346
  "learning_rate": 0.0001,
1347
+ "loss": 0.4761,
1348
  "step": 17200
1349
  },
1350
  {
1351
  "epoch": 15.884783107642873,
1352
+ "grad_norm": 1.6413285732269287,
1353
  "learning_rate": 0.0001,
1354
+ "loss": 0.4772,
1355
  "step": 17300
1356
  },
1357
  {
1358
  "epoch": 15.976589396373651,
1359
+ "grad_norm": 1.3661378622055054,
1360
  "learning_rate": 0.0001,
1361
+ "loss": 0.478,
1362
  "step": 17400
1363
  },
1364
  {
1365
  "epoch": 15.999540968556346,
1366
+ "eval_accuracy": 0.5088253968253968,
1367
+ "eval_loss": 3.5259175300598145,
1368
+ "eval_runtime": 6.3422,
1369
+ "eval_samples_per_second": 78.837,
1370
+ "eval_steps_per_second": 9.933,
1371
  "step": 17425
1372
  },
1373
  {
1374
  "epoch": 16.06839568510443,
1375
+ "grad_norm": 1.3857972621917725,
1376
  "learning_rate": 0.0001,
1377
+ "loss": 0.4229,
1378
  "step": 17500
1379
  },
1380
  {
1381
  "epoch": 16.160201973835207,
1382
+ "grad_norm": 1.477820634841919,
1383
  "learning_rate": 0.0001,
1384
+ "loss": 0.4018,
1385
  "step": 17600
1386
  },
1387
  {
1388
  "epoch": 16.252008262565987,
1389
+ "grad_norm": 1.4332579374313354,
1390
  "learning_rate": 0.0001,
1391
+ "loss": 0.4122,
1392
  "step": 17700
1393
  },
1394
  {
1395
  "epoch": 16.343814551296763,
1396
+ "grad_norm": 1.5828882455825806,
1397
  "learning_rate": 0.0001,
1398
+ "loss": 0.4154,
1399
  "step": 17800
1400
  },
1401
  {
1402
  "epoch": 16.435620840027543,
1403
+ "grad_norm": 1.2645188570022583,
1404
  "learning_rate": 0.0001,
1405
+ "loss": 0.4198,
1406
  "step": 17900
1407
  },
1408
  {
1409
  "epoch": 16.52742712875832,
1410
+ "grad_norm": 1.6579480171203613,
1411
  "learning_rate": 0.0001,
1412
+ "loss": 0.4271,
1413
  "step": 18000
1414
  },
1415
  {
1416
  "epoch": 16.6192334174891,
1417
+ "grad_norm": 1.538338541984558,
1418
  "learning_rate": 0.0001,
1419
+ "loss": 0.4348,
1420
  "step": 18100
1421
  },
1422
  {
1423
  "epoch": 16.711039706219875,
1424
+ "grad_norm": 1.3948062658309937,
1425
  "learning_rate": 0.0001,
1426
+ "loss": 0.4365,
1427
  "step": 18200
1428
  },
1429
  {
1430
  "epoch": 16.802845994950655,
1431
+ "grad_norm": 1.422324776649475,
1432
  "learning_rate": 0.0001,
1433
+ "loss": 0.442,
1434
  "step": 18300
1435
  },
1436
  {
1437
  "epoch": 16.89465228368143,
1438
+ "grad_norm": 1.2586045265197754,
1439
  "learning_rate": 0.0001,
1440
+ "loss": 0.4483,
1441
  "step": 18400
1442
  },
1443
  {
1444
  "epoch": 16.98645857241221,
1445
+ "grad_norm": 1.5145964622497559,
1446
  "learning_rate": 0.0001,
1447
+ "loss": 0.4545,
1448
  "step": 18500
1449
  },
1450
  {
1451
  "epoch": 16.99931145283452,
1452
+ "eval_accuracy": 0.5093650793650794,
1453
+ "eval_loss": 3.5390663146972656,
1454
+ "eval_runtime": 5.9935,
1455
+ "eval_samples_per_second": 83.424,
1456
+ "eval_steps_per_second": 10.511,
1457
  "step": 18514
1458
  },
1459
  {
1460
  "epoch": 17.078264861142987,
1461
+ "grad_norm": 1.3872510194778442,
1462
  "learning_rate": 0.0001,
1463
+ "loss": 0.3784,
1464
  "step": 18600
1465
  },
1466
  {
1467
  "epoch": 17.170071149873767,
1468
+ "grad_norm": 1.2367671728134155,
1469
  "learning_rate": 0.0001,
1470
+ "loss": 0.3818,
1471
  "step": 18700
1472
  },
1473
  {
1474
  "epoch": 17.261877438604543,
1475
+ "grad_norm": 1.5146794319152832,
1476
  "learning_rate": 0.0001,
1477
+ "loss": 0.3842,
1478
  "step": 18800
1479
  },
1480
  {
1481
  "epoch": 17.353683727335323,
1482
+ "grad_norm": 1.4367637634277344,
1483
  "learning_rate": 0.0001,
1484
+ "loss": 0.3983,
1485
  "step": 18900
1486
  },
1487
  {
1488
  "epoch": 17.4454900160661,
1489
+ "grad_norm": 1.3474266529083252,
1490
  "learning_rate": 0.0001,
1491
+ "loss": 0.4028,
1492
  "step": 19000
1493
  },
1494
  {
1495
  "epoch": 17.53729630479688,
1496
+ "grad_norm": 1.5168508291244507,
1497
  "learning_rate": 0.0001,
1498
+ "loss": 0.4064,
1499
  "step": 19100
1500
  },
1501
  {
1502
  "epoch": 17.629102593527655,
1503
+ "grad_norm": 1.4708962440490723,
1504
  "learning_rate": 0.0001,
1505
+ "loss": 0.4062,
1506
  "step": 19200
1507
  },
1508
  {
1509
  "epoch": 17.720908882258435,
1510
+ "grad_norm": 1.3981653451919556,
1511
  "learning_rate": 0.0001,
1512
+ "loss": 0.4194,
1513
  "step": 19300
1514
  },
1515
  {
1516
  "epoch": 17.81271517098921,
1517
+ "grad_norm": 1.739737868309021,
1518
  "learning_rate": 0.0001,
1519
+ "loss": 0.4167,
1520
  "step": 19400
1521
  },
1522
  {
1523
  "epoch": 17.90452145971999,
1524
+ "grad_norm": 1.5967693328857422,
1525
  "learning_rate": 0.0001,
1526
+ "loss": 0.4187,
1527
  "step": 19500
1528
  },
1529
  {
1530
  "epoch": 17.996327748450767,
1531
+ "grad_norm": 1.5139836072921753,
1532
  "learning_rate": 0.0001,
1533
+ "loss": 0.427,
1534
  "step": 19600
1535
  },
1536
  {
1537
  "epoch": 18.0,
1538
+ "eval_accuracy": 0.5095238095238095,
1539
+ "eval_loss": 3.588681697845459,
1540
+ "eval_runtime": 6.3123,
1541
+ "eval_samples_per_second": 79.211,
1542
+ "eval_steps_per_second": 9.981,
1543
  "step": 19604
1544
  },
1545
  {
1546
  "epoch": 18.088134037181547,
1547
+ "grad_norm": 1.3782016038894653,
1548
  "learning_rate": 0.0001,
1549
+ "loss": 0.3505,
1550
  "step": 19700
1551
  },
1552
  {
1553
  "epoch": 18.179940325912327,
1554
+ "grad_norm": 1.6956948041915894,
1555
  "learning_rate": 0.0001,
1556
+ "loss": 0.3589,
1557
  "step": 19800
1558
  },
1559
  {
1560
  "epoch": 18.271746614643103,
1561
+ "grad_norm": 1.4169180393218994,
1562
  "learning_rate": 0.0001,
1563
+ "loss": 0.3726,
1564
  "step": 19900
1565
  },
1566
  {
1567
  "epoch": 18.363552903373883,
1568
+ "grad_norm": 1.4360090494155884,
1569
  "learning_rate": 0.0001,
1570
+ "loss": 0.3706,
1571
  "step": 20000
1572
  },
1573
  {
1574
  "epoch": 18.45535919210466,
1575
+ "grad_norm": 1.3922706842422485,
1576
  "learning_rate": 0.0001,
1577
+ "loss": 0.3766,
1578
  "step": 20100
1579
  },
1580
  {
1581
  "epoch": 18.54716548083544,
1582
+ "grad_norm": 1.5002549886703491,
1583
  "learning_rate": 0.0001,
1584
+ "loss": 0.3794,
1585
  "step": 20200
1586
  },
1587
  {
1588
  "epoch": 18.638971769566215,
1589
+ "grad_norm": 1.7266395092010498,
1590
  "learning_rate": 0.0001,
1591
+ "loss": 0.3888,
1592
  "step": 20300
1593
  },
1594
  {
1595
  "epoch": 18.730778058296995,
1596
+ "grad_norm": 1.5613874197006226,
1597
  "learning_rate": 0.0001,
1598
+ "loss": 0.3937,
1599
  "step": 20400
1600
  },
1601
  {
1602
  "epoch": 18.82258434702777,
1603
+ "grad_norm": 1.5989686250686646,
1604
  "learning_rate": 0.0001,
1605
+ "loss": 0.3967,
1606
  "step": 20500
1607
  },
1608
  {
1609
  "epoch": 18.91439063575855,
1610
+ "grad_norm": 2.0064892768859863,
1611
  "learning_rate": 0.0001,
1612
+ "loss": 0.4083,
1613
  "step": 20600
1614
  },
1615
  {
1616
  "epoch": 18.999770484278173,
1617
+ "eval_accuracy": 0.5096507936507937,
1618
+ "eval_loss": 3.5945370197296143,
1619
+ "eval_runtime": 6.3499,
1620
+ "eval_samples_per_second": 78.742,
1621
+ "eval_steps_per_second": 9.921,
1622
  "step": 20693
1623
  },
1624
  {
1625
  "epoch": 19.006196924489327,
1626
+ "grad_norm": 1.1501438617706299,
1627
  "learning_rate": 0.0001,
1628
+ "loss": 0.3995,
1629
  "step": 20700
1630
  },
1631
  {
1632
  "epoch": 19.098003213220107,
1633
+ "grad_norm": 1.6791703701019287,
1634
  "learning_rate": 0.0001,
1635
+ "loss": 0.3333,
1636
  "step": 20800
1637
  },
1638
  {
1639
  "epoch": 19.189809501950883,
1640
+ "grad_norm": 1.3187992572784424,
1641
  "learning_rate": 0.0001,
1642
+ "loss": 0.3401,
1643
  "step": 20900
1644
  },
1645
  {
1646
  "epoch": 19.281615790681663,
1647
+ "grad_norm": 1.3106017112731934,
1648
  "learning_rate": 0.0001,
1649
+ "loss": 0.3503,
1650
  "step": 21000
1651
  },
1652
  {
1653
  "epoch": 19.37342207941244,
1654
+ "grad_norm": 1.3490264415740967,
1655
  "learning_rate": 0.0001,
1656
+ "loss": 0.3588,
1657
  "step": 21100
1658
  },
1659
  {
1660
  "epoch": 19.46522836814322,
1661
+ "grad_norm": 1.3568042516708374,
1662
  "learning_rate": 0.0001,
1663
+ "loss": 0.363,
1664
  "step": 21200
1665
  },
1666
  {
1667
  "epoch": 19.557034656873995,
1668
+ "grad_norm": 1.2495017051696777,
1669
  "learning_rate": 0.0001,
1670
+ "loss": 0.3604,
1671
  "step": 21300
1672
  },
1673
  {
1674
  "epoch": 19.648840945604775,
1675
+ "grad_norm": 1.6772491931915283,
1676
  "learning_rate": 0.0001,
1677
+ "loss": 0.367,
1678
  "step": 21400
1679
  },
1680
  {
1681
  "epoch": 19.74064723433555,
1682
+ "grad_norm": 1.5906344652175903,
1683
  "learning_rate": 0.0001,
1684
+ "loss": 0.3777,
1685
  "step": 21500
1686
  },
1687
  {
1688
  "epoch": 19.83245352306633,
1689
+ "grad_norm": 1.5872870683670044,
1690
  "learning_rate": 0.0001,
1691
+ "loss": 0.3892,
1692
  "step": 21600
1693
  },
1694
  {
1695
  "epoch": 19.924259811797107,
1696
+ "grad_norm": 1.5069637298583984,
1697
  "learning_rate": 0.0001,
1698
+ "loss": 0.3818,
1699
  "step": 21700
1700
  },
1701
  {
1702
  "epoch": 19.99770484278173,
1703
+ "eval_accuracy": 0.5108253968253968,
1704
+ "eval_loss": 3.6298398971557617,
1705
+ "eval_runtime": 5.9317,
1706
+ "eval_samples_per_second": 84.293,
1707
+ "eval_steps_per_second": 10.621,
1708
  "step": 21780
1709
  },
1710
  {
1711
  "epoch": 19.99770484278173,
1712
  "step": 21780,
1713
  "total_flos": 1.5027132150442885e+18,
1714
+ "train_loss": 0.2454464098857673,
1715
+ "train_runtime": 25406.3112,
1716
+ "train_samples_per_second": 27.437,
1717
+ "train_steps_per_second": 0.857
1718
  }
1719
  ],
1720
  "logging_steps": 100,