|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 25.0, |
|
"eval_steps": 500, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 4.823320729485639, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.7211, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 4.751505642660104, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.7397, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 4.487357353479602, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.5714, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.036784685130891, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.4755, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 2.7951107515008022, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 1.0114, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 3.3636166511290284, |
|
"learning_rate": 5e-05, |
|
"loss": 0.866, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 2.077830674223075, |
|
"learning_rate": 5.833333333333334e-05, |
|
"loss": 0.5604, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.1666324696634576, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.339, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.8956349805282244, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.2396, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 1.6517703519389952, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 0.2824, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 1.4629478660244202, |
|
"learning_rate": 9.166666666666667e-05, |
|
"loss": 0.247, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8768061602490982, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1893, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.8390983969336053, |
|
"learning_rate": 0.00010833333333333333, |
|
"loss": 0.2698, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.9480328834691413, |
|
"learning_rate": 0.00011666666666666668, |
|
"loss": 0.1832, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.9607185561225253, |
|
"learning_rate": 0.000125, |
|
"loss": 0.1928, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.4976206492811293, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.1599, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 0.6230231499462036, |
|
"learning_rate": 0.00014166666666666668, |
|
"loss": 0.2803, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.4748261671538313, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.1435, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 0.41444527359590055, |
|
"learning_rate": 0.00015833333333333332, |
|
"loss": 0.1798, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.458788407239296, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.1989, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 0.4563665244618309, |
|
"learning_rate": 0.000175, |
|
"loss": 0.1647, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.4105054285151717, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 0.1164, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 0.3562784794883969, |
|
"learning_rate": 0.00019166666666666667, |
|
"loss": 0.1172, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.38610424041542285, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1056, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.35527586407983786, |
|
"learning_rate": 0.00019999918050612108, |
|
"loss": 0.1217, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.3633401519435876, |
|
"learning_rate": 0.00019999672203791565, |
|
"loss": 0.1458, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 0.3603552448206864, |
|
"learning_rate": 0.00019999262463567773, |
|
"loss": 0.1261, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.3388971002892635, |
|
"learning_rate": 0.00019998688836656323, |
|
"loss": 0.1309, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 0.4061212900739158, |
|
"learning_rate": 0.0001999795133245889, |
|
"loss": 0.1091, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.39345006056668225, |
|
"learning_rate": 0.0001999704996306308, |
|
"loss": 0.1627, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 0.38142281255979, |
|
"learning_rate": 0.00019995984743242226, |
|
"loss": 0.1327, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.5305995721603599, |
|
"learning_rate": 0.00019994755690455152, |
|
"loss": 0.156, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 0.2546721306857309, |
|
"learning_rate": 0.00019993362824845875, |
|
"loss": 0.0744, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 0.3997709948362388, |
|
"learning_rate": 0.000199918061692433, |
|
"loss": 0.1605, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 0.39616508988629523, |
|
"learning_rate": 0.00019990085749160822, |
|
"loss": 0.1306, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.4619577933372822, |
|
"learning_rate": 0.0001998820159279591, |
|
"loss": 0.1318, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 0.2968290985862986, |
|
"learning_rate": 0.00019986153731029656, |
|
"loss": 0.0788, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 0.4062269625672033, |
|
"learning_rate": 0.0001998394219742627, |
|
"loss": 0.1207, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 0.36908209897646804, |
|
"learning_rate": 0.00019981567028232514, |
|
"loss": 0.1006, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.23625961054729414, |
|
"learning_rate": 0.00019979028262377118, |
|
"loss": 0.1083, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 0.3069294649987226, |
|
"learning_rate": 0.00019976325941470146, |
|
"loss": 0.0962, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 0.23363462936052565, |
|
"learning_rate": 0.00019973460109802305, |
|
"loss": 0.096, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 0.29106062929023385, |
|
"learning_rate": 0.0001997043081434423, |
|
"loss": 0.134, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.18087001511209616, |
|
"learning_rate": 0.00019967238104745696, |
|
"loss": 0.1031, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 0.24123960365724528, |
|
"learning_rate": 0.00019963882033334826, |
|
"loss": 0.1082, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 0.25054721428161647, |
|
"learning_rate": 0.00019960362655117218, |
|
"loss": 0.0798, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 0.5422539067423491, |
|
"learning_rate": 0.00019956680027775051, |
|
"loss": 0.1254, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.2565364589386019, |
|
"learning_rate": 0.0001995283421166614, |
|
"loss": 0.0979, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 0.215553765023481, |
|
"learning_rate": 0.00019948825269822934, |
|
"loss": 0.0656, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.20810732914954436, |
|
"learning_rate": 0.00019944653267951504, |
|
"loss": 0.0793, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 0.28614832461281026, |
|
"learning_rate": 0.00019940318274430449, |
|
"loss": 0.099, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.32282441297094233, |
|
"learning_rate": 0.00019935820360309777, |
|
"loss": 0.0958, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 0.1946365159456168, |
|
"learning_rate": 0.00019931159599309757, |
|
"loss": 0.0808, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 0.327313439667935, |
|
"learning_rate": 0.00019926336067819684, |
|
"loss": 0.081, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 0.25559584990032447, |
|
"learning_rate": 0.00019921349844896654, |
|
"loss": 0.0855, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.42835063024229314, |
|
"learning_rate": 0.00019916201012264254, |
|
"loss": 0.0863, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 0.4079608215806712, |
|
"learning_rate": 0.00019910889654311208, |
|
"loss": 0.0749, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 0.2658074683405381, |
|
"learning_rate": 0.00019905415858090036, |
|
"loss": 0.0829, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"grad_norm": 0.26176221704098207, |
|
"learning_rate": 0.00019899779713315575, |
|
"loss": 0.0711, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.4107999604774971, |
|
"learning_rate": 0.00019893981312363562, |
|
"loss": 0.0925, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"grad_norm": 0.28972083293965484, |
|
"learning_rate": 0.00019888020750269067, |
|
"loss": 0.1033, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 0.22772110434236376, |
|
"learning_rate": 0.00019881898124724981, |
|
"loss": 0.0858, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"grad_norm": 0.27554810738804436, |
|
"learning_rate": 0.0001987561353608038, |
|
"loss": 0.0988, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.33177229863886515, |
|
"learning_rate": 0.00019869167087338907, |
|
"loss": 0.0558, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"grad_norm": 0.307474933124812, |
|
"learning_rate": 0.00019862558884157068, |
|
"loss": 0.099, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"grad_norm": 0.2608601940862889, |
|
"learning_rate": 0.00019855789034842504, |
|
"loss": 0.0633, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"grad_norm": 0.17638905908873861, |
|
"learning_rate": 0.00019848857650352214, |
|
"loss": 0.0646, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.29040712597623347, |
|
"learning_rate": 0.00019841764844290744, |
|
"loss": 0.1065, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"grad_norm": 0.2712830927822363, |
|
"learning_rate": 0.00019834510732908315, |
|
"loss": 0.0829, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 0.23863314085657264, |
|
"learning_rate": 0.00019827095435098925, |
|
"loss": 0.0745, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"grad_norm": 0.1793388758656021, |
|
"learning_rate": 0.000198195190723984, |
|
"loss": 0.072, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.23980978867337854, |
|
"learning_rate": 0.0001981178176898239, |
|
"loss": 0.0838, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"grad_norm": 0.2043201014604402, |
|
"learning_rate": 0.0001980388365166436, |
|
"loss": 0.0632, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"grad_norm": 0.15180530742317308, |
|
"learning_rate": 0.0001979582484989348, |
|
"loss": 0.0424, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"grad_norm": 0.2547988574530317, |
|
"learning_rate": 0.00019787605495752528, |
|
"loss": 0.0851, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.1869380541972453, |
|
"learning_rate": 0.00019779225723955707, |
|
"loss": 0.0565, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"grad_norm": 0.2716796793411313, |
|
"learning_rate": 0.00019770685671846456, |
|
"loss": 0.0772, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"grad_norm": 0.17433079754490768, |
|
"learning_rate": 0.0001976198547939518, |
|
"loss": 0.0482, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"grad_norm": 0.18011278074792128, |
|
"learning_rate": 0.0001975312528919697, |
|
"loss": 0.0518, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.31968800498614336, |
|
"learning_rate": 0.00019744105246469263, |
|
"loss": 0.083, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"grad_norm": 0.2357655152058512, |
|
"learning_rate": 0.00019734925499049447, |
|
"loss": 0.069, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"grad_norm": 0.34723380394306, |
|
"learning_rate": 0.0001972558619739246, |
|
"loss": 0.0514, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"grad_norm": 0.21060705394218132, |
|
"learning_rate": 0.00019716087494568317, |
|
"loss": 0.0587, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.24633955772510746, |
|
"learning_rate": 0.00019706429546259593, |
|
"loss": 0.0534, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"grad_norm": 0.2306490127122376, |
|
"learning_rate": 0.00019696612510758876, |
|
"loss": 0.0585, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"grad_norm": 0.21690646083214937, |
|
"learning_rate": 0.00019686636548966178, |
|
"loss": 0.0545, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"grad_norm": 0.25251567374450357, |
|
"learning_rate": 0.00019676501824386294, |
|
"loss": 0.049, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.19640965019118795, |
|
"learning_rate": 0.00019666208503126112, |
|
"loss": 0.0419, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"grad_norm": 0.268693556790637, |
|
"learning_rate": 0.00019655756753891916, |
|
"loss": 0.0619, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"grad_norm": 0.18291834239390523, |
|
"learning_rate": 0.0001964514674798659, |
|
"loss": 0.0295, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"grad_norm": 0.24981548084974023, |
|
"learning_rate": 0.00019634378659306832, |
|
"loss": 0.0699, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.2086985689956262, |
|
"learning_rate": 0.00019623452664340306, |
|
"loss": 0.0496, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"grad_norm": 0.2043098976035677, |
|
"learning_rate": 0.0001961236894216272, |
|
"loss": 0.0407, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"grad_norm": 0.2730615592143012, |
|
"learning_rate": 0.00019601127674434928, |
|
"loss": 0.0631, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"grad_norm": 0.3501805095305464, |
|
"learning_rate": 0.00019589729045399934, |
|
"loss": 0.071, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.20729413014232384, |
|
"learning_rate": 0.00019578173241879872, |
|
"loss": 0.045, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"grad_norm": 0.15750376698735663, |
|
"learning_rate": 0.00019566460453272945, |
|
"loss": 0.0346, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"grad_norm": 0.3137680018534623, |
|
"learning_rate": 0.0001955459087155033, |
|
"loss": 0.0442, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"grad_norm": 0.24917821149174516, |
|
"learning_rate": 0.0001954256469125301, |
|
"loss": 0.0425, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.15986754420947197, |
|
"learning_rate": 0.0001953038210948861, |
|
"loss": 0.0468, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"grad_norm": 0.22286136536824913, |
|
"learning_rate": 0.00019518043325928157, |
|
"loss": 0.0506, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"grad_norm": 0.24914639465088984, |
|
"learning_rate": 0.00019505548542802804, |
|
"loss": 0.0706, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"grad_norm": 0.1752134277500243, |
|
"learning_rate": 0.00019492897964900512, |
|
"loss": 0.0468, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.1927795196511069, |
|
"learning_rate": 0.00019480091799562704, |
|
"loss": 0.0561, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"grad_norm": 0.1700835087697781, |
|
"learning_rate": 0.00019467130256680868, |
|
"loss": 0.0381, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"grad_norm": 0.2497630067036982, |
|
"learning_rate": 0.00019454013548693102, |
|
"loss": 0.0437, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"grad_norm": 0.18837543811238555, |
|
"learning_rate": 0.00019440741890580643, |
|
"loss": 0.0526, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.22518955186910106, |
|
"learning_rate": 0.00019427315499864344, |
|
"loss": 0.0424, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"grad_norm": 0.20500175172697058, |
|
"learning_rate": 0.00019413734596601104, |
|
"loss": 0.052, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"grad_norm": 0.12045773353666347, |
|
"learning_rate": 0.00019399999403380266, |
|
"loss": 0.0189, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"grad_norm": 0.2148224667509155, |
|
"learning_rate": 0.00019386110145319963, |
|
"loss": 0.0461, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.23216240017196932, |
|
"learning_rate": 0.00019372067050063438, |
|
"loss": 0.0628, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 7.0625, |
|
"grad_norm": 0.27496419377253983, |
|
"learning_rate": 0.000193578703477753, |
|
"loss": 0.0504, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"grad_norm": 0.18897397234008034, |
|
"learning_rate": 0.00019343520271137763, |
|
"loss": 0.0418, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"grad_norm": 0.17029862867910886, |
|
"learning_rate": 0.0001932901705534683, |
|
"loss": 0.0364, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.1963658278569445, |
|
"learning_rate": 0.00019314360938108425, |
|
"loss": 0.0495, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 7.3125, |
|
"grad_norm": 0.24460981520111755, |
|
"learning_rate": 0.00019299552159634517, |
|
"loss": 0.052, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"grad_norm": 0.2841837300999824, |
|
"learning_rate": 0.00019284590962639176, |
|
"loss": 0.0251, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 7.4375, |
|
"grad_norm": 0.28136757069268153, |
|
"learning_rate": 0.0001926947759233459, |
|
"loss": 0.0583, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.14123250014906416, |
|
"learning_rate": 0.00019254212296427044, |
|
"loss": 0.0247, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 7.5625, |
|
"grad_norm": 0.24291209764375074, |
|
"learning_rate": 0.0001923879532511287, |
|
"loss": 0.041, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"grad_norm": 0.16313867008770622, |
|
"learning_rate": 0.0001922322693107434, |
|
"loss": 0.0267, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 7.6875, |
|
"grad_norm": 0.20009496777109764, |
|
"learning_rate": 0.0001920750736947553, |
|
"loss": 0.0451, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.33391229054639826, |
|
"learning_rate": 0.00019191636897958122, |
|
"loss": 0.0659, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"grad_norm": 0.20026703724460174, |
|
"learning_rate": 0.0001917561577663721, |
|
"loss": 0.0309, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"grad_norm": 0.22455588585805783, |
|
"learning_rate": 0.00019159444268097012, |
|
"loss": 0.0396, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 7.9375, |
|
"grad_norm": 0.26799981180801946, |
|
"learning_rate": 0.00019143122637386566, |
|
"loss": 0.0541, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.19547469523965652, |
|
"learning_rate": 0.00019126651152015403, |
|
"loss": 0.0551, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 8.0625, |
|
"grad_norm": 0.1544644919932735, |
|
"learning_rate": 0.00019110030081949156, |
|
"loss": 0.025, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"grad_norm": 0.19630976778261258, |
|
"learning_rate": 0.00019093259699605125, |
|
"loss": 0.0301, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 8.1875, |
|
"grad_norm": 0.17570512618519246, |
|
"learning_rate": 0.0001907634027984782, |
|
"loss": 0.0284, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.15656419342775668, |
|
"learning_rate": 0.0001905927209998447, |
|
"loss": 0.0316, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 8.3125, |
|
"grad_norm": 0.20086086826122584, |
|
"learning_rate": 0.00019042055439760444, |
|
"loss": 0.0406, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 8.375, |
|
"grad_norm": 0.17929729489809332, |
|
"learning_rate": 0.000190246905813547, |
|
"loss": 0.0306, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"grad_norm": 0.17408633598619777, |
|
"learning_rate": 0.0001900717780937514, |
|
"loss": 0.0331, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.22245736981289757, |
|
"learning_rate": 0.00018989517410853955, |
|
"loss": 0.0296, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 8.5625, |
|
"grad_norm": 0.15106374249798415, |
|
"learning_rate": 0.0001897170967524291, |
|
"loss": 0.0195, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 8.625, |
|
"grad_norm": 0.19583907014296853, |
|
"learning_rate": 0.00018953754894408616, |
|
"loss": 0.034, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 8.6875, |
|
"grad_norm": 0.2516341266285177, |
|
"learning_rate": 0.0001893565336262773, |
|
"loss": 0.0397, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.21858233256386844, |
|
"learning_rate": 0.00018917405376582145, |
|
"loss": 0.0413, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 8.8125, |
|
"grad_norm": 0.2198644417350149, |
|
"learning_rate": 0.00018899011235354115, |
|
"loss": 0.037, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 8.875, |
|
"grad_norm": 0.14391410038600197, |
|
"learning_rate": 0.00018880471240421365, |
|
"loss": 0.0243, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 8.9375, |
|
"grad_norm": 0.19142537938933432, |
|
"learning_rate": 0.00018861785695652142, |
|
"loss": 0.0378, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.273105976253114, |
|
"learning_rate": 0.00018842954907300236, |
|
"loss": 0.0335, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"grad_norm": 0.1764202905014519, |
|
"learning_rate": 0.00018823979183999964, |
|
"loss": 0.0269, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 9.125, |
|
"grad_norm": 0.16768199512274265, |
|
"learning_rate": 0.00018804858836761107, |
|
"loss": 0.0274, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 9.1875, |
|
"grad_norm": 0.11993797625263519, |
|
"learning_rate": 0.0001878559417896382, |
|
"loss": 0.0197, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.1113638157849448, |
|
"learning_rate": 0.0001876618552635348, |
|
"loss": 0.0144, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 9.3125, |
|
"grad_norm": 0.404272192796613, |
|
"learning_rate": 0.00018746633197035527, |
|
"loss": 0.0623, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"grad_norm": 0.21941870781755787, |
|
"learning_rate": 0.00018726937511470246, |
|
"loss": 0.0353, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 9.4375, |
|
"grad_norm": 0.15013935972184475, |
|
"learning_rate": 0.00018707098792467515, |
|
"loss": 0.0212, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.15110805919842035, |
|
"learning_rate": 0.00018687117365181512, |
|
"loss": 0.0218, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 9.5625, |
|
"grad_norm": 0.2279637543691053, |
|
"learning_rate": 0.00018666993557105377, |
|
"loss": 0.0384, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 9.625, |
|
"grad_norm": 0.22049558442795594, |
|
"learning_rate": 0.00018646727698065865, |
|
"loss": 0.0386, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"grad_norm": 0.19273043393336428, |
|
"learning_rate": 0.00018626320120217923, |
|
"loss": 0.0261, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 0.28617400835044193, |
|
"learning_rate": 0.00018605771158039253, |
|
"loss": 0.0366, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 9.8125, |
|
"grad_norm": 0.18709984153605747, |
|
"learning_rate": 0.00018585081148324832, |
|
"loss": 0.0291, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 9.875, |
|
"grad_norm": 0.1292782602493134, |
|
"learning_rate": 0.00018564250430181387, |
|
"loss": 0.0199, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 9.9375, |
|
"grad_norm": 0.13979049475637031, |
|
"learning_rate": 0.00018543279345021834, |
|
"loss": 0.0157, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.19965128862936724, |
|
"learning_rate": 0.00018522168236559695, |
|
"loss": 0.0323, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 10.0625, |
|
"grad_norm": 0.15854406411462987, |
|
"learning_rate": 0.0001850091745080345, |
|
"loss": 0.029, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 10.125, |
|
"grad_norm": 0.21210803120758442, |
|
"learning_rate": 0.00018479527336050878, |
|
"loss": 0.0275, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 10.1875, |
|
"grad_norm": 0.14570318547786973, |
|
"learning_rate": 0.00018457998242883344, |
|
"loss": 0.0198, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 10.25, |
|
"grad_norm": 0.1365650856121692, |
|
"learning_rate": 0.00018436330524160047, |
|
"loss": 0.0187, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 10.3125, |
|
"grad_norm": 0.14366514107884812, |
|
"learning_rate": 0.00018414524535012244, |
|
"loss": 0.0201, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 10.375, |
|
"grad_norm": 0.11689977724032004, |
|
"learning_rate": 0.00018392580632837423, |
|
"loss": 0.0127, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 10.4375, |
|
"grad_norm": 0.14547947591736377, |
|
"learning_rate": 0.00018370499177293464, |
|
"loss": 0.021, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 10.5, |
|
"grad_norm": 0.13962090636283936, |
|
"learning_rate": 0.00018348280530292713, |
|
"loss": 0.0198, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 10.5625, |
|
"grad_norm": 0.16513279241014592, |
|
"learning_rate": 0.00018325925055996076, |
|
"loss": 0.0292, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 10.625, |
|
"grad_norm": 0.09476674044103778, |
|
"learning_rate": 0.0001830343312080704, |
|
"loss": 0.0163, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 10.6875, |
|
"grad_norm": 0.2189319278786043, |
|
"learning_rate": 0.00018280805093365672, |
|
"loss": 0.0267, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 10.75, |
|
"grad_norm": 0.11950416301901717, |
|
"learning_rate": 0.00018258041344542566, |
|
"loss": 0.0162, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 10.8125, |
|
"grad_norm": 0.21271468962636456, |
|
"learning_rate": 0.00018235142247432782, |
|
"loss": 0.0341, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 10.875, |
|
"grad_norm": 0.26291334746224676, |
|
"learning_rate": 0.0001821210817734972, |
|
"loss": 0.0183, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 10.9375, |
|
"grad_norm": 0.7541232207929679, |
|
"learning_rate": 0.00018188939511818965, |
|
"loss": 0.0341, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.2668657582109521, |
|
"learning_rate": 0.0001816563663057211, |
|
"loss": 0.0248, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 11.0625, |
|
"grad_norm": 0.188760844502015, |
|
"learning_rate": 0.00018142199915540527, |
|
"loss": 0.0167, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 11.125, |
|
"grad_norm": 0.13049188810156132, |
|
"learning_rate": 0.00018118629750849105, |
|
"loss": 0.0111, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 11.1875, |
|
"grad_norm": 0.12147377234191868, |
|
"learning_rate": 0.0001809492652280996, |
|
"loss": 0.0132, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 11.25, |
|
"grad_norm": 0.1561909420624479, |
|
"learning_rate": 0.00018071090619916093, |
|
"loss": 0.017, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 11.3125, |
|
"grad_norm": 0.2761203872951007, |
|
"learning_rate": 0.00018047122432835038, |
|
"loss": 0.0242, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 11.375, |
|
"grad_norm": 0.13770345739451892, |
|
"learning_rate": 0.0001802302235440245, |
|
"loss": 0.0168, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 11.4375, |
|
"grad_norm": 0.18722163720923593, |
|
"learning_rate": 0.0001799879077961566, |
|
"loss": 0.0301, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 11.5, |
|
"grad_norm": 0.16157498504579615, |
|
"learning_rate": 0.00017974428105627208, |
|
"loss": 0.0188, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 11.5625, |
|
"grad_norm": 0.11914193260548754, |
|
"learning_rate": 0.00017949934731738347, |
|
"loss": 0.0167, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 11.625, |
|
"grad_norm": 0.21366231459710572, |
|
"learning_rate": 0.0001792531105939247, |
|
"loss": 0.0307, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 11.6875, |
|
"grad_norm": 0.12591549609993827, |
|
"learning_rate": 0.0001790055749216856, |
|
"loss": 0.0156, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 11.75, |
|
"grad_norm": 0.17099000320728966, |
|
"learning_rate": 0.00017875674435774547, |
|
"loss": 0.0213, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 11.8125, |
|
"grad_norm": 0.21223314589021008, |
|
"learning_rate": 0.00017850662298040678, |
|
"loss": 0.0303, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 11.875, |
|
"grad_norm": 0.18967657804115237, |
|
"learning_rate": 0.0001782552148891283, |
|
"loss": 0.0147, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 11.9375, |
|
"grad_norm": 0.2411319114734683, |
|
"learning_rate": 0.00017800252420445788, |
|
"loss": 0.0356, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.19592901070523805, |
|
"learning_rate": 0.00017774855506796496, |
|
"loss": 0.0361, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 12.0625, |
|
"grad_norm": 0.1922774431645892, |
|
"learning_rate": 0.0001774933116421725, |
|
"loss": 0.0163, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 12.125, |
|
"grad_norm": 0.1336099627438813, |
|
"learning_rate": 0.00017723679811048904, |
|
"loss": 0.016, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 12.1875, |
|
"grad_norm": 0.11902506990463049, |
|
"learning_rate": 0.00017697901867713995, |
|
"loss": 0.0128, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 12.25, |
|
"grad_norm": 0.15265650075862036, |
|
"learning_rate": 0.00017671997756709863, |
|
"loss": 0.0158, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 12.3125, |
|
"grad_norm": 0.10409493403901945, |
|
"learning_rate": 0.0001764596790260171, |
|
"loss": 0.01, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 12.375, |
|
"grad_norm": 0.12592944148803245, |
|
"learning_rate": 0.00017619812732015664, |
|
"loss": 0.0081, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 12.4375, |
|
"grad_norm": 0.15892735182879292, |
|
"learning_rate": 0.00017593532673631766, |
|
"loss": 0.0226, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"grad_norm": 0.16370083252883327, |
|
"learning_rate": 0.00017567128158176953, |
|
"loss": 0.0156, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 12.5625, |
|
"grad_norm": 0.1926319403468188, |
|
"learning_rate": 0.00017540599618418007, |
|
"loss": 0.0198, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 12.625, |
|
"grad_norm": 0.1361218264727559, |
|
"learning_rate": 0.00017513947489154443, |
|
"loss": 0.014, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 12.6875, |
|
"grad_norm": 0.15346622202020466, |
|
"learning_rate": 0.00017487172207211396, |
|
"loss": 0.0149, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 12.75, |
|
"grad_norm": 0.1582659958129119, |
|
"learning_rate": 0.0001746027421143246, |
|
"loss": 0.0209, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 12.8125, |
|
"grad_norm": 0.08422642994909509, |
|
"learning_rate": 0.00017433253942672496, |
|
"loss": 0.0107, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 12.875, |
|
"grad_norm": 0.14151191719126865, |
|
"learning_rate": 0.000174061118437904, |
|
"loss": 0.0163, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 12.9375, |
|
"grad_norm": 0.21170441141264745, |
|
"learning_rate": 0.00017378848359641847, |
|
"loss": 0.0248, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 0.21854641357858184, |
|
"learning_rate": 0.00017351463937072004, |
|
"loss": 0.0314, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 13.0625, |
|
"grad_norm": 0.1460880269124164, |
|
"learning_rate": 0.00017323959024908209, |
|
"loss": 0.01, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 13.125, |
|
"grad_norm": 0.2314775735978235, |
|
"learning_rate": 0.00017296334073952605, |
|
"loss": 0.0158, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 13.1875, |
|
"grad_norm": 0.25609276829659133, |
|
"learning_rate": 0.0001726858953697475, |
|
"loss": 0.0228, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 13.25, |
|
"grad_norm": 0.05892147625450794, |
|
"learning_rate": 0.00017240725868704218, |
|
"loss": 0.0054, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 13.3125, |
|
"grad_norm": 0.09265829411728602, |
|
"learning_rate": 0.00017212743525823112, |
|
"loss": 0.0121, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 13.375, |
|
"grad_norm": 0.28766545739529126, |
|
"learning_rate": 0.0001718464296695861, |
|
"loss": 0.0166, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 13.4375, |
|
"grad_norm": 0.22056507132944547, |
|
"learning_rate": 0.0001715642465267543, |
|
"loss": 0.0189, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"grad_norm": 0.20425585831942672, |
|
"learning_rate": 0.00017128089045468294, |
|
"loss": 0.0226, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 13.5625, |
|
"grad_norm": 0.08920463828133202, |
|
"learning_rate": 0.00017099636609754329, |
|
"loss": 0.0111, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 13.625, |
|
"grad_norm": 0.07587770729105793, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 0.0107, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 13.6875, |
|
"grad_norm": 0.2436581509018354, |
|
"learning_rate": 0.00017042383120040834, |
|
"loss": 0.0122, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 13.75, |
|
"grad_norm": 0.2776351805516073, |
|
"learning_rate": 0.00017013583004418993, |
|
"loss": 0.0338, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 13.8125, |
|
"grad_norm": 0.1500695552486999, |
|
"learning_rate": 0.00016984667937030318, |
|
"loss": 0.0128, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 13.875, |
|
"grad_norm": 0.19022226617986523, |
|
"learning_rate": 0.00016955638391789228, |
|
"loss": 0.0182, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 13.9375, |
|
"grad_norm": 0.07668703300214032, |
|
"learning_rate": 0.00016926494844486412, |
|
"loss": 0.0081, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.09791844205901491, |
|
"learning_rate": 0.00016897237772781044, |
|
"loss": 0.0102, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 14.0625, |
|
"grad_norm": 0.13334638868212356, |
|
"learning_rate": 0.00016867867656192946, |
|
"loss": 0.0154, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 14.125, |
|
"grad_norm": 0.0443847074815828, |
|
"learning_rate": 0.00016838384976094738, |
|
"loss": 0.0038, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 14.1875, |
|
"grad_norm": 0.22900928937804826, |
|
"learning_rate": 0.00016808790215703935, |
|
"loss": 0.0146, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 0.09820050963868261, |
|
"learning_rate": 0.00016779083860075033, |
|
"loss": 0.0139, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 14.3125, |
|
"grad_norm": 0.12261110863547245, |
|
"learning_rate": 0.0001674926639609157, |
|
"loss": 0.0081, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 14.375, |
|
"grad_norm": 0.4737769261848427, |
|
"learning_rate": 0.00016719338312458124, |
|
"loss": 0.0196, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 14.4375, |
|
"grad_norm": 0.0719988039842083, |
|
"learning_rate": 0.00016689300099692332, |
|
"loss": 0.0075, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 14.5, |
|
"grad_norm": 0.24824987963252254, |
|
"learning_rate": 0.00016659152250116812, |
|
"loss": 0.0095, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 14.5625, |
|
"grad_norm": 0.1806409419450783, |
|
"learning_rate": 0.00016628895257851135, |
|
"loss": 0.0177, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 14.625, |
|
"grad_norm": 0.12022842892878163, |
|
"learning_rate": 0.000165985296188037, |
|
"loss": 0.0108, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 14.6875, |
|
"grad_norm": 0.27573651025583323, |
|
"learning_rate": 0.0001656805583066361, |
|
"loss": 0.0408, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 14.75, |
|
"grad_norm": 0.13027830763516066, |
|
"learning_rate": 0.00016537474392892528, |
|
"loss": 0.0164, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 14.8125, |
|
"grad_norm": 0.2190711849455461, |
|
"learning_rate": 0.00016506785806716465, |
|
"loss": 0.0381, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 14.875, |
|
"grad_norm": 0.18889153886713622, |
|
"learning_rate": 0.00016475990575117605, |
|
"loss": 0.0137, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 14.9375, |
|
"grad_norm": 0.17427492795979294, |
|
"learning_rate": 0.0001644508920282601, |
|
"loss": 0.0259, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.13217744356726124, |
|
"learning_rate": 0.000164140821963114, |
|
"loss": 0.0099, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 15.0625, |
|
"grad_norm": 0.08062959617570911, |
|
"learning_rate": 0.0001638297006377481, |
|
"loss": 0.0065, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 15.125, |
|
"grad_norm": 0.11874477325134665, |
|
"learning_rate": 0.00016351753315140287, |
|
"loss": 0.0132, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 15.1875, |
|
"grad_norm": 0.08293002909973335, |
|
"learning_rate": 0.00016320432462046516, |
|
"loss": 0.0093, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 15.25, |
|
"grad_norm": 0.08700808300439221, |
|
"learning_rate": 0.00016289008017838445, |
|
"loss": 0.0077, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 15.3125, |
|
"grad_norm": 0.12073296585647869, |
|
"learning_rate": 0.00016257480497558873, |
|
"loss": 0.0096, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 15.375, |
|
"grad_norm": 0.07654303999452532, |
|
"learning_rate": 0.0001622585041793999, |
|
"loss": 0.0059, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 15.4375, |
|
"grad_norm": 0.2562520960634689, |
|
"learning_rate": 0.00016194118297394936, |
|
"loss": 0.0263, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 15.5, |
|
"grad_norm": 0.08068310167444095, |
|
"learning_rate": 0.00016162284656009274, |
|
"loss": 0.0062, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 15.5625, |
|
"grad_norm": 0.2090301776269612, |
|
"learning_rate": 0.00016130350015532496, |
|
"loss": 0.0201, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 15.625, |
|
"grad_norm": 0.18851005491544473, |
|
"learning_rate": 0.00016098314899369446, |
|
"loss": 0.0129, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 15.6875, |
|
"grad_norm": 0.10484133416084232, |
|
"learning_rate": 0.0001606617983257176, |
|
"loss": 0.0058, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 15.75, |
|
"grad_norm": 0.17883267452407117, |
|
"learning_rate": 0.00016033945341829248, |
|
"loss": 0.0194, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 15.8125, |
|
"grad_norm": 0.15051127763163427, |
|
"learning_rate": 0.00016001611955461265, |
|
"loss": 0.011, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 15.875, |
|
"grad_norm": 0.09161429352244004, |
|
"learning_rate": 0.0001596918020340805, |
|
"loss": 0.0045, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 15.9375, |
|
"grad_norm": 0.11734353010884248, |
|
"learning_rate": 0.00015936650617222063, |
|
"loss": 0.007, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.1956323556889479, |
|
"learning_rate": 0.00015904023730059228, |
|
"loss": 0.0165, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 16.0625, |
|
"grad_norm": 0.14747898179025032, |
|
"learning_rate": 0.00015871300076670234, |
|
"loss": 0.0146, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 16.125, |
|
"grad_norm": 0.18498598991778836, |
|
"learning_rate": 0.00015838480193391754, |
|
"loss": 0.0102, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 16.1875, |
|
"grad_norm": 0.2419734729440462, |
|
"learning_rate": 0.0001580556461813766, |
|
"loss": 0.02, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 16.25, |
|
"grad_norm": 0.13549389704091608, |
|
"learning_rate": 0.00015772553890390197, |
|
"loss": 0.0096, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 16.3125, |
|
"grad_norm": 0.09488023406511628, |
|
"learning_rate": 0.0001573944855119115, |
|
"loss": 0.0142, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 16.375, |
|
"grad_norm": 0.08589054588641899, |
|
"learning_rate": 0.00015706249143132982, |
|
"loss": 0.0086, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 16.4375, |
|
"grad_norm": 0.18599116361768675, |
|
"learning_rate": 0.00015672956210349923, |
|
"loss": 0.0158, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 16.5, |
|
"grad_norm": 0.13790931505797482, |
|
"learning_rate": 0.00015639570298509064, |
|
"loss": 0.0076, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 16.5625, |
|
"grad_norm": 0.14746745744271494, |
|
"learning_rate": 0.0001560609195480142, |
|
"loss": 0.0144, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 16.625, |
|
"grad_norm": 0.1747110630319595, |
|
"learning_rate": 0.00015572521727932935, |
|
"loss": 0.0209, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 16.6875, |
|
"grad_norm": 0.12294368607669189, |
|
"learning_rate": 0.00015538860168115527, |
|
"loss": 0.0076, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 16.75, |
|
"grad_norm": 0.20591709989710746, |
|
"learning_rate": 0.00015505107827058036, |
|
"loss": 0.0109, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 16.8125, |
|
"grad_norm": 0.12788905057548938, |
|
"learning_rate": 0.00015471265257957202, |
|
"loss": 0.0137, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 16.875, |
|
"grad_norm": 0.0907643332122881, |
|
"learning_rate": 0.00015437333015488587, |
|
"loss": 0.004, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 16.9375, |
|
"grad_norm": 0.11598695941727767, |
|
"learning_rate": 0.00015403311655797492, |
|
"loss": 0.0173, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.1274596211332466, |
|
"learning_rate": 0.0001536920173648984, |
|
"loss": 0.0137, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 17.0625, |
|
"grad_norm": 0.11710054449245828, |
|
"learning_rate": 0.00015335003816623028, |
|
"loss": 0.0114, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 17.125, |
|
"grad_norm": 0.10810173543943251, |
|
"learning_rate": 0.00015300718456696778, |
|
"loss": 0.0083, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 17.1875, |
|
"grad_norm": 0.11490194551290545, |
|
"learning_rate": 0.00015266346218643947, |
|
"loss": 0.0104, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 17.25, |
|
"grad_norm": 0.10496790427956344, |
|
"learning_rate": 0.000152318876658213, |
|
"loss": 0.0117, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 17.3125, |
|
"grad_norm": 0.13303819260520472, |
|
"learning_rate": 0.00015197343363000307, |
|
"loss": 0.0172, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 17.375, |
|
"grad_norm": 0.07976187691489159, |
|
"learning_rate": 0.00015162713876357858, |
|
"loss": 0.0118, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 17.4375, |
|
"grad_norm": 0.020200494852139077, |
|
"learning_rate": 0.00015127999773467002, |
|
"loss": 0.0018, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 17.5, |
|
"grad_norm": 0.08302979036485983, |
|
"learning_rate": 0.00015093201623287631, |
|
"loss": 0.0074, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 17.5625, |
|
"grad_norm": 0.09709353670094575, |
|
"learning_rate": 0.00015058319996157172, |
|
"loss": 0.0141, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 17.625, |
|
"grad_norm": 0.10500205069763988, |
|
"learning_rate": 0.0001502335546378122, |
|
"loss": 0.0082, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 17.6875, |
|
"grad_norm": 0.045653846262314286, |
|
"learning_rate": 0.00014988308599224183, |
|
"loss": 0.0037, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"grad_norm": 0.07350542815715672, |
|
"learning_rate": 0.00014953179976899878, |
|
"loss": 0.007, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 17.8125, |
|
"grad_norm": 0.0740842955766792, |
|
"learning_rate": 0.0001491797017256212, |
|
"loss": 0.0041, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 17.875, |
|
"grad_norm": 0.11610913098575787, |
|
"learning_rate": 0.00014882679763295306, |
|
"loss": 0.0177, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 17.9375, |
|
"grad_norm": 0.11006673147506568, |
|
"learning_rate": 0.0001484730932750491, |
|
"loss": 0.0124, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.17310654116013702, |
|
"learning_rate": 0.00014811859444908052, |
|
"loss": 0.0174, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 18.0625, |
|
"grad_norm": 0.05397642477813828, |
|
"learning_rate": 0.00014776330696523963, |
|
"loss": 0.0047, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 18.125, |
|
"grad_norm": 0.07510434928834142, |
|
"learning_rate": 0.00014740723664664483, |
|
"loss": 0.0084, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 18.1875, |
|
"grad_norm": 0.15571308024563857, |
|
"learning_rate": 0.00014705038932924503, |
|
"loss": 0.0061, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 18.25, |
|
"grad_norm": 0.04276361043100037, |
|
"learning_rate": 0.00014669277086172406, |
|
"loss": 0.0052, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 18.3125, |
|
"grad_norm": 0.10566723422506075, |
|
"learning_rate": 0.00014633438710540489, |
|
"loss": 0.0095, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 18.375, |
|
"grad_norm": 0.12332021633992264, |
|
"learning_rate": 0.00014597524393415335, |
|
"loss": 0.0125, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 18.4375, |
|
"grad_norm": 0.06085873636051086, |
|
"learning_rate": 0.00014561534723428205, |
|
"loss": 0.0036, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"grad_norm": 0.08889363982250575, |
|
"learning_rate": 0.00014525470290445392, |
|
"loss": 0.0062, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 18.5625, |
|
"grad_norm": 0.03473335852451447, |
|
"learning_rate": 0.00014489331685558525, |
|
"loss": 0.0026, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 18.625, |
|
"grad_norm": 0.08852650667058536, |
|
"learning_rate": 0.00014453119501074924, |
|
"loss": 0.011, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 18.6875, |
|
"grad_norm": 0.03288743535270982, |
|
"learning_rate": 0.00014416834330507856, |
|
"loss": 0.0031, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 18.75, |
|
"grad_norm": 0.16831306636699117, |
|
"learning_rate": 0.00014380476768566824, |
|
"loss": 0.0142, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 18.8125, |
|
"grad_norm": 0.060653415545579646, |
|
"learning_rate": 0.00014344047411147818, |
|
"loss": 0.0084, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 18.875, |
|
"grad_norm": 0.0379853861449074, |
|
"learning_rate": 0.00014307546855323549, |
|
"loss": 0.0022, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 18.9375, |
|
"grad_norm": 0.09664333466138222, |
|
"learning_rate": 0.00014270975699333654, |
|
"loss": 0.0057, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.09719048721891094, |
|
"learning_rate": 0.00014234334542574906, |
|
"loss": 0.0092, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 19.0625, |
|
"grad_norm": 0.04572304214406104, |
|
"learning_rate": 0.00014197623985591373, |
|
"loss": 0.0018, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 19.125, |
|
"grad_norm": 0.05097257049081179, |
|
"learning_rate": 0.00014160844630064595, |
|
"loss": 0.0057, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 19.1875, |
|
"grad_norm": 0.030553512605281084, |
|
"learning_rate": 0.00014123997078803707, |
|
"loss": 0.0026, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"grad_norm": 0.05445407477912121, |
|
"learning_rate": 0.00014087081935735564, |
|
"loss": 0.0057, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 19.3125, |
|
"grad_norm": 0.06371575221400023, |
|
"learning_rate": 0.00014050099805894837, |
|
"loss": 0.0084, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 19.375, |
|
"grad_norm": 0.06261401628069665, |
|
"learning_rate": 0.00014013051295414108, |
|
"loss": 0.0081, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 19.4375, |
|
"grad_norm": 0.035834190161274884, |
|
"learning_rate": 0.00013975937011513932, |
|
"loss": 0.0045, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 19.5, |
|
"grad_norm": 0.0524760162484543, |
|
"learning_rate": 0.00013938757562492873, |
|
"loss": 0.0062, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 19.5625, |
|
"grad_norm": 0.10667039377919106, |
|
"learning_rate": 0.00013901513557717553, |
|
"loss": 0.0041, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 19.625, |
|
"grad_norm": 0.05041143161792446, |
|
"learning_rate": 0.00013864205607612648, |
|
"loss": 0.0052, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 19.6875, |
|
"grad_norm": 0.0424817284436791, |
|
"learning_rate": 0.000138268343236509, |
|
"loss": 0.0055, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 19.75, |
|
"grad_norm": 0.05668813950953166, |
|
"learning_rate": 0.00013789400318343068, |
|
"loss": 0.0071, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 19.8125, |
|
"grad_norm": 0.028939867545089322, |
|
"learning_rate": 0.0001375190420522792, |
|
"loss": 0.003, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 19.875, |
|
"grad_norm": 0.06412293116062714, |
|
"learning_rate": 0.00013714346598862166, |
|
"loss": 0.0067, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 19.9375, |
|
"grad_norm": 0.06349552195339572, |
|
"learning_rate": 0.00013676728114810367, |
|
"loss": 0.0068, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.09285652351669382, |
|
"learning_rate": 0.00013639049369634876, |
|
"loss": 0.0108, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 20.0625, |
|
"grad_norm": 0.038426113694616765, |
|
"learning_rate": 0.00013601310980885714, |
|
"loss": 0.0039, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 20.125, |
|
"grad_norm": 0.03317557018896503, |
|
"learning_rate": 0.0001356351356709045, |
|
"loss": 0.003, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 20.1875, |
|
"grad_norm": 0.059248532620137406, |
|
"learning_rate": 0.00013525657747744072, |
|
"loss": 0.0059, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 20.25, |
|
"grad_norm": 0.05514984219384242, |
|
"learning_rate": 0.00013487744143298822, |
|
"loss": 0.004, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 20.3125, |
|
"grad_norm": 0.041791984793344325, |
|
"learning_rate": 0.0001344977337515404, |
|
"loss": 0.0039, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 20.375, |
|
"grad_norm": 0.06346255904291057, |
|
"learning_rate": 0.0001341174606564596, |
|
"loss": 0.0078, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 20.4375, |
|
"grad_norm": 0.029692961959192875, |
|
"learning_rate": 0.00013373662838037537, |
|
"loss": 0.0027, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 20.5, |
|
"grad_norm": 0.0481477761395951, |
|
"learning_rate": 0.00013335524316508208, |
|
"loss": 0.0072, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 20.5625, |
|
"grad_norm": 0.07552298978231338, |
|
"learning_rate": 0.00013297331126143667, |
|
"loss": 0.0042, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 20.625, |
|
"grad_norm": 0.03033536526572307, |
|
"learning_rate": 0.00013259083892925633, |
|
"loss": 0.0018, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 20.6875, |
|
"grad_norm": 0.020436451514801952, |
|
"learning_rate": 0.00013220783243721572, |
|
"loss": 0.0018, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 20.75, |
|
"grad_norm": 0.05066849555968109, |
|
"learning_rate": 0.0001318242980627444, |
|
"loss": 0.0068, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 20.8125, |
|
"grad_norm": 0.062213499192457326, |
|
"learning_rate": 0.0001314402420919238, |
|
"loss": 0.0071, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 20.875, |
|
"grad_norm": 0.0602651350512265, |
|
"learning_rate": 0.00013105567081938424, |
|
"loss": 0.0057, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 20.9375, |
|
"grad_norm": 0.07142806102643208, |
|
"learning_rate": 0.00013067059054820183, |
|
"loss": 0.011, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 0.11306410649612093, |
|
"learning_rate": 0.00013028500758979506, |
|
"loss": 0.0061, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 21.0625, |
|
"grad_norm": 0.04077734941537789, |
|
"learning_rate": 0.00012989892826382145, |
|
"loss": 0.0047, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 21.125, |
|
"grad_norm": 0.025438845626494507, |
|
"learning_rate": 0.00012951235889807386, |
|
"loss": 0.0024, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 21.1875, |
|
"grad_norm": 0.04068069801179701, |
|
"learning_rate": 0.00012912530582837682, |
|
"loss": 0.0057, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 21.25, |
|
"grad_norm": 0.06229646373645838, |
|
"learning_rate": 0.00012873777539848283, |
|
"loss": 0.0058, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 21.3125, |
|
"grad_norm": 0.05047183477805069, |
|
"learning_rate": 0.00012834977395996818, |
|
"loss": 0.0073, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 21.375, |
|
"grad_norm": 0.03503202770379194, |
|
"learning_rate": 0.0001279613078721289, |
|
"loss": 0.0035, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 21.4375, |
|
"grad_norm": 0.02025118966232832, |
|
"learning_rate": 0.0001275723835018767, |
|
"loss": 0.0011, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 21.5, |
|
"grad_norm": 0.04264241354679371, |
|
"learning_rate": 0.0001271830072236343, |
|
"loss": 0.0048, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 21.5625, |
|
"grad_norm": 0.059381097002962034, |
|
"learning_rate": 0.0001267931854192313, |
|
"loss": 0.0065, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 21.625, |
|
"grad_norm": 0.03970878770263772, |
|
"learning_rate": 0.0001264029244777993, |
|
"loss": 0.0052, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 21.6875, |
|
"grad_norm": 0.04854204024259956, |
|
"learning_rate": 0.00012601223079566743, |
|
"loss": 0.0036, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 21.75, |
|
"grad_norm": 0.06683236405906955, |
|
"learning_rate": 0.00012562111077625722, |
|
"loss": 0.0078, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 21.8125, |
|
"grad_norm": 0.030148690304593224, |
|
"learning_rate": 0.000125229570829978, |
|
"loss": 0.0044, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 21.875, |
|
"grad_norm": 0.043501391695934324, |
|
"learning_rate": 0.0001248376173741215, |
|
"loss": 0.0044, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 21.9375, |
|
"grad_norm": 0.03558605264264846, |
|
"learning_rate": 0.00012444525683275688, |
|
"loss": 0.0045, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.04991180242909837, |
|
"learning_rate": 0.00012405249563662537, |
|
"loss": 0.0067, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 22.0625, |
|
"grad_norm": 0.037891384595762335, |
|
"learning_rate": 0.00012365934022303491, |
|
"loss": 0.006, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 22.125, |
|
"grad_norm": 0.029393462316367327, |
|
"learning_rate": 0.00012326579703575462, |
|
"loss": 0.0059, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 22.1875, |
|
"grad_norm": 0.02302636254784467, |
|
"learning_rate": 0.00012287187252490913, |
|
"loss": 0.002, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 22.25, |
|
"grad_norm": 0.027267824953848163, |
|
"learning_rate": 0.00012247757314687297, |
|
"loss": 0.004, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 22.3125, |
|
"grad_norm": 0.018193429790619855, |
|
"learning_rate": 0.00012208290536416463, |
|
"loss": 0.0016, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 22.375, |
|
"grad_norm": 0.03843454617816289, |
|
"learning_rate": 0.00012168787564534078, |
|
"loss": 0.0044, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 22.4375, |
|
"grad_norm": 0.033400388055823266, |
|
"learning_rate": 0.0001212924904648902, |
|
"loss": 0.0047, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 22.5, |
|
"grad_norm": 0.026909120960616175, |
|
"learning_rate": 0.00012089675630312754, |
|
"loss": 0.0039, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 22.5625, |
|
"grad_norm": 0.023045776166397722, |
|
"learning_rate": 0.00012050067964608724, |
|
"loss": 0.0022, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 22.625, |
|
"grad_norm": 0.03806155810130501, |
|
"learning_rate": 0.00012010426698541728, |
|
"loss": 0.0052, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 22.6875, |
|
"grad_norm": 0.018336649301123106, |
|
"learning_rate": 0.0001197075248182726, |
|
"loss": 0.0015, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 22.75, |
|
"grad_norm": 0.04461178752361949, |
|
"learning_rate": 0.00011931045964720881, |
|
"loss": 0.0049, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 22.8125, |
|
"grad_norm": 0.03716629926558807, |
|
"learning_rate": 0.00011891307798007536, |
|
"loss": 0.0051, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 22.875, |
|
"grad_norm": 0.10941698413421153, |
|
"learning_rate": 0.00011851538632990921, |
|
"loss": 0.0061, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 22.9375, |
|
"grad_norm": 0.031543122025977796, |
|
"learning_rate": 0.00011811739121482777, |
|
"loss": 0.0032, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.03807850455755241, |
|
"learning_rate": 0.0001177190991579223, |
|
"loss": 0.0054, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 23.0625, |
|
"grad_norm": 0.0691977966708894, |
|
"learning_rate": 0.00011732051668715081, |
|
"loss": 0.0077, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 23.125, |
|
"grad_norm": 0.04515689836142195, |
|
"learning_rate": 0.00011692165033523117, |
|
"loss": 0.0057, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 23.1875, |
|
"grad_norm": 0.03928887922524319, |
|
"learning_rate": 0.00011652250663953415, |
|
"loss": 0.0055, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 23.25, |
|
"grad_norm": 0.02118235648867606, |
|
"learning_rate": 0.00011612309214197599, |
|
"loss": 0.0019, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 23.3125, |
|
"grad_norm": 0.014975510606827147, |
|
"learning_rate": 0.00011572341338891144, |
|
"loss": 0.0013, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 23.375, |
|
"grad_norm": 0.026742366673437934, |
|
"learning_rate": 0.00011532347693102632, |
|
"loss": 0.002, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 23.4375, |
|
"grad_norm": 0.13222981379526266, |
|
"learning_rate": 0.00011492328932323022, |
|
"loss": 0.0065, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 23.5, |
|
"grad_norm": 0.030786902568145764, |
|
"learning_rate": 0.00011452285712454904, |
|
"loss": 0.0032, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 23.5625, |
|
"grad_norm": 0.04300178598048733, |
|
"learning_rate": 0.00011412218689801748, |
|
"loss": 0.0061, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 23.625, |
|
"grad_norm": 0.031601651854756344, |
|
"learning_rate": 0.00011372128521057155, |
|
"loss": 0.0037, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 23.6875, |
|
"grad_norm": 0.07785976030991328, |
|
"learning_rate": 0.00011332015863294076, |
|
"loss": 0.0051, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 23.75, |
|
"grad_norm": 0.02462026239901167, |
|
"learning_rate": 0.00011291881373954065, |
|
"loss": 0.0024, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 23.8125, |
|
"grad_norm": 0.015040874194919356, |
|
"learning_rate": 0.00011251725710836489, |
|
"loss": 0.0011, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 23.875, |
|
"grad_norm": 0.013797157919588528, |
|
"learning_rate": 0.00011211549532087749, |
|
"loss": 0.0012, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 23.9375, |
|
"grad_norm": 0.028167968760020725, |
|
"learning_rate": 0.00011171353496190498, |
|
"loss": 0.0032, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 0.05318582105382964, |
|
"learning_rate": 0.00011131138261952845, |
|
"loss": 0.0093, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 24.0625, |
|
"grad_norm": 0.027391464358218053, |
|
"learning_rate": 0.00011090904488497549, |
|
"loss": 0.0031, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 24.125, |
|
"grad_norm": 0.03229113172070876, |
|
"learning_rate": 0.0001105065283525124, |
|
"loss": 0.0037, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 24.1875, |
|
"grad_norm": 0.030127780485989034, |
|
"learning_rate": 0.00011010383961933581, |
|
"loss": 0.0049, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 24.25, |
|
"grad_norm": 0.013416894665755037, |
|
"learning_rate": 0.00010970098528546481, |
|
"loss": 0.0002, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 24.3125, |
|
"grad_norm": 0.013218671166503056, |
|
"learning_rate": 0.00010929797195363259, |
|
"loss": 0.0012, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 24.375, |
|
"grad_norm": 0.03798660609085839, |
|
"learning_rate": 0.0001088948062291783, |
|
"loss": 0.0066, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 24.4375, |
|
"grad_norm": 0.01875425458258293, |
|
"learning_rate": 0.00010849149471993882, |
|
"loss": 0.0014, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 24.5, |
|
"grad_norm": 0.03284107465970819, |
|
"learning_rate": 0.00010808804403614043, |
|
"loss": 0.0051, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 24.5625, |
|
"grad_norm": 0.0318856663182744, |
|
"learning_rate": 0.00010768446079029044, |
|
"loss": 0.0041, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 24.625, |
|
"grad_norm": 0.03886319257158687, |
|
"learning_rate": 0.0001072807515970688, |
|
"loss": 0.0055, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 24.6875, |
|
"grad_norm": 0.0240690438810162, |
|
"learning_rate": 0.00010687692307321984, |
|
"loss": 0.0026, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 24.75, |
|
"grad_norm": 0.03153859131907879, |
|
"learning_rate": 0.00010647298183744359, |
|
"loss": 0.0037, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 24.8125, |
|
"grad_norm": 0.028754869900780036, |
|
"learning_rate": 0.00010606893451028743, |
|
"loss": 0.0046, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 24.875, |
|
"grad_norm": 0.036835349609799346, |
|
"learning_rate": 0.00010566478771403763, |
|
"loss": 0.0059, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 24.9375, |
|
"grad_norm": 0.03742416571548899, |
|
"learning_rate": 0.00010526054807261067, |
|
"loss": 0.0057, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 0.019217319407847776, |
|
"learning_rate": 0.00010485622221144484, |
|
"loss": 0.0019, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 800, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 80, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4792343961600.0, |
|
"train_batch_size": 5, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|