|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 12.0, |
|
"eval_steps": 500, |
|
"global_step": 5892, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05091649694501019, |
|
"grad_norm": 2.806674003601074, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8959, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10183299389002037, |
|
"grad_norm": 3.7694294452667236, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5457, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15274949083503056, |
|
"grad_norm": 2.5580432415008545, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2675, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.20366598778004075, |
|
"grad_norm": 3.3819141387939453, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1332, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2545824847250509, |
|
"grad_norm": 2.542970895767212, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1962, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3054989816700611, |
|
"grad_norm": 3.139821767807007, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0642, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3564154786150713, |
|
"grad_norm": 2.8020615577697754, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1643, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4073319755600815, |
|
"grad_norm": 2.742553234100342, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0901, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.45824847250509165, |
|
"grad_norm": 2.8044979572296143, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1531, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5091649694501018, |
|
"grad_norm": 2.295903444290161, |
|
"learning_rate": 0.0002, |
|
"loss": 0.981, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.560081466395112, |
|
"grad_norm": 2.5899577140808105, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1367, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6109979633401222, |
|
"grad_norm": 3.2374327182769775, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0535, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6619144602851323, |
|
"grad_norm": 2.467207193374634, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1175, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.7128309572301426, |
|
"grad_norm": 2.5034849643707275, |
|
"learning_rate": 0.0002, |
|
"loss": 1.078, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7637474541751528, |
|
"grad_norm": 2.7688446044921875, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0662, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.814663951120163, |
|
"grad_norm": 2.9710495471954346, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0392, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8655804480651731, |
|
"grad_norm": 2.105386734008789, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0551, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.9164969450101833, |
|
"grad_norm": 2.467886209487915, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0188, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9674134419551935, |
|
"grad_norm": 2.3255183696746826, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0985, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.0183299389002036, |
|
"grad_norm": 1.9878566265106201, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9582, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0692464358452138, |
|
"grad_norm": 2.2030067443847656, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8562, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.120162932790224, |
|
"grad_norm": 2.0550765991210938, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8564, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.1710794297352343, |
|
"grad_norm": 2.8419840335845947, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8446, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.2219959266802445, |
|
"grad_norm": 1.9293944835662842, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8239, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2729124236252547, |
|
"grad_norm": 2.2307465076446533, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7875, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.3238289205702647, |
|
"grad_norm": 2.4380366802215576, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8686, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.374745417515275, |
|
"grad_norm": 2.5356967449188232, |
|
"learning_rate": 0.0002, |
|
"loss": 0.891, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.4256619144602851, |
|
"grad_norm": 1.957305669784546, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8923, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.4765784114052953, |
|
"grad_norm": 3.196012020111084, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8906, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.5274949083503055, |
|
"grad_norm": 2.050201654434204, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9081, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.5784114052953155, |
|
"grad_norm": 2.0173072814941406, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9078, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.629327902240326, |
|
"grad_norm": 2.157409906387329, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8954, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.680244399185336, |
|
"grad_norm": 3.089580535888672, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8553, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.7311608961303462, |
|
"grad_norm": 2.2709248065948486, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9085, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.7820773930753564, |
|
"grad_norm": 2.866403102874756, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8565, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.8329938900203666, |
|
"grad_norm": 2.305607795715332, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8674, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8839103869653768, |
|
"grad_norm": 2.287306070327759, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9006, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.9348268839103868, |
|
"grad_norm": 2.0112550258636475, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8583, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.9857433808553973, |
|
"grad_norm": 2.2255215644836426, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8373, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.0366598778004072, |
|
"grad_norm": 2.1194534301757812, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7602, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0875763747454177, |
|
"grad_norm": 1.874557614326477, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6402, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 2.1384928716904277, |
|
"grad_norm": 2.008828639984131, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6897, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.189409368635438, |
|
"grad_norm": 1.8525766134262085, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6611, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 2.240325865580448, |
|
"grad_norm": 2.184070348739624, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7043, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.291242362525458, |
|
"grad_norm": 1.9643105268478394, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6444, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 2.3421588594704685, |
|
"grad_norm": 2.2633492946624756, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6915, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.3930753564154785, |
|
"grad_norm": 2.3236186504364014, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6539, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 2.443991853360489, |
|
"grad_norm": 2.5817580223083496, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6922, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.494908350305499, |
|
"grad_norm": 2.4054062366485596, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6485, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 2.5458248472505094, |
|
"grad_norm": 2.730226516723633, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7163, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.5967413441955194, |
|
"grad_norm": 2.436521530151367, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6899, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 2.6476578411405294, |
|
"grad_norm": 2.4914846420288086, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7301, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.69857433808554, |
|
"grad_norm": 2.550816774368286, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6853, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 2.74949083503055, |
|
"grad_norm": 2.2780368328094482, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7165, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.8004073319755602, |
|
"grad_norm": 2.6065120697021484, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6841, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 2.8513238289205702, |
|
"grad_norm": 2.3689310550689697, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7647, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.9022403258655807, |
|
"grad_norm": 3.0321147441864014, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6813, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 2.9531568228105907, |
|
"grad_norm": 2.069146156311035, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7123, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.0040733197556007, |
|
"grad_norm": 2.1185224056243896, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7189, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 3.054989816700611, |
|
"grad_norm": 2.558671236038208, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5302, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.105906313645621, |
|
"grad_norm": 2.6581151485443115, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4756, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 3.1568228105906315, |
|
"grad_norm": 2.602369785308838, |
|
"learning_rate": 0.0002, |
|
"loss": 0.527, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.2077393075356415, |
|
"grad_norm": 2.8149938583374023, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5173, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 3.258655804480652, |
|
"grad_norm": 2.065443515777588, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5756, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.309572301425662, |
|
"grad_norm": 2.308039903640747, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5096, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 3.360488798370672, |
|
"grad_norm": 2.4268381595611572, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5538, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.4114052953156824, |
|
"grad_norm": 2.6498641967773438, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5189, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 3.4623217922606924, |
|
"grad_norm": 2.2553043365478516, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5786, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.513238289205703, |
|
"grad_norm": 2.8816471099853516, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5232, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 3.564154786150713, |
|
"grad_norm": 1.7681572437286377, |
|
"learning_rate": 0.0002, |
|
"loss": 0.569, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.6150712830957232, |
|
"grad_norm": 2.772834062576294, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5357, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 3.6659877800407332, |
|
"grad_norm": 2.4505577087402344, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5848, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.716904276985743, |
|
"grad_norm": 3.0437214374542236, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5571, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 3.7678207739307537, |
|
"grad_norm": 3.2281458377838135, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5956, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.8187372708757636, |
|
"grad_norm": 2.437544345855713, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5542, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 3.869653767820774, |
|
"grad_norm": 2.759650230407715, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5823, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.920570264765784, |
|
"grad_norm": 2.3260252475738525, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5724, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 3.9714867617107945, |
|
"grad_norm": 2.2468202114105225, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5884, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.022403258655804, |
|
"grad_norm": 2.221639394760132, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5266, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 4.0733197556008145, |
|
"grad_norm": 1.9102641344070435, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3866, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.124236252545825, |
|
"grad_norm": 2.9819774627685547, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4212, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 4.175152749490835, |
|
"grad_norm": 2.39497709274292, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4073, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.226069246435845, |
|
"grad_norm": 2.4051284790039062, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4328, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 4.276985743380855, |
|
"grad_norm": 2.677963972091675, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4294, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.327902240325866, |
|
"grad_norm": 2.484499216079712, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4386, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 4.378818737270876, |
|
"grad_norm": 3.8193187713623047, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4317, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.429735234215886, |
|
"grad_norm": 2.5229299068450928, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4597, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 4.480651731160896, |
|
"grad_norm": 2.6942062377929688, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4462, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.531568228105907, |
|
"grad_norm": 2.4558463096618652, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4645, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 4.582484725050916, |
|
"grad_norm": 2.276397466659546, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4721, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.633401221995927, |
|
"grad_norm": 2.844794750213623, |
|
"learning_rate": 0.0002, |
|
"loss": 0.477, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 4.684317718940937, |
|
"grad_norm": 2.6256089210510254, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4553, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.7352342158859475, |
|
"grad_norm": 2.599666118621826, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4753, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 4.786150712830957, |
|
"grad_norm": 2.470028877258301, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4433, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.8370672097759675, |
|
"grad_norm": 2.4316930770874023, |
|
"learning_rate": 0.0002, |
|
"loss": 0.493, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 4.887983706720978, |
|
"grad_norm": 2.4588210582733154, |
|
"learning_rate": 0.0002, |
|
"loss": 0.465, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.9389002036659875, |
|
"grad_norm": 2.5438883304595947, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4802, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 4.989816700610998, |
|
"grad_norm": 2.764341354370117, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4788, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 5.040733197556008, |
|
"grad_norm": 2.4877161979675293, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3577, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 5.091649694501018, |
|
"grad_norm": 2.5782573223114014, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3434, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 5.142566191446028, |
|
"grad_norm": 2.363449811935425, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3595, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 5.193482688391039, |
|
"grad_norm": 2.2839596271514893, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3566, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 5.244399185336049, |
|
"grad_norm": 2.3375349044799805, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3704, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 5.295315682281059, |
|
"grad_norm": 2.2514779567718506, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3586, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.346232179226069, |
|
"grad_norm": 2.687700033187866, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3797, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 5.39714867617108, |
|
"grad_norm": 2.498506784439087, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3761, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 5.44806517311609, |
|
"grad_norm": 2.632052183151245, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4052, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 5.4989816700611, |
|
"grad_norm": 2.7227554321289062, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3939, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.54989816700611, |
|
"grad_norm": 2.041354179382324, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3995, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 5.6008146639511205, |
|
"grad_norm": 4.209399223327637, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3882, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 5.65173116089613, |
|
"grad_norm": 1.9786779880523682, |
|
"learning_rate": 0.0002, |
|
"loss": 0.401, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 5.7026476578411405, |
|
"grad_norm": 3.278057098388672, |
|
"learning_rate": 0.0002, |
|
"loss": 0.391, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.753564154786151, |
|
"grad_norm": 2.4366769790649414, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4022, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 5.804480651731161, |
|
"grad_norm": 2.606545925140381, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3864, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 5.855397148676171, |
|
"grad_norm": 2.517683982849121, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4029, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 5.906313645621181, |
|
"grad_norm": 2.00010085105896, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3909, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.957230142566192, |
|
"grad_norm": 2.561739683151245, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3977, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 6.008146639511201, |
|
"grad_norm": 2.0152885913848877, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3885, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 6.059063136456212, |
|
"grad_norm": 2.1359171867370605, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3042, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 6.109979633401222, |
|
"grad_norm": 1.833449363708496, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2968, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 6.160896130346233, |
|
"grad_norm": 1.7801095247268677, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3064, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 6.211812627291242, |
|
"grad_norm": 2.25486159324646, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3364, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 6.262729124236253, |
|
"grad_norm": 2.518338680267334, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3269, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 6.313645621181263, |
|
"grad_norm": 1.4665368795394897, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3359, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.364562118126273, |
|
"grad_norm": 2.3365631103515625, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3233, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 6.415478615071283, |
|
"grad_norm": 2.406109094619751, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3409, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 6.4663951120162935, |
|
"grad_norm": 3.327075958251953, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3484, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 6.517311608961304, |
|
"grad_norm": 2.125439167022705, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3423, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.5682281059063135, |
|
"grad_norm": 2.0253591537475586, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3325, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 6.619144602851324, |
|
"grad_norm": 2.2506959438323975, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3592, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 6.670061099796334, |
|
"grad_norm": 3.3501861095428467, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3404, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 6.720977596741344, |
|
"grad_norm": 2.8377532958984375, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3683, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.771894093686354, |
|
"grad_norm": 2.3918981552124023, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3506, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 6.822810590631365, |
|
"grad_norm": 2.026571273803711, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3714, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 6.873727087576375, |
|
"grad_norm": 1.9378567934036255, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3529, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 6.924643584521385, |
|
"grad_norm": 2.5375149250030518, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3614, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.975560081466395, |
|
"grad_norm": 2.6719155311584473, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3543, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 7.026476578411406, |
|
"grad_norm": 1.9606503248214722, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3204, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 7.077393075356415, |
|
"grad_norm": 2.2155184745788574, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2657, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 7.128309572301426, |
|
"grad_norm": 2.116616725921631, |
|
"learning_rate": 0.0002, |
|
"loss": 0.29, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 7.179226069246436, |
|
"grad_norm": 1.7466342449188232, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2942, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 7.2301425661914465, |
|
"grad_norm": 2.126159906387329, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2951, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 7.281059063136456, |
|
"grad_norm": 1.983241081237793, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3043, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 7.3319755600814664, |
|
"grad_norm": 2.3558452129364014, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3011, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 7.382892057026477, |
|
"grad_norm": 2.587038516998291, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3116, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 7.433808553971486, |
|
"grad_norm": 2.0329151153564453, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3139, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 7.484725050916497, |
|
"grad_norm": 1.955492377281189, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2996, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 7.535641547861507, |
|
"grad_norm": 1.8513798713684082, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3288, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.586558044806518, |
|
"grad_norm": 1.7869365215301514, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2988, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 7.637474541751527, |
|
"grad_norm": 2.4369406700134277, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3168, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 7.688391038696538, |
|
"grad_norm": 2.287158727645874, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3253, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 7.739307535641548, |
|
"grad_norm": 2.4751217365264893, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3143, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 7.790224032586558, |
|
"grad_norm": 2.351529836654663, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3135, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 7.841140529531568, |
|
"grad_norm": 1.7692896127700806, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3226, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 7.892057026476579, |
|
"grad_norm": 1.483668327331543, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3172, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 7.942973523421589, |
|
"grad_norm": 2.4964892864227295, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3395, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.993890020366599, |
|
"grad_norm": 1.4458813667297363, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3376, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 8.044806517311608, |
|
"grad_norm": 1.859740972518921, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2504, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 8.095723014256619, |
|
"grad_norm": 1.3244179487228394, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2643, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 8.146639511201629, |
|
"grad_norm": 1.5618747472763062, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2579, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 8.19755600814664, |
|
"grad_norm": 1.806612491607666, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2722, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 8.24847250509165, |
|
"grad_norm": 3.7666194438934326, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2762, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 8.29938900203666, |
|
"grad_norm": 2.5826241970062256, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2749, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 8.35030549898167, |
|
"grad_norm": 1.5352802276611328, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2827, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 8.40122199592668, |
|
"grad_norm": 1.3692405223846436, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2844, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 8.45213849287169, |
|
"grad_norm": 2.2680575847625732, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2921, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 8.5030549898167, |
|
"grad_norm": 1.5913770198822021, |
|
"learning_rate": 0.0002, |
|
"loss": 0.294, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 8.55397148676171, |
|
"grad_norm": 2.9854307174682617, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2971, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 8.604887983706721, |
|
"grad_norm": 2.6485278606414795, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3029, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 8.655804480651732, |
|
"grad_norm": 1.9538838863372803, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2878, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 8.706720977596742, |
|
"grad_norm": 1.6330325603485107, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3144, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 8.757637474541752, |
|
"grad_norm": 1.4803214073181152, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2976, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 8.808553971486761, |
|
"grad_norm": 3.5660393238067627, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3142, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 8.859470468431772, |
|
"grad_norm": 2.3036141395568848, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3012, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 8.910386965376782, |
|
"grad_norm": 2.528514862060547, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3145, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 8.961303462321792, |
|
"grad_norm": 1.4351972341537476, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3136, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 9.012219959266803, |
|
"grad_norm": 1.339784860610962, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2982, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 9.063136456211813, |
|
"grad_norm": 1.6082321405410767, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2447, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 9.114052953156822, |
|
"grad_norm": 2.039848566055298, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2687, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 9.164969450101832, |
|
"grad_norm": 1.8532267808914185, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2656, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 9.215885947046843, |
|
"grad_norm": 2.1240642070770264, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2685, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 9.266802443991853, |
|
"grad_norm": 1.2580517530441284, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2563, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 9.317718940936864, |
|
"grad_norm": 1.3006818294525146, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2726, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 9.368635437881874, |
|
"grad_norm": 1.2992304563522339, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2611, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 9.419551934826885, |
|
"grad_norm": 2.6281898021698, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2844, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 9.470468431771893, |
|
"grad_norm": 1.8424818515777588, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2744, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 9.521384928716904, |
|
"grad_norm": 2.217327356338501, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2943, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 9.572301425661914, |
|
"grad_norm": 1.463914394378662, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2651, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 9.623217922606925, |
|
"grad_norm": 1.7850229740142822, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2853, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 9.674134419551935, |
|
"grad_norm": 1.7324199676513672, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2755, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 9.725050916496945, |
|
"grad_norm": 1.1688644886016846, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2895, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 9.775967413441956, |
|
"grad_norm": 1.8048006296157837, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2728, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 9.826883910386965, |
|
"grad_norm": 1.9764938354492188, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2993, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 9.877800407331975, |
|
"grad_norm": 2.7224347591400146, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2882, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 9.928716904276985, |
|
"grad_norm": 2.1471643447875977, |
|
"learning_rate": 0.0002, |
|
"loss": 0.292, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 9.979633401221996, |
|
"grad_norm": 2.122901678085327, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2771, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 10.030549898167006, |
|
"grad_norm": 1.5285823345184326, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2568, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 10.081466395112017, |
|
"grad_norm": 1.8790913820266724, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2382, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 10.132382892057027, |
|
"grad_norm": 1.7328804731369019, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2489, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 10.183299389002036, |
|
"grad_norm": 1.6446770429611206, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2591, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 10.234215885947046, |
|
"grad_norm": 2.407752275466919, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2636, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 10.285132382892057, |
|
"grad_norm": 1.7200428247451782, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2528, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 10.336048879837067, |
|
"grad_norm": 4.040430545806885, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2609, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 10.386965376782078, |
|
"grad_norm": 1.499848484992981, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2699, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 10.437881873727088, |
|
"grad_norm": 1.3807271718978882, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2698, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 10.488798370672098, |
|
"grad_norm": 2.1875016689300537, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2718, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 10.539714867617107, |
|
"grad_norm": 1.8573893308639526, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2658, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 10.590631364562118, |
|
"grad_norm": 1.7177698612213135, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2666, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 10.641547861507128, |
|
"grad_norm": 2.5458731651306152, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2699, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 10.692464358452138, |
|
"grad_norm": 1.9686267375946045, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2765, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 10.743380855397149, |
|
"grad_norm": 2.519333600997925, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2746, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 10.79429735234216, |
|
"grad_norm": 1.456548810005188, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2746, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 10.84521384928717, |
|
"grad_norm": 2.32214617729187, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2921, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 10.89613034623218, |
|
"grad_norm": 1.5902856588363647, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2849, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 10.947046843177189, |
|
"grad_norm": 2.1129226684570312, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2725, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 10.9979633401222, |
|
"grad_norm": 2.040208578109741, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2917, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 11.04887983706721, |
|
"grad_norm": 2.841794967651367, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2304, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 11.09979633401222, |
|
"grad_norm": 1.1914277076721191, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2458, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 11.15071283095723, |
|
"grad_norm": 1.49674391746521, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2369, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 11.201629327902241, |
|
"grad_norm": 0.9921414256095886, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2586, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 11.25254582484725, |
|
"grad_norm": 2.710859537124634, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2436, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 11.30346232179226, |
|
"grad_norm": 1.3944896459579468, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2654, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 11.35437881873727, |
|
"grad_norm": 1.8961577415466309, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2411, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 11.405295315682281, |
|
"grad_norm": 3.050171136856079, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2615, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 11.456211812627291, |
|
"grad_norm": 2.27951979637146, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2534, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 11.507128309572302, |
|
"grad_norm": 1.4935775995254517, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2687, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 11.558044806517312, |
|
"grad_norm": 2.3738980293273926, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2485, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 11.608961303462323, |
|
"grad_norm": 2.942415237426758, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2735, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 11.659877800407331, |
|
"grad_norm": 2.6978747844696045, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2488, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 11.710794297352342, |
|
"grad_norm": 1.4899920225143433, |
|
"learning_rate": 0.0002, |
|
"loss": 0.267, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 11.761710794297352, |
|
"grad_norm": 2.248117208480835, |
|
"learning_rate": 0.0002, |
|
"loss": 0.253, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 11.812627291242363, |
|
"grad_norm": 1.3980762958526611, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2742, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 11.863543788187373, |
|
"grad_norm": 3.564595937728882, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2679, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 11.914460285132384, |
|
"grad_norm": 1.835825800895691, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2799, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 11.965376782077392, |
|
"grad_norm": 1.2737057209014893, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2761, |
|
"step": 5875 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 5892, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 12, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.5000869805056e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|