{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999498746867168, "eval_steps": 500, "global_step": 2244, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013366750208855471, "grad_norm": 2.4323846059062397, "learning_rate": 5e-06, "loss": 1.0521, "step": 10 }, { "epoch": 0.026733500417710943, "grad_norm": 1.2707159898783558, "learning_rate": 5e-06, "loss": 0.9449, "step": 20 }, { "epoch": 0.040100250626566414, "grad_norm": 0.6645760066182232, "learning_rate": 5e-06, "loss": 0.9205, "step": 30 }, { "epoch": 0.053467000835421885, "grad_norm": 0.6860381528425127, "learning_rate": 5e-06, "loss": 0.9062, "step": 40 }, { "epoch": 0.06683375104427736, "grad_norm": 0.8462056832267063, "learning_rate": 5e-06, "loss": 0.8941, "step": 50 }, { "epoch": 0.08020050125313283, "grad_norm": 0.5498617128094427, "learning_rate": 5e-06, "loss": 0.8866, "step": 60 }, { "epoch": 0.0935672514619883, "grad_norm": 0.6217303867910247, "learning_rate": 5e-06, "loss": 0.8719, "step": 70 }, { "epoch": 0.10693400167084377, "grad_norm": 0.687429978149511, "learning_rate": 5e-06, "loss": 0.876, "step": 80 }, { "epoch": 0.12030075187969924, "grad_norm": 0.7639829931940186, "learning_rate": 5e-06, "loss": 0.8704, "step": 90 }, { "epoch": 0.1336675020885547, "grad_norm": 0.5349974897408032, "learning_rate": 5e-06, "loss": 0.8677, "step": 100 }, { "epoch": 0.14703425229741018, "grad_norm": 0.6212381364086903, "learning_rate": 5e-06, "loss": 0.8624, "step": 110 }, { "epoch": 0.16040100250626566, "grad_norm": 0.5610901155787884, "learning_rate": 5e-06, "loss": 0.8621, "step": 120 }, { "epoch": 0.17376775271512113, "grad_norm": 0.6155926013296407, "learning_rate": 5e-06, "loss": 0.8582, "step": 130 }, { "epoch": 0.1871345029239766, "grad_norm": 0.6528571036607788, "learning_rate": 5e-06, "loss": 0.8532, "step": 140 }, { "epoch": 0.20050125313283207, "grad_norm": 0.5372075443842537, "learning_rate": 5e-06, "loss": 0.8492, "step": 150 }, { "epoch": 0.21386800334168754, "grad_norm": 0.7095829143035569, "learning_rate": 5e-06, "loss": 0.8494, "step": 160 }, { "epoch": 0.227234753550543, "grad_norm": 0.7745444177509586, "learning_rate": 5e-06, "loss": 0.8476, "step": 170 }, { "epoch": 0.24060150375939848, "grad_norm": 0.7586050901974903, "learning_rate": 5e-06, "loss": 0.8494, "step": 180 }, { "epoch": 0.25396825396825395, "grad_norm": 0.5964597569119979, "learning_rate": 5e-06, "loss": 0.8498, "step": 190 }, { "epoch": 0.2673350041771094, "grad_norm": 0.6293549963407589, "learning_rate": 5e-06, "loss": 0.8432, "step": 200 }, { "epoch": 0.2807017543859649, "grad_norm": 0.5524407679849426, "learning_rate": 5e-06, "loss": 0.8475, "step": 210 }, { "epoch": 0.29406850459482037, "grad_norm": 0.524350214049005, "learning_rate": 5e-06, "loss": 0.8431, "step": 220 }, { "epoch": 0.30743525480367584, "grad_norm": 0.6760002252683699, "learning_rate": 5e-06, "loss": 0.8386, "step": 230 }, { "epoch": 0.3208020050125313, "grad_norm": 0.5906902446596286, "learning_rate": 5e-06, "loss": 0.8349, "step": 240 }, { "epoch": 0.3341687552213868, "grad_norm": 0.5723926384792003, "learning_rate": 5e-06, "loss": 0.8361, "step": 250 }, { "epoch": 0.34753550543024225, "grad_norm": 0.5616096712561062, "learning_rate": 5e-06, "loss": 0.8368, "step": 260 }, { "epoch": 0.3609022556390977, "grad_norm": 0.5507735559959206, "learning_rate": 5e-06, "loss": 0.835, "step": 270 }, { "epoch": 0.3742690058479532, "grad_norm": 0.4803949597709757, "learning_rate": 5e-06, "loss": 0.8414, "step": 280 }, { "epoch": 0.38763575605680867, "grad_norm": 0.5121852118343002, "learning_rate": 5e-06, "loss": 0.8325, "step": 290 }, { "epoch": 0.40100250626566414, "grad_norm": 0.5559477754717894, "learning_rate": 5e-06, "loss": 0.8364, "step": 300 }, { "epoch": 0.4143692564745196, "grad_norm": 0.7469026400245374, "learning_rate": 5e-06, "loss": 0.8306, "step": 310 }, { "epoch": 0.4277360066833751, "grad_norm": 0.5090947427034287, "learning_rate": 5e-06, "loss": 0.8339, "step": 320 }, { "epoch": 0.44110275689223055, "grad_norm": 0.6018861983279394, "learning_rate": 5e-06, "loss": 0.8283, "step": 330 }, { "epoch": 0.454469507101086, "grad_norm": 0.5434521657719814, "learning_rate": 5e-06, "loss": 0.8285, "step": 340 }, { "epoch": 0.4678362573099415, "grad_norm": 0.5903702809830117, "learning_rate": 5e-06, "loss": 0.8324, "step": 350 }, { "epoch": 0.48120300751879697, "grad_norm": 0.6243867601355255, "learning_rate": 5e-06, "loss": 0.8284, "step": 360 }, { "epoch": 0.49456975772765244, "grad_norm": 0.6094144532555286, "learning_rate": 5e-06, "loss": 0.8283, "step": 370 }, { "epoch": 0.5079365079365079, "grad_norm": 0.5482360219270039, "learning_rate": 5e-06, "loss": 0.8289, "step": 380 }, { "epoch": 0.5213032581453634, "grad_norm": 0.5061542985510644, "learning_rate": 5e-06, "loss": 0.8317, "step": 390 }, { "epoch": 0.5346700083542189, "grad_norm": 0.6652440131533577, "learning_rate": 5e-06, "loss": 0.8256, "step": 400 }, { "epoch": 0.5480367585630743, "grad_norm": 0.5613018728699922, "learning_rate": 5e-06, "loss": 0.8252, "step": 410 }, { "epoch": 0.5614035087719298, "grad_norm": 0.7255190718604577, "learning_rate": 5e-06, "loss": 0.8247, "step": 420 }, { "epoch": 0.5747702589807853, "grad_norm": 0.6781380945175464, "learning_rate": 5e-06, "loss": 0.823, "step": 430 }, { "epoch": 0.5881370091896407, "grad_norm": 0.5530197743336887, "learning_rate": 5e-06, "loss": 0.8251, "step": 440 }, { "epoch": 0.6015037593984962, "grad_norm": 0.571851888660113, "learning_rate": 5e-06, "loss": 0.8232, "step": 450 }, { "epoch": 0.6148705096073517, "grad_norm": 0.5208791337420644, "learning_rate": 5e-06, "loss": 0.8235, "step": 460 }, { "epoch": 0.6282372598162071, "grad_norm": 0.5198842932978275, "learning_rate": 5e-06, "loss": 0.8238, "step": 470 }, { "epoch": 0.6416040100250626, "grad_norm": 0.48452315583166233, "learning_rate": 5e-06, "loss": 0.8221, "step": 480 }, { "epoch": 0.6549707602339181, "grad_norm": 0.5219240912238245, "learning_rate": 5e-06, "loss": 0.8168, "step": 490 }, { "epoch": 0.6683375104427736, "grad_norm": 0.51813285089071, "learning_rate": 5e-06, "loss": 0.8173, "step": 500 }, { "epoch": 0.681704260651629, "grad_norm": 0.49897768190410446, "learning_rate": 5e-06, "loss": 0.8193, "step": 510 }, { "epoch": 0.6950710108604845, "grad_norm": 0.546834157816808, "learning_rate": 5e-06, "loss": 0.8129, "step": 520 }, { "epoch": 0.70843776106934, "grad_norm": 0.5295360571693272, "learning_rate": 5e-06, "loss": 0.8194, "step": 530 }, { "epoch": 0.7218045112781954, "grad_norm": 0.6854942956404928, "learning_rate": 5e-06, "loss": 0.8193, "step": 540 }, { "epoch": 0.7351712614870509, "grad_norm": 0.6819748794747951, "learning_rate": 5e-06, "loss": 0.8161, "step": 550 }, { "epoch": 0.7485380116959064, "grad_norm": 0.7134808000164234, "learning_rate": 5e-06, "loss": 0.8166, "step": 560 }, { "epoch": 0.7619047619047619, "grad_norm": 0.6412479917820569, "learning_rate": 5e-06, "loss": 0.8172, "step": 570 }, { "epoch": 0.7752715121136173, "grad_norm": 0.5246142664617556, "learning_rate": 5e-06, "loss": 0.8145, "step": 580 }, { "epoch": 0.7886382623224728, "grad_norm": 0.588843604202556, "learning_rate": 5e-06, "loss": 0.82, "step": 590 }, { "epoch": 0.8020050125313283, "grad_norm": 0.5124861711768851, "learning_rate": 5e-06, "loss": 0.8156, "step": 600 }, { "epoch": 0.8153717627401837, "grad_norm": 0.5015203839251716, "learning_rate": 5e-06, "loss": 0.8191, "step": 610 }, { "epoch": 0.8287385129490392, "grad_norm": 0.6441893371422894, "learning_rate": 5e-06, "loss": 0.812, "step": 620 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5838304398634407, "learning_rate": 5e-06, "loss": 0.8086, "step": 630 }, { "epoch": 0.8554720133667502, "grad_norm": 0.5107304906894905, "learning_rate": 5e-06, "loss": 0.8155, "step": 640 }, { "epoch": 0.8688387635756056, "grad_norm": 0.5122885155184959, "learning_rate": 5e-06, "loss": 0.8131, "step": 650 }, { "epoch": 0.8822055137844611, "grad_norm": 0.5985811394437027, "learning_rate": 5e-06, "loss": 0.8104, "step": 660 }, { "epoch": 0.8955722639933166, "grad_norm": 0.5323936368547137, "learning_rate": 5e-06, "loss": 0.8186, "step": 670 }, { "epoch": 0.908939014202172, "grad_norm": 0.616312309430872, "learning_rate": 5e-06, "loss": 0.8124, "step": 680 }, { "epoch": 0.9223057644110275, "grad_norm": 0.6593022396181776, "learning_rate": 5e-06, "loss": 0.8156, "step": 690 }, { "epoch": 0.935672514619883, "grad_norm": 0.5181097754729659, "learning_rate": 5e-06, "loss": 0.8135, "step": 700 }, { "epoch": 0.9490392648287385, "grad_norm": 0.5160202542043503, "learning_rate": 5e-06, "loss": 0.8108, "step": 710 }, { "epoch": 0.9624060150375939, "grad_norm": 0.5439429222609182, "learning_rate": 5e-06, "loss": 0.8098, "step": 720 }, { "epoch": 0.9757727652464494, "grad_norm": 0.5666778381149935, "learning_rate": 5e-06, "loss": 0.8064, "step": 730 }, { "epoch": 0.9891395154553049, "grad_norm": 0.5087008142559319, "learning_rate": 5e-06, "loss": 0.8124, "step": 740 }, { "epoch": 0.9998329156223893, "eval_loss": 0.8087860345840454, "eval_runtime": 793.9439, "eval_samples_per_second": 25.391, "eval_steps_per_second": 0.397, "step": 748 }, { "epoch": 1.0025062656641603, "grad_norm": 0.6885103061332264, "learning_rate": 5e-06, "loss": 0.8763, "step": 750 }, { "epoch": 1.0158730158730158, "grad_norm": 0.6156521836752095, "learning_rate": 5e-06, "loss": 0.7692, "step": 760 }, { "epoch": 1.0292397660818713, "grad_norm": 0.6134559509903806, "learning_rate": 5e-06, "loss": 0.7719, "step": 770 }, { "epoch": 1.0426065162907268, "grad_norm": 0.635583159755333, "learning_rate": 5e-06, "loss": 0.7724, "step": 780 }, { "epoch": 1.0559732664995822, "grad_norm": 0.5771840092558814, "learning_rate": 5e-06, "loss": 0.7724, "step": 790 }, { "epoch": 1.0693400167084377, "grad_norm": 0.5138399093282234, "learning_rate": 5e-06, "loss": 0.7671, "step": 800 }, { "epoch": 1.0827067669172932, "grad_norm": 0.5865180500219783, "learning_rate": 5e-06, "loss": 0.7741, "step": 810 }, { "epoch": 1.0960735171261486, "grad_norm": 0.5737059877569465, "learning_rate": 5e-06, "loss": 0.7735, "step": 820 }, { "epoch": 1.1094402673350041, "grad_norm": 0.7198057887439943, "learning_rate": 5e-06, "loss": 0.7715, "step": 830 }, { "epoch": 1.1228070175438596, "grad_norm": 0.723247678442899, "learning_rate": 5e-06, "loss": 0.7688, "step": 840 }, { "epoch": 1.136173767752715, "grad_norm": 0.5724777994659187, "learning_rate": 5e-06, "loss": 0.7709, "step": 850 }, { "epoch": 1.1495405179615705, "grad_norm": 0.6343455699124487, "learning_rate": 5e-06, "loss": 0.7756, "step": 860 }, { "epoch": 1.162907268170426, "grad_norm": 0.5975092244071976, "learning_rate": 5e-06, "loss": 0.7762, "step": 870 }, { "epoch": 1.1762740183792815, "grad_norm": 0.5550810138685736, "learning_rate": 5e-06, "loss": 0.7713, "step": 880 }, { "epoch": 1.189640768588137, "grad_norm": 0.6031833100946619, "learning_rate": 5e-06, "loss": 0.7717, "step": 890 }, { "epoch": 1.2030075187969924, "grad_norm": 0.5674692784021945, "learning_rate": 5e-06, "loss": 0.7714, "step": 900 }, { "epoch": 1.2163742690058479, "grad_norm": 0.6831373781930358, "learning_rate": 5e-06, "loss": 0.7727, "step": 910 }, { "epoch": 1.2297410192147034, "grad_norm": 0.517398562451772, "learning_rate": 5e-06, "loss": 0.7715, "step": 920 }, { "epoch": 1.2431077694235588, "grad_norm": 0.5689793551691444, "learning_rate": 5e-06, "loss": 0.7682, "step": 930 }, { "epoch": 1.2564745196324143, "grad_norm": 0.6979997189308218, "learning_rate": 5e-06, "loss": 0.7753, "step": 940 }, { "epoch": 1.2698412698412698, "grad_norm": 0.5431703707142987, "learning_rate": 5e-06, "loss": 0.7726, "step": 950 }, { "epoch": 1.2832080200501252, "grad_norm": 0.5341233588300426, "learning_rate": 5e-06, "loss": 0.7721, "step": 960 }, { "epoch": 1.2965747702589807, "grad_norm": 0.5621957425809071, "learning_rate": 5e-06, "loss": 0.7702, "step": 970 }, { "epoch": 1.3099415204678362, "grad_norm": 0.6187116295591158, "learning_rate": 5e-06, "loss": 0.7755, "step": 980 }, { "epoch": 1.3233082706766917, "grad_norm": 0.6251656247161459, "learning_rate": 5e-06, "loss": 0.7742, "step": 990 }, { "epoch": 1.3366750208855471, "grad_norm": 0.6092934361550684, "learning_rate": 5e-06, "loss": 0.7732, "step": 1000 }, { "epoch": 1.3500417710944026, "grad_norm": 0.8086073910477094, "learning_rate": 5e-06, "loss": 0.7663, "step": 1010 }, { "epoch": 1.363408521303258, "grad_norm": 0.6337909009600926, "learning_rate": 5e-06, "loss": 0.7698, "step": 1020 }, { "epoch": 1.3767752715121135, "grad_norm": 0.6156017975821142, "learning_rate": 5e-06, "loss": 0.7687, "step": 1030 }, { "epoch": 1.390142021720969, "grad_norm": 0.4791494199069362, "learning_rate": 5e-06, "loss": 0.7707, "step": 1040 }, { "epoch": 1.4035087719298245, "grad_norm": 0.5102907384647386, "learning_rate": 5e-06, "loss": 0.7698, "step": 1050 }, { "epoch": 1.41687552213868, "grad_norm": 0.60763231448239, "learning_rate": 5e-06, "loss": 0.7722, "step": 1060 }, { "epoch": 1.4302422723475354, "grad_norm": 0.5538961425736992, "learning_rate": 5e-06, "loss": 0.7769, "step": 1070 }, { "epoch": 1.443609022556391, "grad_norm": 0.511489662319519, "learning_rate": 5e-06, "loss": 0.7709, "step": 1080 }, { "epoch": 1.4569757727652464, "grad_norm": 0.5006381424370965, "learning_rate": 5e-06, "loss": 0.7652, "step": 1090 }, { "epoch": 1.4703425229741018, "grad_norm": 0.6446877306415851, "learning_rate": 5e-06, "loss": 0.7668, "step": 1100 }, { "epoch": 1.4837092731829573, "grad_norm": 0.6472792025046472, "learning_rate": 5e-06, "loss": 0.7748, "step": 1110 }, { "epoch": 1.4970760233918128, "grad_norm": 0.5297094594069526, "learning_rate": 5e-06, "loss": 0.7716, "step": 1120 }, { "epoch": 1.5104427736006683, "grad_norm": 0.5172754876638852, "learning_rate": 5e-06, "loss": 0.7693, "step": 1130 }, { "epoch": 1.5238095238095237, "grad_norm": 0.5499645842959932, "learning_rate": 5e-06, "loss": 0.7663, "step": 1140 }, { "epoch": 1.5371762740183792, "grad_norm": 0.5115786493746641, "learning_rate": 5e-06, "loss": 0.7707, "step": 1150 }, { "epoch": 1.5505430242272347, "grad_norm": 0.5733666230248589, "learning_rate": 5e-06, "loss": 0.7708, "step": 1160 }, { "epoch": 1.5639097744360901, "grad_norm": 0.4914243878129098, "learning_rate": 5e-06, "loss": 0.769, "step": 1170 }, { "epoch": 1.5772765246449456, "grad_norm": 0.5986514689445189, "learning_rate": 5e-06, "loss": 0.7722, "step": 1180 }, { "epoch": 1.590643274853801, "grad_norm": 0.49301214049058534, "learning_rate": 5e-06, "loss": 0.7709, "step": 1190 }, { "epoch": 1.6040100250626566, "grad_norm": 0.49122462674305145, "learning_rate": 5e-06, "loss": 0.7684, "step": 1200 }, { "epoch": 1.617376775271512, "grad_norm": 0.5231320343494373, "learning_rate": 5e-06, "loss": 0.773, "step": 1210 }, { "epoch": 1.6307435254803675, "grad_norm": 0.5974519524827527, "learning_rate": 5e-06, "loss": 0.7703, "step": 1220 }, { "epoch": 1.644110275689223, "grad_norm": 0.49755848059450075, "learning_rate": 5e-06, "loss": 0.7684, "step": 1230 }, { "epoch": 1.6574770258980784, "grad_norm": 0.49980350150699104, "learning_rate": 5e-06, "loss": 0.7648, "step": 1240 }, { "epoch": 1.670843776106934, "grad_norm": 0.660197673406872, "learning_rate": 5e-06, "loss": 0.7663, "step": 1250 }, { "epoch": 1.6842105263157894, "grad_norm": 0.501447743813946, "learning_rate": 5e-06, "loss": 0.7687, "step": 1260 }, { "epoch": 1.6975772765246449, "grad_norm": 0.47339053427865196, "learning_rate": 5e-06, "loss": 0.7677, "step": 1270 }, { "epoch": 1.7109440267335003, "grad_norm": 0.4776630843112484, "learning_rate": 5e-06, "loss": 0.7705, "step": 1280 }, { "epoch": 1.7243107769423558, "grad_norm": 0.5805611285838953, "learning_rate": 5e-06, "loss": 0.7664, "step": 1290 }, { "epoch": 1.7376775271512113, "grad_norm": 0.5589747352729452, "learning_rate": 5e-06, "loss": 0.7643, "step": 1300 }, { "epoch": 1.7510442773600667, "grad_norm": 0.5862892637271495, "learning_rate": 5e-06, "loss": 0.767, "step": 1310 }, { "epoch": 1.7644110275689222, "grad_norm": 0.6267084370944045, "learning_rate": 5e-06, "loss": 0.7701, "step": 1320 }, { "epoch": 1.7777777777777777, "grad_norm": 0.5590629149887701, "learning_rate": 5e-06, "loss": 0.7725, "step": 1330 }, { "epoch": 1.7911445279866332, "grad_norm": 0.589200505231269, "learning_rate": 5e-06, "loss": 0.768, "step": 1340 }, { "epoch": 1.8045112781954886, "grad_norm": 0.4948446583957624, "learning_rate": 5e-06, "loss": 0.7685, "step": 1350 }, { "epoch": 1.817878028404344, "grad_norm": 0.471229575382462, "learning_rate": 5e-06, "loss": 0.7685, "step": 1360 }, { "epoch": 1.8312447786131996, "grad_norm": 0.5347363048336566, "learning_rate": 5e-06, "loss": 0.7668, "step": 1370 }, { "epoch": 1.844611528822055, "grad_norm": 0.6085798758140744, "learning_rate": 5e-06, "loss": 0.7685, "step": 1380 }, { "epoch": 1.8579782790309105, "grad_norm": 0.49237779847072155, "learning_rate": 5e-06, "loss": 0.766, "step": 1390 }, { "epoch": 1.871345029239766, "grad_norm": 0.5429938063483495, "learning_rate": 5e-06, "loss": 0.7675, "step": 1400 }, { "epoch": 1.8847117794486214, "grad_norm": 0.5315522378087794, "learning_rate": 5e-06, "loss": 0.7651, "step": 1410 }, { "epoch": 1.898078529657477, "grad_norm": 0.5774851920268103, "learning_rate": 5e-06, "loss": 0.7683, "step": 1420 }, { "epoch": 1.9114452798663324, "grad_norm": 0.4774206459938876, "learning_rate": 5e-06, "loss": 0.7651, "step": 1430 }, { "epoch": 1.9248120300751879, "grad_norm": 0.48893280928600313, "learning_rate": 5e-06, "loss": 0.7664, "step": 1440 }, { "epoch": 1.9381787802840433, "grad_norm": 0.47709822943051283, "learning_rate": 5e-06, "loss": 0.7667, "step": 1450 }, { "epoch": 1.9515455304928988, "grad_norm": 0.5221458173728611, "learning_rate": 5e-06, "loss": 0.7649, "step": 1460 }, { "epoch": 1.9649122807017543, "grad_norm": 0.5458985479332612, "learning_rate": 5e-06, "loss": 0.7653, "step": 1470 }, { "epoch": 1.9782790309106097, "grad_norm": 0.5449151757658263, "learning_rate": 5e-06, "loss": 0.7665, "step": 1480 }, { "epoch": 1.9916457811194652, "grad_norm": 0.5792068417255367, "learning_rate": 5e-06, "loss": 0.7674, "step": 1490 }, { "epoch": 1.9996658312447786, "eval_loss": 0.7951143383979797, "eval_runtime": 795.386, "eval_samples_per_second": 25.345, "eval_steps_per_second": 0.396, "step": 1496 }, { "epoch": 2.0050125313283207, "grad_norm": 0.7521880602206925, "learning_rate": 5e-06, "loss": 0.8233, "step": 1500 }, { "epoch": 2.018379281537176, "grad_norm": 0.6560054074439666, "learning_rate": 5e-06, "loss": 0.7256, "step": 1510 }, { "epoch": 2.0317460317460316, "grad_norm": 0.5201512747130638, "learning_rate": 5e-06, "loss": 0.7218, "step": 1520 }, { "epoch": 2.045112781954887, "grad_norm": 0.5262590120532872, "learning_rate": 5e-06, "loss": 0.7285, "step": 1530 }, { "epoch": 2.0584795321637426, "grad_norm": 0.5393650388873087, "learning_rate": 5e-06, "loss": 0.7229, "step": 1540 }, { "epoch": 2.071846282372598, "grad_norm": 0.5105428821348765, "learning_rate": 5e-06, "loss": 0.7231, "step": 1550 }, { "epoch": 2.0852130325814535, "grad_norm": 0.6021970483052078, "learning_rate": 5e-06, "loss": 0.7239, "step": 1560 }, { "epoch": 2.098579782790309, "grad_norm": 0.5009099309313954, "learning_rate": 5e-06, "loss": 0.7226, "step": 1570 }, { "epoch": 2.1119465329991645, "grad_norm": 0.5605434690720502, "learning_rate": 5e-06, "loss": 0.7277, "step": 1580 }, { "epoch": 2.12531328320802, "grad_norm": 0.5732299598938305, "learning_rate": 5e-06, "loss": 0.7286, "step": 1590 }, { "epoch": 2.1386800334168754, "grad_norm": 0.5399334511302041, "learning_rate": 5e-06, "loss": 0.726, "step": 1600 }, { "epoch": 2.152046783625731, "grad_norm": 0.505832452848056, "learning_rate": 5e-06, "loss": 0.7304, "step": 1610 }, { "epoch": 2.1654135338345863, "grad_norm": 0.5674143618926153, "learning_rate": 5e-06, "loss": 0.7232, "step": 1620 }, { "epoch": 2.178780284043442, "grad_norm": 0.5068914103748654, "learning_rate": 5e-06, "loss": 0.7336, "step": 1630 }, { "epoch": 2.1921470342522973, "grad_norm": 0.5118320329600874, "learning_rate": 5e-06, "loss": 0.7255, "step": 1640 }, { "epoch": 2.2055137844611528, "grad_norm": 0.5156250232792499, "learning_rate": 5e-06, "loss": 0.7295, "step": 1650 }, { "epoch": 2.2188805346700082, "grad_norm": 0.6165225897496419, "learning_rate": 5e-06, "loss": 0.7274, "step": 1660 }, { "epoch": 2.2322472848788637, "grad_norm": 0.5863877720536036, "learning_rate": 5e-06, "loss": 0.7256, "step": 1670 }, { "epoch": 2.245614035087719, "grad_norm": 0.5641007704480012, "learning_rate": 5e-06, "loss": 0.7308, "step": 1680 }, { "epoch": 2.2589807852965746, "grad_norm": 0.6101312501534099, "learning_rate": 5e-06, "loss": 0.7314, "step": 1690 }, { "epoch": 2.27234753550543, "grad_norm": 0.5200998469176243, "learning_rate": 5e-06, "loss": 0.7275, "step": 1700 }, { "epoch": 2.2857142857142856, "grad_norm": 0.5398343134194046, "learning_rate": 5e-06, "loss": 0.727, "step": 1710 }, { "epoch": 2.299081035923141, "grad_norm": 0.5247712631574941, "learning_rate": 5e-06, "loss": 0.727, "step": 1720 }, { "epoch": 2.3124477861319965, "grad_norm": 0.5655985095958795, "learning_rate": 5e-06, "loss": 0.7286, "step": 1730 }, { "epoch": 2.325814536340852, "grad_norm": 0.5927409653328921, "learning_rate": 5e-06, "loss": 0.7271, "step": 1740 }, { "epoch": 2.3391812865497075, "grad_norm": 0.6148593425957483, "learning_rate": 5e-06, "loss": 0.733, "step": 1750 }, { "epoch": 2.352548036758563, "grad_norm": 0.5969831864554942, "learning_rate": 5e-06, "loss": 0.7302, "step": 1760 }, { "epoch": 2.3659147869674184, "grad_norm": 0.4985456007136878, "learning_rate": 5e-06, "loss": 0.7341, "step": 1770 }, { "epoch": 2.379281537176274, "grad_norm": 0.5005254522981937, "learning_rate": 5e-06, "loss": 0.7244, "step": 1780 }, { "epoch": 2.3926482873851294, "grad_norm": 0.5288709360617612, "learning_rate": 5e-06, "loss": 0.7312, "step": 1790 }, { "epoch": 2.406015037593985, "grad_norm": 0.5355584900475018, "learning_rate": 5e-06, "loss": 0.727, "step": 1800 }, { "epoch": 2.4193817878028403, "grad_norm": 0.5666733459714918, "learning_rate": 5e-06, "loss": 0.731, "step": 1810 }, { "epoch": 2.4327485380116958, "grad_norm": 0.5939862506331437, "learning_rate": 5e-06, "loss": 0.7292, "step": 1820 }, { "epoch": 2.4461152882205512, "grad_norm": 0.5696153125681646, "learning_rate": 5e-06, "loss": 0.7295, "step": 1830 }, { "epoch": 2.4594820384294067, "grad_norm": 0.5263801998302109, "learning_rate": 5e-06, "loss": 0.7289, "step": 1840 }, { "epoch": 2.472848788638262, "grad_norm": 0.5564137280433736, "learning_rate": 5e-06, "loss": 0.7289, "step": 1850 }, { "epoch": 2.4862155388471177, "grad_norm": 0.6117589560276474, "learning_rate": 5e-06, "loss": 0.7281, "step": 1860 }, { "epoch": 2.499582289055973, "grad_norm": 0.5556838242891475, "learning_rate": 5e-06, "loss": 0.7296, "step": 1870 }, { "epoch": 2.5129490392648286, "grad_norm": 0.4681598446789898, "learning_rate": 5e-06, "loss": 0.7296, "step": 1880 }, { "epoch": 2.526315789473684, "grad_norm": 0.5231611697501862, "learning_rate": 5e-06, "loss": 0.7303, "step": 1890 }, { "epoch": 2.5396825396825395, "grad_norm": 0.5126109088017671, "learning_rate": 5e-06, "loss": 0.7324, "step": 1900 }, { "epoch": 2.553049289891395, "grad_norm": 0.5300428577804921, "learning_rate": 5e-06, "loss": 0.7273, "step": 1910 }, { "epoch": 2.5664160401002505, "grad_norm": 0.4968055663040118, "learning_rate": 5e-06, "loss": 0.729, "step": 1920 }, { "epoch": 2.579782790309106, "grad_norm": 0.568494743059541, "learning_rate": 5e-06, "loss": 0.7269, "step": 1930 }, { "epoch": 2.5931495405179614, "grad_norm": 0.5482221484283202, "learning_rate": 5e-06, "loss": 0.7285, "step": 1940 }, { "epoch": 2.606516290726817, "grad_norm": 0.47129332867964935, "learning_rate": 5e-06, "loss": 0.7292, "step": 1950 }, { "epoch": 2.6198830409356724, "grad_norm": 0.5198836974979396, "learning_rate": 5e-06, "loss": 0.7264, "step": 1960 }, { "epoch": 2.633249791144528, "grad_norm": 0.4945939304862693, "learning_rate": 5e-06, "loss": 0.7279, "step": 1970 }, { "epoch": 2.6466165413533833, "grad_norm": 0.5751403403674279, "learning_rate": 5e-06, "loss": 0.7282, "step": 1980 }, { "epoch": 2.659983291562239, "grad_norm": 0.5611452949151137, "learning_rate": 5e-06, "loss": 0.7331, "step": 1990 }, { "epoch": 2.6733500417710943, "grad_norm": 0.6119128996618558, "learning_rate": 5e-06, "loss": 0.7296, "step": 2000 }, { "epoch": 2.6867167919799497, "grad_norm": 0.4799215562608329, "learning_rate": 5e-06, "loss": 0.7298, "step": 2010 }, { "epoch": 2.700083542188805, "grad_norm": 0.5541418078345739, "learning_rate": 5e-06, "loss": 0.7268, "step": 2020 }, { "epoch": 2.7134502923976607, "grad_norm": 0.6870311878219804, "learning_rate": 5e-06, "loss": 0.7277, "step": 2030 }, { "epoch": 2.726817042606516, "grad_norm": 0.5687894755714459, "learning_rate": 5e-06, "loss": 0.7298, "step": 2040 }, { "epoch": 2.7401837928153716, "grad_norm": 0.5330460246090263, "learning_rate": 5e-06, "loss": 0.7325, "step": 2050 }, { "epoch": 2.753550543024227, "grad_norm": 0.5427879116319339, "learning_rate": 5e-06, "loss": 0.7296, "step": 2060 }, { "epoch": 2.7669172932330826, "grad_norm": 0.6013738539276209, "learning_rate": 5e-06, "loss": 0.7281, "step": 2070 }, { "epoch": 2.780284043441938, "grad_norm": 0.6091854363964149, "learning_rate": 5e-06, "loss": 0.7294, "step": 2080 }, { "epoch": 2.7936507936507935, "grad_norm": 0.5190279913663577, "learning_rate": 5e-06, "loss": 0.7248, "step": 2090 }, { "epoch": 2.807017543859649, "grad_norm": 0.5126718278939274, "learning_rate": 5e-06, "loss": 0.7311, "step": 2100 }, { "epoch": 2.8203842940685044, "grad_norm": 0.5571607138857257, "learning_rate": 5e-06, "loss": 0.7318, "step": 2110 }, { "epoch": 2.83375104427736, "grad_norm": 0.5341175882686895, "learning_rate": 5e-06, "loss": 0.7336, "step": 2120 }, { "epoch": 2.8471177944862154, "grad_norm": 0.4817774606348232, "learning_rate": 5e-06, "loss": 0.731, "step": 2130 }, { "epoch": 2.860484544695071, "grad_norm": 0.5487220776810837, "learning_rate": 5e-06, "loss": 0.7282, "step": 2140 }, { "epoch": 2.8738512949039263, "grad_norm": 0.6342699103351254, "learning_rate": 5e-06, "loss": 0.7335, "step": 2150 }, { "epoch": 2.887218045112782, "grad_norm": 0.5078552425291176, "learning_rate": 5e-06, "loss": 0.7273, "step": 2160 }, { "epoch": 2.9005847953216373, "grad_norm": 0.4819316377635323, "learning_rate": 5e-06, "loss": 0.7332, "step": 2170 }, { "epoch": 2.9139515455304927, "grad_norm": 0.4627017239179797, "learning_rate": 5e-06, "loss": 0.7306, "step": 2180 }, { "epoch": 2.927318295739348, "grad_norm": 0.4761325291977869, "learning_rate": 5e-06, "loss": 0.7314, "step": 2190 }, { "epoch": 2.9406850459482037, "grad_norm": 0.5784029020001881, "learning_rate": 5e-06, "loss": 0.7298, "step": 2200 }, { "epoch": 2.954051796157059, "grad_norm": 0.5120822643666457, "learning_rate": 5e-06, "loss": 0.731, "step": 2210 }, { "epoch": 2.9674185463659146, "grad_norm": 0.5116915736315969, "learning_rate": 5e-06, "loss": 0.7322, "step": 2220 }, { "epoch": 2.98078529657477, "grad_norm": 0.5021133290964584, "learning_rate": 5e-06, "loss": 0.7269, "step": 2230 }, { "epoch": 2.9941520467836256, "grad_norm": 0.5317540745896701, "learning_rate": 5e-06, "loss": 0.7322, "step": 2240 }, { "epoch": 2.999498746867168, "eval_loss": 0.7926730513572693, "eval_runtime": 792.6639, "eval_samples_per_second": 25.432, "eval_steps_per_second": 0.397, "step": 2244 }, { "epoch": 2.999498746867168, "step": 2244, "total_flos": 3758574199111680.0, "train_loss": 0.7796513685780625, "train_runtime": 132137.1731, "train_samples_per_second": 8.696, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 2244, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3758574199111680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }