{ "best_metric": 1.0107625722885132, "best_model_checkpoint": "/data/Andre/Ref-Finder-Mistral/checkpoint-3170", "epoch": 10.0, "eval_steps": 500, "global_step": 3170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.031545741324921134, "grad_norm": 0.5029881000518799, "learning_rate": 5e-05, "loss": 1.7154, "step": 10 }, { "epoch": 0.06309148264984227, "grad_norm": 0.3064497709274292, "learning_rate": 5e-05, "loss": 1.5963, "step": 20 }, { "epoch": 0.0946372239747634, "grad_norm": 0.3118360638618469, "learning_rate": 5e-05, "loss": 1.5074, "step": 30 }, { "epoch": 0.12618296529968454, "grad_norm": 0.33069083094596863, "learning_rate": 5e-05, "loss": 1.4047, "step": 40 }, { "epoch": 0.15772870662460567, "grad_norm": 0.2797032296657562, "learning_rate": 5e-05, "loss": 1.4167, "step": 50 }, { "epoch": 0.1892744479495268, "grad_norm": 0.3190701901912689, "learning_rate": 5e-05, "loss": 1.3361, "step": 60 }, { "epoch": 0.22082018927444794, "grad_norm": 0.3070685863494873, "learning_rate": 5e-05, "loss": 1.2655, "step": 70 }, { "epoch": 0.25236593059936907, "grad_norm": 0.3203960359096527, "learning_rate": 5e-05, "loss": 1.2295, "step": 80 }, { "epoch": 0.28391167192429023, "grad_norm": 0.30132830142974854, "learning_rate": 5e-05, "loss": 1.2277, "step": 90 }, { "epoch": 0.31545741324921134, "grad_norm": 0.3356678783893585, "learning_rate": 5e-05, "loss": 1.1848, "step": 100 }, { "epoch": 0.3470031545741325, "grad_norm": 0.3275781273841858, "learning_rate": 5e-05, "loss": 1.183, "step": 110 }, { "epoch": 0.3785488958990536, "grad_norm": 0.30640777945518494, "learning_rate": 5e-05, "loss": 1.1488, "step": 120 }, { "epoch": 0.41009463722397477, "grad_norm": 0.5068441033363342, "learning_rate": 5e-05, "loss": 1.147, "step": 130 }, { "epoch": 0.4416403785488959, "grad_norm": 0.310285747051239, "learning_rate": 5e-05, "loss": 1.1908, "step": 140 }, { "epoch": 0.47318611987381703, "grad_norm": 0.38677722215652466, "learning_rate": 5e-05, "loss": 1.1319, "step": 150 }, { "epoch": 0.5047318611987381, "grad_norm": 0.3474641740322113, "learning_rate": 5e-05, "loss": 1.1215, "step": 160 }, { "epoch": 0.5362776025236593, "grad_norm": 0.37211593985557556, "learning_rate": 5e-05, "loss": 1.1503, "step": 170 }, { "epoch": 0.5678233438485805, "grad_norm": 0.3207016885280609, "learning_rate": 5e-05, "loss": 1.182, "step": 180 }, { "epoch": 0.5993690851735016, "grad_norm": 0.34209126234054565, "learning_rate": 5e-05, "loss": 1.1298, "step": 190 }, { "epoch": 0.6309148264984227, "grad_norm": 0.3956719934940338, "learning_rate": 5e-05, "loss": 1.1443, "step": 200 }, { "epoch": 0.6624605678233438, "grad_norm": 0.34193623065948486, "learning_rate": 5e-05, "loss": 1.1148, "step": 210 }, { "epoch": 0.694006309148265, "grad_norm": 0.3550577759742737, "learning_rate": 5e-05, "loss": 1.1091, "step": 220 }, { "epoch": 0.7255520504731862, "grad_norm": 0.34275463223457336, "learning_rate": 5e-05, "loss": 1.1185, "step": 230 }, { "epoch": 0.7570977917981072, "grad_norm": 0.36972326040267944, "learning_rate": 5e-05, "loss": 1.1388, "step": 240 }, { "epoch": 0.7886435331230284, "grad_norm": 0.36260902881622314, "learning_rate": 5e-05, "loss": 1.0982, "step": 250 }, { "epoch": 0.8201892744479495, "grad_norm": 0.35559672117233276, "learning_rate": 5e-05, "loss": 1.1032, "step": 260 }, { "epoch": 0.8517350157728707, "grad_norm": 0.3544253706932068, "learning_rate": 5e-05, "loss": 1.1239, "step": 270 }, { "epoch": 0.8832807570977917, "grad_norm": 0.3803843855857849, "learning_rate": 5e-05, "loss": 1.1006, "step": 280 }, { "epoch": 0.9148264984227129, "grad_norm": 0.3776736855506897, "learning_rate": 5e-05, "loss": 1.1001, "step": 290 }, { "epoch": 0.9463722397476341, "grad_norm": 0.4238007068634033, "learning_rate": 5e-05, "loss": 1.0968, "step": 300 }, { "epoch": 0.9779179810725552, "grad_norm": 0.4062643051147461, "learning_rate": 5e-05, "loss": 1.1188, "step": 310 }, { "epoch": 1.0, "eval_loss": 1.1006102561950684, "eval_runtime": 66.4381, "eval_samples_per_second": 4.786, "eval_steps_per_second": 0.602, "step": 317 }, { "epoch": 1.0094637223974763, "grad_norm": 0.3485482633113861, "learning_rate": 5e-05, "loss": 1.0843, "step": 320 }, { "epoch": 1.0410094637223974, "grad_norm": 0.40558719635009766, "learning_rate": 5e-05, "loss": 1.097, "step": 330 }, { "epoch": 1.0725552050473186, "grad_norm": 0.4074763059616089, "learning_rate": 5e-05, "loss": 1.0697, "step": 340 }, { "epoch": 1.1041009463722398, "grad_norm": 0.40961453318595886, "learning_rate": 5e-05, "loss": 1.0635, "step": 350 }, { "epoch": 1.135646687697161, "grad_norm": 0.3752257227897644, "learning_rate": 5e-05, "loss": 1.0936, "step": 360 }, { "epoch": 1.167192429022082, "grad_norm": 0.3867760896682739, "learning_rate": 5e-05, "loss": 1.0782, "step": 370 }, { "epoch": 1.1987381703470033, "grad_norm": 0.4072268307209015, "learning_rate": 5e-05, "loss": 1.0574, "step": 380 }, { "epoch": 1.2302839116719242, "grad_norm": 0.3942580819129944, "learning_rate": 5e-05, "loss": 1.1081, "step": 390 }, { "epoch": 1.2618296529968454, "grad_norm": 0.4262318015098572, "learning_rate": 5e-05, "loss": 1.0821, "step": 400 }, { "epoch": 1.2933753943217665, "grad_norm": 0.39012083411216736, "learning_rate": 5e-05, "loss": 1.0712, "step": 410 }, { "epoch": 1.3249211356466877, "grad_norm": 0.4160712659358978, "learning_rate": 5e-05, "loss": 1.1106, "step": 420 }, { "epoch": 1.3564668769716088, "grad_norm": 0.3966641128063202, "learning_rate": 5e-05, "loss": 1.0411, "step": 430 }, { "epoch": 1.38801261829653, "grad_norm": 0.3720882833003998, "learning_rate": 5e-05, "loss": 1.0815, "step": 440 }, { "epoch": 1.4195583596214512, "grad_norm": 0.396207332611084, "learning_rate": 5e-05, "loss": 1.0462, "step": 450 }, { "epoch": 1.4511041009463723, "grad_norm": 0.38164132833480835, "learning_rate": 5e-05, "loss": 1.0891, "step": 460 }, { "epoch": 1.4826498422712935, "grad_norm": 0.38896164298057556, "learning_rate": 5e-05, "loss": 1.0905, "step": 470 }, { "epoch": 1.5141955835962144, "grad_norm": 0.4327830374240875, "learning_rate": 5e-05, "loss": 1.1205, "step": 480 }, { "epoch": 1.5457413249211358, "grad_norm": 0.423364520072937, "learning_rate": 5e-05, "loss": 1.0477, "step": 490 }, { "epoch": 1.5772870662460567, "grad_norm": 0.4212876558303833, "learning_rate": 5e-05, "loss": 1.12, "step": 500 }, { "epoch": 1.608832807570978, "grad_norm": 0.3814271092414856, "learning_rate": 5e-05, "loss": 1.0695, "step": 510 }, { "epoch": 1.640378548895899, "grad_norm": 0.3973582983016968, "learning_rate": 5e-05, "loss": 1.0832, "step": 520 }, { "epoch": 1.6719242902208202, "grad_norm": 0.4016555845737457, "learning_rate": 5e-05, "loss": 1.077, "step": 530 }, { "epoch": 1.7034700315457414, "grad_norm": 0.4084228575229645, "learning_rate": 5e-05, "loss": 1.0674, "step": 540 }, { "epoch": 1.7350157728706623, "grad_norm": 0.4218040406703949, "learning_rate": 5e-05, "loss": 1.0464, "step": 550 }, { "epoch": 1.7665615141955837, "grad_norm": 0.3857240080833435, "learning_rate": 5e-05, "loss": 1.0656, "step": 560 }, { "epoch": 1.7981072555205047, "grad_norm": 0.3926863968372345, "learning_rate": 5e-05, "loss": 1.056, "step": 570 }, { "epoch": 1.8296529968454258, "grad_norm": 0.4352160096168518, "learning_rate": 5e-05, "loss": 1.0443, "step": 580 }, { "epoch": 1.861198738170347, "grad_norm": 0.4079754650592804, "learning_rate": 5e-05, "loss": 1.0502, "step": 590 }, { "epoch": 1.8927444794952681, "grad_norm": 0.40210971236228943, "learning_rate": 5e-05, "loss": 1.0613, "step": 600 }, { "epoch": 1.9242902208201893, "grad_norm": 0.3993563950061798, "learning_rate": 5e-05, "loss": 1.0341, "step": 610 }, { "epoch": 1.9558359621451105, "grad_norm": 0.47853732109069824, "learning_rate": 5e-05, "loss": 1.0071, "step": 620 }, { "epoch": 1.9873817034700316, "grad_norm": 0.42926380038261414, "learning_rate": 5e-05, "loss": 1.0383, "step": 630 }, { "epoch": 2.0, "eval_loss": 1.0646495819091797, "eval_runtime": 66.4837, "eval_samples_per_second": 4.783, "eval_steps_per_second": 0.602, "step": 634 }, { "epoch": 2.0189274447949526, "grad_norm": 0.3886430561542511, "learning_rate": 5e-05, "loss": 1.0667, "step": 640 }, { "epoch": 2.050473186119874, "grad_norm": 0.4253116846084595, "learning_rate": 5e-05, "loss": 1.0323, "step": 650 }, { "epoch": 2.082018927444795, "grad_norm": 0.40994375944137573, "learning_rate": 5e-05, "loss": 1.0027, "step": 660 }, { "epoch": 2.1135646687697163, "grad_norm": 0.3847936689853668, "learning_rate": 5e-05, "loss": 1.0022, "step": 670 }, { "epoch": 2.145110410094637, "grad_norm": 0.43215593695640564, "learning_rate": 5e-05, "loss": 1.0564, "step": 680 }, { "epoch": 2.176656151419558, "grad_norm": 0.4463648498058319, "learning_rate": 5e-05, "loss": 1.0277, "step": 690 }, { "epoch": 2.2082018927444795, "grad_norm": 0.42896410822868347, "learning_rate": 5e-05, "loss": 1.0466, "step": 700 }, { "epoch": 2.2397476340694005, "grad_norm": 0.4028797149658203, "learning_rate": 5e-05, "loss": 1.0588, "step": 710 }, { "epoch": 2.271293375394322, "grad_norm": 0.4177733361721039, "learning_rate": 5e-05, "loss": 1.0519, "step": 720 }, { "epoch": 2.302839116719243, "grad_norm": 0.42829203605651855, "learning_rate": 5e-05, "loss": 1.0202, "step": 730 }, { "epoch": 2.334384858044164, "grad_norm": 0.5054190158843994, "learning_rate": 5e-05, "loss": 0.9972, "step": 740 }, { "epoch": 2.365930599369085, "grad_norm": 0.4306070804595947, "learning_rate": 5e-05, "loss": 1.0412, "step": 750 }, { "epoch": 2.3974763406940065, "grad_norm": 0.443590372800827, "learning_rate": 5e-05, "loss": 1.0424, "step": 760 }, { "epoch": 2.4290220820189274, "grad_norm": 0.4287286400794983, "learning_rate": 5e-05, "loss": 1.0331, "step": 770 }, { "epoch": 2.4605678233438484, "grad_norm": 0.39775350689888, "learning_rate": 5e-05, "loss": 1.0454, "step": 780 }, { "epoch": 2.4921135646687698, "grad_norm": 0.4093973636627197, "learning_rate": 5e-05, "loss": 1.0442, "step": 790 }, { "epoch": 2.5236593059936907, "grad_norm": 0.45389777421951294, "learning_rate": 5e-05, "loss": 1.024, "step": 800 }, { "epoch": 2.555205047318612, "grad_norm": 0.428648442029953, "learning_rate": 5e-05, "loss": 1.0407, "step": 810 }, { "epoch": 2.586750788643533, "grad_norm": 0.41237714886665344, "learning_rate": 5e-05, "loss": 1.0159, "step": 820 }, { "epoch": 2.6182965299684544, "grad_norm": 0.42067545652389526, "learning_rate": 5e-05, "loss": 1.0347, "step": 830 }, { "epoch": 2.6498422712933754, "grad_norm": 0.4184909462928772, "learning_rate": 5e-05, "loss": 1.0337, "step": 840 }, { "epoch": 2.6813880126182967, "grad_norm": 0.414995014667511, "learning_rate": 5e-05, "loss": 1.092, "step": 850 }, { "epoch": 2.7129337539432177, "grad_norm": 0.4137355089187622, "learning_rate": 5e-05, "loss": 1.0514, "step": 860 }, { "epoch": 2.7444794952681386, "grad_norm": 0.45818576216697693, "learning_rate": 5e-05, "loss": 1.0225, "step": 870 }, { "epoch": 2.77602523659306, "grad_norm": 0.455785870552063, "learning_rate": 5e-05, "loss": 1.0483, "step": 880 }, { "epoch": 2.807570977917981, "grad_norm": 0.4084894061088562, "learning_rate": 5e-05, "loss": 0.9846, "step": 890 }, { "epoch": 2.8391167192429023, "grad_norm": 0.4103436768054962, "learning_rate": 5e-05, "loss": 1.0217, "step": 900 }, { "epoch": 2.8706624605678233, "grad_norm": 0.40420758724212646, "learning_rate": 5e-05, "loss": 1.0399, "step": 910 }, { "epoch": 2.9022082018927446, "grad_norm": 0.5487234592437744, "learning_rate": 5e-05, "loss": 1.0394, "step": 920 }, { "epoch": 2.9337539432176656, "grad_norm": 0.47695016860961914, "learning_rate": 5e-05, "loss": 1.0445, "step": 930 }, { "epoch": 2.965299684542587, "grad_norm": 0.41771531105041504, "learning_rate": 5e-05, "loss": 1.0377, "step": 940 }, { "epoch": 2.996845425867508, "grad_norm": 0.5724055767059326, "learning_rate": 5e-05, "loss": 1.0193, "step": 950 }, { "epoch": 3.0, "eval_loss": 1.0459696054458618, "eval_runtime": 66.4844, "eval_samples_per_second": 4.783, "eval_steps_per_second": 0.602, "step": 951 }, { "epoch": 3.028391167192429, "grad_norm": 0.481629878282547, "learning_rate": 5e-05, "loss": 0.9837, "step": 960 }, { "epoch": 3.0599369085173502, "grad_norm": 0.42061686515808105, "learning_rate": 5e-05, "loss": 1.0083, "step": 970 }, { "epoch": 3.091482649842271, "grad_norm": 0.4234108030796051, "learning_rate": 5e-05, "loss": 1.0249, "step": 980 }, { "epoch": 3.1230283911671926, "grad_norm": 0.43123263120651245, "learning_rate": 5e-05, "loss": 1.0319, "step": 990 }, { "epoch": 3.1545741324921135, "grad_norm": 0.4268761873245239, "learning_rate": 5e-05, "loss": 1.0067, "step": 1000 }, { "epoch": 3.186119873817035, "grad_norm": 0.41744470596313477, "learning_rate": 5e-05, "loss": 1.0316, "step": 1010 }, { "epoch": 3.217665615141956, "grad_norm": 0.46088990569114685, "learning_rate": 5e-05, "loss": 0.9993, "step": 1020 }, { "epoch": 3.249211356466877, "grad_norm": 0.43155333399772644, "learning_rate": 5e-05, "loss": 1.0275, "step": 1030 }, { "epoch": 3.280757097791798, "grad_norm": 0.4405035972595215, "learning_rate": 5e-05, "loss": 1.0014, "step": 1040 }, { "epoch": 3.312302839116719, "grad_norm": 0.466680645942688, "learning_rate": 5e-05, "loss": 1.0066, "step": 1050 }, { "epoch": 3.3438485804416405, "grad_norm": 0.4462493360042572, "learning_rate": 5e-05, "loss": 1.0081, "step": 1060 }, { "epoch": 3.3753943217665614, "grad_norm": 0.4766935706138611, "learning_rate": 5e-05, "loss": 0.9957, "step": 1070 }, { "epoch": 3.406940063091483, "grad_norm": 0.4287005364894867, "learning_rate": 5e-05, "loss": 1.0022, "step": 1080 }, { "epoch": 3.4384858044164037, "grad_norm": 0.43795284628868103, "learning_rate": 5e-05, "loss": 1.0248, "step": 1090 }, { "epoch": 3.470031545741325, "grad_norm": 0.4681282937526703, "learning_rate": 5e-05, "loss": 1.0241, "step": 1100 }, { "epoch": 3.501577287066246, "grad_norm": 0.44735008478164673, "learning_rate": 5e-05, "loss": 1.0209, "step": 1110 }, { "epoch": 3.5331230283911674, "grad_norm": 0.4473140835762024, "learning_rate": 5e-05, "loss": 0.9824, "step": 1120 }, { "epoch": 3.5646687697160884, "grad_norm": 0.44602036476135254, "learning_rate": 5e-05, "loss": 1.0095, "step": 1130 }, { "epoch": 3.5962145110410093, "grad_norm": 0.455937922000885, "learning_rate": 5e-05, "loss": 1.0045, "step": 1140 }, { "epoch": 3.6277602523659307, "grad_norm": 0.416535347700119, "learning_rate": 5e-05, "loss": 1.0293, "step": 1150 }, { "epoch": 3.6593059936908516, "grad_norm": 0.454054057598114, "learning_rate": 5e-05, "loss": 0.9761, "step": 1160 }, { "epoch": 3.690851735015773, "grad_norm": 0.4191015958786011, "learning_rate": 5e-05, "loss": 1.0275, "step": 1170 }, { "epoch": 3.722397476340694, "grad_norm": 0.45472997426986694, "learning_rate": 5e-05, "loss": 0.975, "step": 1180 }, { "epoch": 3.753943217665615, "grad_norm": 0.429548442363739, "learning_rate": 5e-05, "loss": 0.9638, "step": 1190 }, { "epoch": 3.7854889589905363, "grad_norm": 0.4479614198207855, "learning_rate": 5e-05, "loss": 1.0034, "step": 1200 }, { "epoch": 3.8170347003154577, "grad_norm": 0.41878965497016907, "learning_rate": 5e-05, "loss": 1.0102, "step": 1210 }, { "epoch": 3.8485804416403786, "grad_norm": 0.42527589201927185, "learning_rate": 5e-05, "loss": 0.9746, "step": 1220 }, { "epoch": 3.8801261829652995, "grad_norm": 0.4646793007850647, "learning_rate": 5e-05, "loss": 1.0139, "step": 1230 }, { "epoch": 3.911671924290221, "grad_norm": 0.41096052527427673, "learning_rate": 5e-05, "loss": 1.0247, "step": 1240 }, { "epoch": 3.943217665615142, "grad_norm": 0.4595187306404114, "learning_rate": 5e-05, "loss": 1.0149, "step": 1250 }, { "epoch": 3.9747634069400632, "grad_norm": 0.4228056073188782, "learning_rate": 5e-05, "loss": 1.0199, "step": 1260 }, { "epoch": 4.0, "eval_loss": 1.032894492149353, "eval_runtime": 66.5042, "eval_samples_per_second": 4.782, "eval_steps_per_second": 0.601, "step": 1268 }, { "epoch": 4.006309148264984, "grad_norm": 0.4469398558139801, "learning_rate": 5e-05, "loss": 0.9636, "step": 1270 }, { "epoch": 4.037854889589905, "grad_norm": 0.4484340250492096, "learning_rate": 5e-05, "loss": 0.9827, "step": 1280 }, { "epoch": 4.069400630914826, "grad_norm": 0.4563854932785034, "learning_rate": 5e-05, "loss": 0.9877, "step": 1290 }, { "epoch": 4.100946372239748, "grad_norm": 0.44243761897087097, "learning_rate": 5e-05, "loss": 0.9872, "step": 1300 }, { "epoch": 4.132492113564669, "grad_norm": 0.448011189699173, "learning_rate": 5e-05, "loss": 1.0118, "step": 1310 }, { "epoch": 4.16403785488959, "grad_norm": 0.4259743094444275, "learning_rate": 5e-05, "loss": 1.0109, "step": 1320 }, { "epoch": 4.195583596214511, "grad_norm": 0.456064909696579, "learning_rate": 5e-05, "loss": 0.9552, "step": 1330 }, { "epoch": 4.2271293375394325, "grad_norm": 0.49178850650787354, "learning_rate": 5e-05, "loss": 0.9976, "step": 1340 }, { "epoch": 4.2586750788643535, "grad_norm": 0.4512215852737427, "learning_rate": 5e-05, "loss": 0.9889, "step": 1350 }, { "epoch": 4.290220820189274, "grad_norm": 0.4504569172859192, "learning_rate": 5e-05, "loss": 0.9675, "step": 1360 }, { "epoch": 4.321766561514195, "grad_norm": 0.4347565472126007, "learning_rate": 5e-05, "loss": 0.9904, "step": 1370 }, { "epoch": 4.353312302839116, "grad_norm": 0.4649258852005005, "learning_rate": 5e-05, "loss": 0.9832, "step": 1380 }, { "epoch": 4.384858044164038, "grad_norm": 0.4316873252391815, "learning_rate": 5e-05, "loss": 0.9952, "step": 1390 }, { "epoch": 4.416403785488959, "grad_norm": 0.4411141872406006, "learning_rate": 5e-05, "loss": 0.9743, "step": 1400 }, { "epoch": 4.44794952681388, "grad_norm": 0.46868711709976196, "learning_rate": 5e-05, "loss": 0.9737, "step": 1410 }, { "epoch": 4.479495268138801, "grad_norm": 0.47713035345077515, "learning_rate": 5e-05, "loss": 0.9646, "step": 1420 }, { "epoch": 4.511041009463723, "grad_norm": 0.4720157980918884, "learning_rate": 5e-05, "loss": 0.9645, "step": 1430 }, { "epoch": 4.542586750788644, "grad_norm": 0.4508207440376282, "learning_rate": 5e-05, "loss": 0.9669, "step": 1440 }, { "epoch": 4.574132492113565, "grad_norm": 0.4645206928253174, "learning_rate": 5e-05, "loss": 0.9945, "step": 1450 }, { "epoch": 4.605678233438486, "grad_norm": 0.45657721161842346, "learning_rate": 5e-05, "loss": 1.0193, "step": 1460 }, { "epoch": 4.6372239747634065, "grad_norm": 0.48605337738990784, "learning_rate": 5e-05, "loss": 0.9796, "step": 1470 }, { "epoch": 4.668769716088328, "grad_norm": 0.4564870595932007, "learning_rate": 5e-05, "loss": 1.0164, "step": 1480 }, { "epoch": 4.700315457413249, "grad_norm": 0.46090081334114075, "learning_rate": 5e-05, "loss": 0.9854, "step": 1490 }, { "epoch": 4.73186119873817, "grad_norm": 0.4782868027687073, "learning_rate": 5e-05, "loss": 0.985, "step": 1500 }, { "epoch": 4.763406940063091, "grad_norm": 0.45532533526420593, "learning_rate": 5e-05, "loss": 0.9555, "step": 1510 }, { "epoch": 4.794952681388013, "grad_norm": 0.4831511676311493, "learning_rate": 5e-05, "loss": 0.9775, "step": 1520 }, { "epoch": 4.826498422712934, "grad_norm": 0.4660089612007141, "learning_rate": 5e-05, "loss": 0.9805, "step": 1530 }, { "epoch": 4.858044164037855, "grad_norm": 0.47603532671928406, "learning_rate": 5e-05, "loss": 1.0222, "step": 1540 }, { "epoch": 4.889589905362776, "grad_norm": 0.4162875711917877, "learning_rate": 5e-05, "loss": 0.9867, "step": 1550 }, { "epoch": 4.921135646687697, "grad_norm": 0.4378200173377991, "learning_rate": 5e-05, "loss": 0.9762, "step": 1560 }, { "epoch": 4.952681388012619, "grad_norm": 0.43556976318359375, "learning_rate": 5e-05, "loss": 0.9557, "step": 1570 }, { "epoch": 4.9842271293375395, "grad_norm": 0.4165530204772949, "learning_rate": 5e-05, "loss": 0.998, "step": 1580 }, { "epoch": 5.0, "eval_loss": 1.02390456199646, "eval_runtime": 66.4974, "eval_samples_per_second": 4.782, "eval_steps_per_second": 0.602, "step": 1585 }, { "epoch": 5.0157728706624605, "grad_norm": 0.557310938835144, "learning_rate": 5e-05, "loss": 0.9705, "step": 1590 }, { "epoch": 5.047318611987381, "grad_norm": 0.47156888246536255, "learning_rate": 5e-05, "loss": 0.9629, "step": 1600 }, { "epoch": 5.078864353312303, "grad_norm": 0.51046222448349, "learning_rate": 5e-05, "loss": 0.9429, "step": 1610 }, { "epoch": 5.110410094637224, "grad_norm": 0.48319852352142334, "learning_rate": 5e-05, "loss": 0.9637, "step": 1620 }, { "epoch": 5.141955835962145, "grad_norm": 0.45673197507858276, "learning_rate": 5e-05, "loss": 0.9233, "step": 1630 }, { "epoch": 5.173501577287066, "grad_norm": 0.5032113194465637, "learning_rate": 5e-05, "loss": 0.9486, "step": 1640 }, { "epoch": 5.205047318611987, "grad_norm": 0.449439138174057, "learning_rate": 5e-05, "loss": 0.9107, "step": 1650 }, { "epoch": 5.236593059936909, "grad_norm": 0.4683469831943512, "learning_rate": 5e-05, "loss": 0.9608, "step": 1660 }, { "epoch": 5.26813880126183, "grad_norm": 0.48362118005752563, "learning_rate": 5e-05, "loss": 0.9246, "step": 1670 }, { "epoch": 5.299684542586751, "grad_norm": 0.4709579050540924, "learning_rate": 5e-05, "loss": 0.9958, "step": 1680 }, { "epoch": 5.331230283911672, "grad_norm": 0.4630713164806366, "learning_rate": 5e-05, "loss": 0.9837, "step": 1690 }, { "epoch": 5.3627760252365935, "grad_norm": 0.475508451461792, "learning_rate": 5e-05, "loss": 1.0084, "step": 1700 }, { "epoch": 5.394321766561514, "grad_norm": 0.5352875590324402, "learning_rate": 5e-05, "loss": 0.9595, "step": 1710 }, { "epoch": 5.425867507886435, "grad_norm": 0.5087634325027466, "learning_rate": 5e-05, "loss": 0.9697, "step": 1720 }, { "epoch": 5.457413249211356, "grad_norm": 0.4558835029602051, "learning_rate": 5e-05, "loss": 0.9609, "step": 1730 }, { "epoch": 5.488958990536277, "grad_norm": 0.5090092420578003, "learning_rate": 5e-05, "loss": 0.9732, "step": 1740 }, { "epoch": 5.520504731861199, "grad_norm": 0.48192793130874634, "learning_rate": 5e-05, "loss": 0.9917, "step": 1750 }, { "epoch": 5.55205047318612, "grad_norm": 0.4428229033946991, "learning_rate": 5e-05, "loss": 0.9607, "step": 1760 }, { "epoch": 5.583596214511041, "grad_norm": 0.4858005940914154, "learning_rate": 5e-05, "loss": 0.994, "step": 1770 }, { "epoch": 5.615141955835962, "grad_norm": 0.4797442555427551, "learning_rate": 5e-05, "loss": 0.9554, "step": 1780 }, { "epoch": 5.646687697160884, "grad_norm": 0.4797378480434418, "learning_rate": 5e-05, "loss": 0.9486, "step": 1790 }, { "epoch": 5.678233438485805, "grad_norm": 0.4509980082511902, "learning_rate": 5e-05, "loss": 0.9693, "step": 1800 }, { "epoch": 5.709779179810726, "grad_norm": 0.45232152938842773, "learning_rate": 5e-05, "loss": 0.9622, "step": 1810 }, { "epoch": 5.7413249211356465, "grad_norm": 0.49943023920059204, "learning_rate": 5e-05, "loss": 1.0051, "step": 1820 }, { "epoch": 5.7728706624605675, "grad_norm": 0.4827818274497986, "learning_rate": 5e-05, "loss": 0.9536, "step": 1830 }, { "epoch": 5.804416403785489, "grad_norm": 0.4689510464668274, "learning_rate": 5e-05, "loss": 0.9706, "step": 1840 }, { "epoch": 5.83596214511041, "grad_norm": 0.47188493609428406, "learning_rate": 5e-05, "loss": 0.9582, "step": 1850 }, { "epoch": 5.867507886435331, "grad_norm": 0.47195523977279663, "learning_rate": 5e-05, "loss": 0.9688, "step": 1860 }, { "epoch": 5.899053627760252, "grad_norm": 0.4700336158275604, "learning_rate": 5e-05, "loss": 0.9399, "step": 1870 }, { "epoch": 5.930599369085174, "grad_norm": 0.5036072731018066, "learning_rate": 5e-05, "loss": 0.9726, "step": 1880 }, { "epoch": 5.962145110410095, "grad_norm": 0.5032414197921753, "learning_rate": 5e-05, "loss": 0.9426, "step": 1890 }, { "epoch": 5.993690851735016, "grad_norm": 0.4505554139614105, "learning_rate": 5e-05, "loss": 0.9911, "step": 1900 }, { "epoch": 6.0, "eval_loss": 1.0179320573806763, "eval_runtime": 66.4482, "eval_samples_per_second": 4.786, "eval_steps_per_second": 0.602, "step": 1902 }, { "epoch": 6.025236593059937, "grad_norm": 0.48737627267837524, "learning_rate": 5e-05, "loss": 0.9504, "step": 1910 }, { "epoch": 6.056782334384858, "grad_norm": 0.520263135433197, "learning_rate": 5e-05, "loss": 0.9411, "step": 1920 }, { "epoch": 6.0883280757097795, "grad_norm": 0.4799466133117676, "learning_rate": 5e-05, "loss": 0.9448, "step": 1930 }, { "epoch": 6.1198738170347005, "grad_norm": 0.49849933385849, "learning_rate": 5e-05, "loss": 0.9511, "step": 1940 }, { "epoch": 6.151419558359621, "grad_norm": 0.4995006322860718, "learning_rate": 5e-05, "loss": 0.9315, "step": 1950 }, { "epoch": 6.182965299684542, "grad_norm": 0.5434730648994446, "learning_rate": 5e-05, "loss": 0.9509, "step": 1960 }, { "epoch": 6.214511041009464, "grad_norm": 0.5055322647094727, "learning_rate": 5e-05, "loss": 0.9449, "step": 1970 }, { "epoch": 6.246056782334385, "grad_norm": 0.4768029749393463, "learning_rate": 5e-05, "loss": 0.9356, "step": 1980 }, { "epoch": 6.277602523659306, "grad_norm": 0.5039747357368469, "learning_rate": 5e-05, "loss": 0.9478, "step": 1990 }, { "epoch": 6.309148264984227, "grad_norm": 0.5042532086372375, "learning_rate": 5e-05, "loss": 0.8941, "step": 2000 }, { "epoch": 6.340694006309148, "grad_norm": 0.5117079615592957, "learning_rate": 5e-05, "loss": 0.9081, "step": 2010 }, { "epoch": 6.37223974763407, "grad_norm": 0.5625054836273193, "learning_rate": 5e-05, "loss": 0.9588, "step": 2020 }, { "epoch": 6.403785488958991, "grad_norm": 0.49397581815719604, "learning_rate": 5e-05, "loss": 0.9405, "step": 2030 }, { "epoch": 6.435331230283912, "grad_norm": 0.5129591226577759, "learning_rate": 5e-05, "loss": 0.9357, "step": 2040 }, { "epoch": 6.466876971608833, "grad_norm": 0.5299010276794434, "learning_rate": 5e-05, "loss": 0.9425, "step": 2050 }, { "epoch": 6.498422712933754, "grad_norm": 0.512342095375061, "learning_rate": 5e-05, "loss": 0.936, "step": 2060 }, { "epoch": 6.529968454258675, "grad_norm": 0.5136451721191406, "learning_rate": 5e-05, "loss": 0.9549, "step": 2070 }, { "epoch": 6.561514195583596, "grad_norm": 0.6025319695472717, "learning_rate": 5e-05, "loss": 0.9705, "step": 2080 }, { "epoch": 6.593059936908517, "grad_norm": 0.48766204714775085, "learning_rate": 5e-05, "loss": 0.96, "step": 2090 }, { "epoch": 6.624605678233438, "grad_norm": 0.4721720516681671, "learning_rate": 5e-05, "loss": 0.9457, "step": 2100 }, { "epoch": 6.65615141955836, "grad_norm": 0.48331397771835327, "learning_rate": 5e-05, "loss": 0.9105, "step": 2110 }, { "epoch": 6.687697160883281, "grad_norm": 0.4890565872192383, "learning_rate": 5e-05, "loss": 0.9859, "step": 2120 }, { "epoch": 6.719242902208202, "grad_norm": 0.5263992547988892, "learning_rate": 5e-05, "loss": 0.9659, "step": 2130 }, { "epoch": 6.750788643533123, "grad_norm": 0.45187363028526306, "learning_rate": 5e-05, "loss": 0.9319, "step": 2140 }, { "epoch": 6.782334384858045, "grad_norm": 0.4888645112514496, "learning_rate": 5e-05, "loss": 0.9623, "step": 2150 }, { "epoch": 6.813880126182966, "grad_norm": 0.48433786630630493, "learning_rate": 5e-05, "loss": 0.9601, "step": 2160 }, { "epoch": 6.8454258675078865, "grad_norm": 0.5414565205574036, "learning_rate": 5e-05, "loss": 0.9381, "step": 2170 }, { "epoch": 6.8769716088328074, "grad_norm": 0.47471919655799866, "learning_rate": 5e-05, "loss": 0.9443, "step": 2180 }, { "epoch": 6.908517350157728, "grad_norm": 0.4787106513977051, "learning_rate": 5e-05, "loss": 0.9331, "step": 2190 }, { "epoch": 6.94006309148265, "grad_norm": 0.4515725076198578, "learning_rate": 5e-05, "loss": 0.9461, "step": 2200 }, { "epoch": 6.971608832807571, "grad_norm": 0.4714019000530243, "learning_rate": 5e-05, "loss": 0.9587, "step": 2210 }, { "epoch": 7.0, "eval_loss": 1.0130518674850464, "eval_runtime": 66.5051, "eval_samples_per_second": 4.782, "eval_steps_per_second": 0.601, "step": 2219 }, { "epoch": 7.003154574132492, "grad_norm": 0.44392409920692444, "learning_rate": 5e-05, "loss": 0.9691, "step": 2220 }, { "epoch": 7.034700315457413, "grad_norm": 0.538865864276886, "learning_rate": 5e-05, "loss": 0.9095, "step": 2230 }, { "epoch": 7.066246056782334, "grad_norm": 0.5173049569129944, "learning_rate": 5e-05, "loss": 0.9281, "step": 2240 }, { "epoch": 7.097791798107256, "grad_norm": 0.4751831293106079, "learning_rate": 5e-05, "loss": 0.9094, "step": 2250 }, { "epoch": 7.129337539432177, "grad_norm": 0.5221697092056274, "learning_rate": 5e-05, "loss": 0.9148, "step": 2260 }, { "epoch": 7.160883280757098, "grad_norm": 0.5088801383972168, "learning_rate": 5e-05, "loss": 0.9383, "step": 2270 }, { "epoch": 7.192429022082019, "grad_norm": 0.5191715359687805, "learning_rate": 5e-05, "loss": 0.9187, "step": 2280 }, { "epoch": 7.2239747634069404, "grad_norm": 0.5438238382339478, "learning_rate": 5e-05, "loss": 0.9192, "step": 2290 }, { "epoch": 7.255520504731861, "grad_norm": 0.5197346210479736, "learning_rate": 5e-05, "loss": 0.9226, "step": 2300 }, { "epoch": 7.287066246056782, "grad_norm": 0.5286086797714233, "learning_rate": 5e-05, "loss": 0.9009, "step": 2310 }, { "epoch": 7.318611987381703, "grad_norm": 0.4977555274963379, "learning_rate": 5e-05, "loss": 0.9524, "step": 2320 }, { "epoch": 7.350157728706624, "grad_norm": 0.5014932751655579, "learning_rate": 5e-05, "loss": 0.9356, "step": 2330 }, { "epoch": 7.381703470031546, "grad_norm": 0.5207954049110413, "learning_rate": 5e-05, "loss": 0.9095, "step": 2340 }, { "epoch": 7.413249211356467, "grad_norm": 0.512366771697998, "learning_rate": 5e-05, "loss": 0.9357, "step": 2350 }, { "epoch": 7.444794952681388, "grad_norm": 0.5742561221122742, "learning_rate": 5e-05, "loss": 0.9188, "step": 2360 }, { "epoch": 7.476340694006309, "grad_norm": 0.5032497644424438, "learning_rate": 5e-05, "loss": 0.9624, "step": 2370 }, { "epoch": 7.50788643533123, "grad_norm": 0.6190054416656494, "learning_rate": 5e-05, "loss": 0.8954, "step": 2380 }, { "epoch": 7.539432176656152, "grad_norm": 0.5226176977157593, "learning_rate": 5e-05, "loss": 0.9212, "step": 2390 }, { "epoch": 7.570977917981073, "grad_norm": 0.5045409202575684, "learning_rate": 5e-05, "loss": 0.9459, "step": 2400 }, { "epoch": 7.6025236593059935, "grad_norm": 0.48342952132225037, "learning_rate": 5e-05, "loss": 0.9306, "step": 2410 }, { "epoch": 7.634069400630915, "grad_norm": 0.48288047313690186, "learning_rate": 5e-05, "loss": 0.9217, "step": 2420 }, { "epoch": 7.665615141955836, "grad_norm": 0.5119076371192932, "learning_rate": 5e-05, "loss": 0.9594, "step": 2430 }, { "epoch": 7.697160883280757, "grad_norm": 0.5182865262031555, "learning_rate": 5e-05, "loss": 0.9158, "step": 2440 }, { "epoch": 7.728706624605678, "grad_norm": 0.5085521340370178, "learning_rate": 5e-05, "loss": 0.9249, "step": 2450 }, { "epoch": 7.760252365930599, "grad_norm": 0.49291595816612244, "learning_rate": 5e-05, "loss": 0.9128, "step": 2460 }, { "epoch": 7.79179810725552, "grad_norm": 0.5067439675331116, "learning_rate": 5e-05, "loss": 0.8993, "step": 2470 }, { "epoch": 7.823343848580442, "grad_norm": 0.49475356936454773, "learning_rate": 5e-05, "loss": 0.9313, "step": 2480 }, { "epoch": 7.854889589905363, "grad_norm": 0.5028258562088013, "learning_rate": 5e-05, "loss": 0.9459, "step": 2490 }, { "epoch": 7.886435331230284, "grad_norm": 0.482112854719162, "learning_rate": 5e-05, "loss": 0.9243, "step": 2500 }, { "epoch": 7.917981072555205, "grad_norm": 0.5285838842391968, "learning_rate": 5e-05, "loss": 0.9847, "step": 2510 }, { "epoch": 7.9495268138801265, "grad_norm": 0.5217479467391968, "learning_rate": 5e-05, "loss": 0.9172, "step": 2520 }, { "epoch": 7.981072555205047, "grad_norm": 0.47651416063308716, "learning_rate": 5e-05, "loss": 0.9003, "step": 2530 }, { "epoch": 8.0, "eval_loss": 1.0111174583435059, "eval_runtime": 66.5497, "eval_samples_per_second": 4.778, "eval_steps_per_second": 0.601, "step": 2536 }, { "epoch": 8.012618296529968, "grad_norm": 0.5173642635345459, "learning_rate": 5e-05, "loss": 0.9408, "step": 2540 }, { "epoch": 8.04416403785489, "grad_norm": 0.5017074346542358, "learning_rate": 5e-05, "loss": 0.9028, "step": 2550 }, { "epoch": 8.07570977917981, "grad_norm": 0.5437857508659363, "learning_rate": 5e-05, "loss": 0.916, "step": 2560 }, { "epoch": 8.107255520504731, "grad_norm": 0.485762357711792, "learning_rate": 5e-05, "loss": 0.8698, "step": 2570 }, { "epoch": 8.138801261829652, "grad_norm": 0.5231262445449829, "learning_rate": 5e-05, "loss": 0.9004, "step": 2580 }, { "epoch": 8.170347003154575, "grad_norm": 0.49633580446243286, "learning_rate": 5e-05, "loss": 0.9159, "step": 2590 }, { "epoch": 8.201892744479496, "grad_norm": 0.5477921366691589, "learning_rate": 5e-05, "loss": 0.9015, "step": 2600 }, { "epoch": 8.233438485804417, "grad_norm": 0.5651286840438843, "learning_rate": 5e-05, "loss": 0.8925, "step": 2610 }, { "epoch": 8.264984227129338, "grad_norm": 0.5210168957710266, "learning_rate": 5e-05, "loss": 0.9172, "step": 2620 }, { "epoch": 8.296529968454259, "grad_norm": 0.5071650743484497, "learning_rate": 5e-05, "loss": 0.9172, "step": 2630 }, { "epoch": 8.32807570977918, "grad_norm": 0.5585223436355591, "learning_rate": 5e-05, "loss": 0.9234, "step": 2640 }, { "epoch": 8.3596214511041, "grad_norm": 0.5303429961204529, "learning_rate": 5e-05, "loss": 0.896, "step": 2650 }, { "epoch": 8.391167192429021, "grad_norm": 0.5033040046691895, "learning_rate": 5e-05, "loss": 0.9431, "step": 2660 }, { "epoch": 8.422712933753942, "grad_norm": 0.4882967472076416, "learning_rate": 5e-05, "loss": 0.9103, "step": 2670 }, { "epoch": 8.454258675078865, "grad_norm": 0.5938067436218262, "learning_rate": 5e-05, "loss": 0.9067, "step": 2680 }, { "epoch": 8.485804416403786, "grad_norm": 0.5606987476348877, "learning_rate": 5e-05, "loss": 0.9177, "step": 2690 }, { "epoch": 8.517350157728707, "grad_norm": 0.5056515336036682, "learning_rate": 5e-05, "loss": 0.8924, "step": 2700 }, { "epoch": 8.548895899053628, "grad_norm": 0.5208995938301086, "learning_rate": 5e-05, "loss": 0.917, "step": 2710 }, { "epoch": 8.580441640378549, "grad_norm": 0.575134813785553, "learning_rate": 5e-05, "loss": 0.9132, "step": 2720 }, { "epoch": 8.61198738170347, "grad_norm": 0.5263710021972656, "learning_rate": 5e-05, "loss": 0.9162, "step": 2730 }, { "epoch": 8.64353312302839, "grad_norm": 0.5916036367416382, "learning_rate": 5e-05, "loss": 0.9147, "step": 2740 }, { "epoch": 8.675078864353312, "grad_norm": 0.5610800981521606, "learning_rate": 5e-05, "loss": 0.9022, "step": 2750 }, { "epoch": 8.706624605678233, "grad_norm": 0.5309184193611145, "learning_rate": 5e-05, "loss": 0.8736, "step": 2760 }, { "epoch": 8.738170347003155, "grad_norm": 0.5035881996154785, "learning_rate": 5e-05, "loss": 0.898, "step": 2770 }, { "epoch": 8.769716088328076, "grad_norm": 0.5445141196250916, "learning_rate": 5e-05, "loss": 0.903, "step": 2780 }, { "epoch": 8.801261829652997, "grad_norm": 0.5459301471710205, "learning_rate": 5e-05, "loss": 0.9124, "step": 2790 }, { "epoch": 8.832807570977918, "grad_norm": 0.5099250078201294, "learning_rate": 5e-05, "loss": 0.9132, "step": 2800 }, { "epoch": 8.864353312302839, "grad_norm": 0.5143303275108337, "learning_rate": 5e-05, "loss": 0.9085, "step": 2810 }, { "epoch": 8.89589905362776, "grad_norm": 0.5371480584144592, "learning_rate": 5e-05, "loss": 0.9463, "step": 2820 }, { "epoch": 8.927444794952681, "grad_norm": 0.517353892326355, "learning_rate": 5e-05, "loss": 0.8911, "step": 2830 }, { "epoch": 8.958990536277602, "grad_norm": 0.5601980090141296, "learning_rate": 5e-05, "loss": 0.915, "step": 2840 }, { "epoch": 8.990536277602523, "grad_norm": 0.5473778247833252, "learning_rate": 5e-05, "loss": 0.9254, "step": 2850 }, { "epoch": 9.0, "eval_loss": 1.0110243558883667, "eval_runtime": 66.5208, "eval_samples_per_second": 4.78, "eval_steps_per_second": 0.601, "step": 2853 }, { "epoch": 9.022082018927446, "grad_norm": 0.5774141550064087, "learning_rate": 5e-05, "loss": 0.8731, "step": 2860 }, { "epoch": 9.053627760252366, "grad_norm": 0.5381526350975037, "learning_rate": 5e-05, "loss": 0.9125, "step": 2870 }, { "epoch": 9.085173501577287, "grad_norm": 0.5414624810218811, "learning_rate": 5e-05, "loss": 0.8838, "step": 2880 }, { "epoch": 9.116719242902208, "grad_norm": 0.526127815246582, "learning_rate": 5e-05, "loss": 0.8709, "step": 2890 }, { "epoch": 9.14826498422713, "grad_norm": 0.5719351768493652, "learning_rate": 5e-05, "loss": 0.8976, "step": 2900 }, { "epoch": 9.17981072555205, "grad_norm": 0.6119252443313599, "learning_rate": 5e-05, "loss": 0.9006, "step": 2910 }, { "epoch": 9.211356466876971, "grad_norm": 0.5286473035812378, "learning_rate": 5e-05, "loss": 0.873, "step": 2920 }, { "epoch": 9.242902208201892, "grad_norm": 0.5602397918701172, "learning_rate": 5e-05, "loss": 0.9113, "step": 2930 }, { "epoch": 9.274447949526813, "grad_norm": 0.5757038593292236, "learning_rate": 5e-05, "loss": 0.8967, "step": 2940 }, { "epoch": 9.305993690851736, "grad_norm": 0.5797662138938904, "learning_rate": 5e-05, "loss": 0.921, "step": 2950 }, { "epoch": 9.337539432176657, "grad_norm": 0.5598446726799011, "learning_rate": 5e-05, "loss": 0.9121, "step": 2960 }, { "epoch": 9.369085173501578, "grad_norm": 0.5119657516479492, "learning_rate": 5e-05, "loss": 0.8748, "step": 2970 }, { "epoch": 9.400630914826499, "grad_norm": 0.5484170317649841, "learning_rate": 5e-05, "loss": 0.8971, "step": 2980 }, { "epoch": 9.43217665615142, "grad_norm": 0.5351391434669495, "learning_rate": 5e-05, "loss": 0.8466, "step": 2990 }, { "epoch": 9.46372239747634, "grad_norm": 0.5337589979171753, "learning_rate": 5e-05, "loss": 0.8986, "step": 3000 }, { "epoch": 9.495268138801261, "grad_norm": 0.5773183703422546, "learning_rate": 5e-05, "loss": 0.9001, "step": 3010 }, { "epoch": 9.526813880126182, "grad_norm": 0.6163984537124634, "learning_rate": 5e-05, "loss": 0.902, "step": 3020 }, { "epoch": 9.558359621451103, "grad_norm": 0.5879700183868408, "learning_rate": 5e-05, "loss": 0.8855, "step": 3030 }, { "epoch": 9.589905362776026, "grad_norm": 0.5596455335617065, "learning_rate": 5e-05, "loss": 0.9052, "step": 3040 }, { "epoch": 9.621451104100947, "grad_norm": 0.5862036943435669, "learning_rate": 5e-05, "loss": 0.9092, "step": 3050 }, { "epoch": 9.652996845425868, "grad_norm": 0.5491965413093567, "learning_rate": 5e-05, "loss": 0.8887, "step": 3060 }, { "epoch": 9.684542586750789, "grad_norm": 0.5651736259460449, "learning_rate": 5e-05, "loss": 0.8873, "step": 3070 }, { "epoch": 9.71608832807571, "grad_norm": 0.5439329147338867, "learning_rate": 5e-05, "loss": 0.8871, "step": 3080 }, { "epoch": 9.74763406940063, "grad_norm": 0.5257729887962341, "learning_rate": 5e-05, "loss": 0.8711, "step": 3090 }, { "epoch": 9.779179810725552, "grad_norm": 0.5310476422309875, "learning_rate": 5e-05, "loss": 0.9118, "step": 3100 }, { "epoch": 9.810725552050473, "grad_norm": 0.5593333840370178, "learning_rate": 5e-05, "loss": 0.8647, "step": 3110 }, { "epoch": 9.842271293375394, "grad_norm": 0.5591513514518738, "learning_rate": 5e-05, "loss": 0.8788, "step": 3120 }, { "epoch": 9.873817034700316, "grad_norm": 0.5862401723861694, "learning_rate": 5e-05, "loss": 0.9113, "step": 3130 }, { "epoch": 9.905362776025237, "grad_norm": 0.5847012996673584, "learning_rate": 5e-05, "loss": 0.9228, "step": 3140 }, { "epoch": 9.936908517350158, "grad_norm": 0.5507489442825317, "learning_rate": 5e-05, "loss": 0.9114, "step": 3150 }, { "epoch": 9.968454258675079, "grad_norm": 0.5988635420799255, "learning_rate": 5e-05, "loss": 0.8753, "step": 3160 }, { "epoch": 10.0, "grad_norm": 0.6517265439033508, "learning_rate": 5e-05, "loss": 0.8532, "step": 3170 }, { "epoch": 10.0, "eval_loss": 1.0107625722885132, "eval_runtime": 66.5594, "eval_samples_per_second": 4.778, "eval_steps_per_second": 0.601, "step": 3170 } ], "logging_steps": 10, "max_steps": 6340, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1112958196396851e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }