{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1186113789778207, "eval_steps": 100, "global_step": 2900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003857280617164899, "grad_norm": 2.45951247215271, "learning_rate": 2.497749774977498e-05, "loss": 3.6523, "step": 10 }, { "epoch": 0.007714561234329798, "grad_norm": 1.4561721086502075, "learning_rate": 2.4952495249524954e-05, "loss": 2.7058, "step": 20 }, { "epoch": 0.011571841851494697, "grad_norm": 1.2352159023284912, "learning_rate": 2.4927492749274926e-05, "loss": 2.0281, "step": 30 }, { "epoch": 0.015429122468659595, "grad_norm": 1.2523480653762817, "learning_rate": 2.4902490249024905e-05, "loss": 1.5702, "step": 40 }, { "epoch": 0.019286403085824494, "grad_norm": 1.407914400100708, "learning_rate": 2.4877487748774877e-05, "loss": 1.177, "step": 50 }, { "epoch": 0.023143683702989394, "grad_norm": 1.1699292659759521, "learning_rate": 2.4852485248524852e-05, "loss": 0.9623, "step": 60 }, { "epoch": 0.02700096432015429, "grad_norm": 1.0662754774093628, "learning_rate": 2.4827482748274828e-05, "loss": 1.029, "step": 70 }, { "epoch": 0.03085824493731919, "grad_norm": 0.9690182209014893, "learning_rate": 2.4802480248024803e-05, "loss": 0.946, "step": 80 }, { "epoch": 0.03471552555448409, "grad_norm": 0.8241429328918457, "learning_rate": 2.477747774777478e-05, "loss": 0.8613, "step": 90 }, { "epoch": 0.03857280617164899, "grad_norm": 0.9876273274421692, "learning_rate": 2.4752475247524754e-05, "loss": 0.891, "step": 100 }, { "epoch": 0.03857280617164899, "eval_loss": 0.8584423065185547, "eval_runtime": 94.247, "eval_samples_per_second": 55.015, "eval_steps_per_second": 6.886, "step": 100 }, { "epoch": 0.04243008678881389, "grad_norm": 0.8240203261375427, "learning_rate": 2.472747274727473e-05, "loss": 0.8924, "step": 110 }, { "epoch": 0.04628736740597879, "grad_norm": 0.7671812176704407, "learning_rate": 2.47024702470247e-05, "loss": 0.8446, "step": 120 }, { "epoch": 0.05014464802314368, "grad_norm": 0.9588340520858765, "learning_rate": 2.467746774677468e-05, "loss": 0.8491, "step": 130 }, { "epoch": 0.05400192864030858, "grad_norm": 0.9825944304466248, "learning_rate": 2.4652465246524652e-05, "loss": 0.8437, "step": 140 }, { "epoch": 0.05785920925747348, "grad_norm": 0.9779114723205566, "learning_rate": 2.4627462746274628e-05, "loss": 0.8404, "step": 150 }, { "epoch": 0.06171648987463838, "grad_norm": 0.9949918389320374, "learning_rate": 2.4602460246024603e-05, "loss": 0.772, "step": 160 }, { "epoch": 0.06557377049180328, "grad_norm": 0.9132283329963684, "learning_rate": 2.457745774577458e-05, "loss": 0.8132, "step": 170 }, { "epoch": 0.06943105110896818, "grad_norm": 0.8586040735244751, "learning_rate": 2.4552455245524554e-05, "loss": 0.8365, "step": 180 }, { "epoch": 0.07328833172613308, "grad_norm": 0.78518146276474, "learning_rate": 2.452745274527453e-05, "loss": 0.7607, "step": 190 }, { "epoch": 0.07714561234329798, "grad_norm": 1.1228320598602295, "learning_rate": 2.45024502450245e-05, "loss": 0.8262, "step": 200 }, { "epoch": 0.07714561234329798, "eval_loss": 0.7683142423629761, "eval_runtime": 94.3304, "eval_samples_per_second": 54.966, "eval_steps_per_second": 6.88, "step": 200 }, { "epoch": 0.08100289296046287, "grad_norm": 1.47947096824646, "learning_rate": 2.447744774477448e-05, "loss": 0.7629, "step": 210 }, { "epoch": 0.08486017357762778, "grad_norm": 1.398677110671997, "learning_rate": 2.4452445244524452e-05, "loss": 0.7253, "step": 220 }, { "epoch": 0.08871745419479267, "grad_norm": 0.8628906607627869, "learning_rate": 2.4427442744274428e-05, "loss": 0.7691, "step": 230 }, { "epoch": 0.09257473481195758, "grad_norm": 0.9008379578590393, "learning_rate": 2.4402440244024403e-05, "loss": 0.6461, "step": 240 }, { "epoch": 0.09643201542912247, "grad_norm": 0.6998778581619263, "learning_rate": 2.437743774377438e-05, "loss": 0.7174, "step": 250 }, { "epoch": 0.10028929604628736, "grad_norm": 0.863390326499939, "learning_rate": 2.4352435243524354e-05, "loss": 0.6757, "step": 260 }, { "epoch": 0.10414657666345227, "grad_norm": 1.0060020685195923, "learning_rate": 2.432743274327433e-05, "loss": 0.6933, "step": 270 }, { "epoch": 0.10800385728061716, "grad_norm": 0.8257681727409363, "learning_rate": 2.4302430243024305e-05, "loss": 0.7369, "step": 280 }, { "epoch": 0.11186113789778207, "grad_norm": 0.6368749141693115, "learning_rate": 2.4277427742774277e-05, "loss": 0.6612, "step": 290 }, { "epoch": 0.11571841851494696, "grad_norm": 0.8179033994674683, "learning_rate": 2.4252425242524256e-05, "loss": 0.6744, "step": 300 }, { "epoch": 0.11571841851494696, "eval_loss": 0.689677357673645, "eval_runtime": 94.4186, "eval_samples_per_second": 54.915, "eval_steps_per_second": 6.874, "step": 300 }, { "epoch": 0.11957569913211186, "grad_norm": 0.7856632471084595, "learning_rate": 2.4227422742274228e-05, "loss": 0.6407, "step": 310 }, { "epoch": 0.12343297974927676, "grad_norm": 0.788524866104126, "learning_rate": 2.4202420242024203e-05, "loss": 0.7115, "step": 320 }, { "epoch": 0.12729026036644167, "grad_norm": 1.0506746768951416, "learning_rate": 2.417741774177418e-05, "loss": 0.6825, "step": 330 }, { "epoch": 0.13114754098360656, "grad_norm": 1.2924314737319946, "learning_rate": 2.4152415241524154e-05, "loss": 0.6627, "step": 340 }, { "epoch": 0.13500482160077146, "grad_norm": 0.8082237243652344, "learning_rate": 2.4127412741274126e-05, "loss": 0.6895, "step": 350 }, { "epoch": 0.13886210221793635, "grad_norm": 0.787610650062561, "learning_rate": 2.4102410241024105e-05, "loss": 0.6839, "step": 360 }, { "epoch": 0.14271938283510124, "grad_norm": 0.7939244508743286, "learning_rate": 2.4077407740774077e-05, "loss": 0.6533, "step": 370 }, { "epoch": 0.14657666345226616, "grad_norm": 0.7655636668205261, "learning_rate": 2.4052405240524052e-05, "loss": 0.6645, "step": 380 }, { "epoch": 0.15043394406943106, "grad_norm": 0.709829568862915, "learning_rate": 2.402740274027403e-05, "loss": 0.6792, "step": 390 }, { "epoch": 0.15429122468659595, "grad_norm": 0.7088485956192017, "learning_rate": 2.4002400240024003e-05, "loss": 0.6568, "step": 400 }, { "epoch": 0.15429122468659595, "eval_loss": 0.677769124507904, "eval_runtime": 94.3721, "eval_samples_per_second": 54.942, "eval_steps_per_second": 6.877, "step": 400 }, { "epoch": 0.15814850530376084, "grad_norm": 0.739398717880249, "learning_rate": 2.397739773977398e-05, "loss": 0.6802, "step": 410 }, { "epoch": 0.16200578592092574, "grad_norm": 0.7921575307846069, "learning_rate": 2.3952395239523954e-05, "loss": 0.6598, "step": 420 }, { "epoch": 0.16586306653809066, "grad_norm": 0.9333528280258179, "learning_rate": 2.392739273927393e-05, "loss": 0.6543, "step": 430 }, { "epoch": 0.16972034715525555, "grad_norm": 0.906482994556427, "learning_rate": 2.39023902390239e-05, "loss": 0.692, "step": 440 }, { "epoch": 0.17357762777242045, "grad_norm": 0.8562319278717041, "learning_rate": 2.387738773877388e-05, "loss": 0.6963, "step": 450 }, { "epoch": 0.17743490838958534, "grad_norm": 0.8864608407020569, "learning_rate": 2.3852385238523852e-05, "loss": 0.6672, "step": 460 }, { "epoch": 0.18129218900675023, "grad_norm": 0.7445130944252014, "learning_rate": 2.3827382738273828e-05, "loss": 0.6052, "step": 470 }, { "epoch": 0.18514946962391515, "grad_norm": 0.751557469367981, "learning_rate": 2.3802380238023803e-05, "loss": 0.6301, "step": 480 }, { "epoch": 0.18900675024108005, "grad_norm": 0.6981202960014343, "learning_rate": 2.377737773777378e-05, "loss": 0.6423, "step": 490 }, { "epoch": 0.19286403085824494, "grad_norm": 0.9979777336120605, "learning_rate": 2.3752375237523754e-05, "loss": 0.6075, "step": 500 }, { "epoch": 0.19286403085824494, "eval_loss": 0.6642400622367859, "eval_runtime": 94.3518, "eval_samples_per_second": 54.954, "eval_steps_per_second": 6.879, "step": 500 }, { "epoch": 0.19672131147540983, "grad_norm": 0.7130064368247986, "learning_rate": 2.372737273727373e-05, "loss": 0.6054, "step": 510 }, { "epoch": 0.20057859209257473, "grad_norm": 0.7771989703178406, "learning_rate": 2.37023702370237e-05, "loss": 0.6621, "step": 520 }, { "epoch": 0.20443587270973965, "grad_norm": 0.8572603464126587, "learning_rate": 2.3677367736773677e-05, "loss": 0.6563, "step": 530 }, { "epoch": 0.20829315332690454, "grad_norm": 0.8305298686027527, "learning_rate": 2.3652365236523656e-05, "loss": 0.6658, "step": 540 }, { "epoch": 0.21215043394406943, "grad_norm": 0.8520190119743347, "learning_rate": 2.3627362736273628e-05, "loss": 0.638, "step": 550 }, { "epoch": 0.21600771456123433, "grad_norm": 0.9404274225234985, "learning_rate": 2.3602360236023603e-05, "loss": 0.6067, "step": 560 }, { "epoch": 0.21986499517839922, "grad_norm": 0.8018991351127625, "learning_rate": 2.357735773577358e-05, "loss": 0.6385, "step": 570 }, { "epoch": 0.22372227579556414, "grad_norm": 0.8628789186477661, "learning_rate": 2.3552355235523554e-05, "loss": 0.6554, "step": 580 }, { "epoch": 0.22757955641272903, "grad_norm": 0.8279526829719543, "learning_rate": 2.352735273527353e-05, "loss": 0.6114, "step": 590 }, { "epoch": 0.23143683702989393, "grad_norm": 0.8158825635910034, "learning_rate": 2.3502350235023505e-05, "loss": 0.6149, "step": 600 }, { "epoch": 0.23143683702989393, "eval_loss": 0.6521801352500916, "eval_runtime": 94.3458, "eval_samples_per_second": 54.957, "eval_steps_per_second": 6.879, "step": 600 }, { "epoch": 0.23529411764705882, "grad_norm": 0.8334428071975708, "learning_rate": 2.3477347734773477e-05, "loss": 0.6769, "step": 610 }, { "epoch": 0.2391513982642237, "grad_norm": 0.9083623886108398, "learning_rate": 2.3452345234523456e-05, "loss": 0.6331, "step": 620 }, { "epoch": 0.24300867888138863, "grad_norm": 1.199766993522644, "learning_rate": 2.3427342734273428e-05, "loss": 0.6967, "step": 630 }, { "epoch": 0.24686595949855353, "grad_norm": 1.2198294401168823, "learning_rate": 2.3402340234023403e-05, "loss": 0.6618, "step": 640 }, { "epoch": 0.2507232401157184, "grad_norm": 0.8489105701446533, "learning_rate": 2.337733773377338e-05, "loss": 0.6242, "step": 650 }, { "epoch": 0.25458052073288334, "grad_norm": 1.0652421712875366, "learning_rate": 2.3352335233523354e-05, "loss": 0.6192, "step": 660 }, { "epoch": 0.25843780135004824, "grad_norm": 0.7928668856620789, "learning_rate": 2.3327332733273326e-05, "loss": 0.5706, "step": 670 }, { "epoch": 0.26229508196721313, "grad_norm": 0.8512901663780212, "learning_rate": 2.3302330233023305e-05, "loss": 0.6457, "step": 680 }, { "epoch": 0.266152362584378, "grad_norm": 0.8443427085876465, "learning_rate": 2.327732773277328e-05, "loss": 0.601, "step": 690 }, { "epoch": 0.2700096432015429, "grad_norm": 0.8724773526191711, "learning_rate": 2.3252325232523252e-05, "loss": 0.6476, "step": 700 }, { "epoch": 0.2700096432015429, "eval_loss": 0.6422102451324463, "eval_runtime": 94.4282, "eval_samples_per_second": 54.909, "eval_steps_per_second": 6.873, "step": 700 }, { "epoch": 0.2738669238187078, "grad_norm": 0.8733763098716736, "learning_rate": 2.322732273227323e-05, "loss": 0.6523, "step": 710 }, { "epoch": 0.2777242044358727, "grad_norm": 0.8932089805603027, "learning_rate": 2.3202320232023203e-05, "loss": 0.6305, "step": 720 }, { "epoch": 0.2815814850530376, "grad_norm": 0.9854605197906494, "learning_rate": 2.317731773177318e-05, "loss": 0.6358, "step": 730 }, { "epoch": 0.2854387656702025, "grad_norm": 0.8158785700798035, "learning_rate": 2.3152315231523154e-05, "loss": 0.6027, "step": 740 }, { "epoch": 0.2892960462873674, "grad_norm": 0.9273302555084229, "learning_rate": 2.312731273127313e-05, "loss": 0.6431, "step": 750 }, { "epoch": 0.29315332690453233, "grad_norm": 0.9094042181968689, "learning_rate": 2.31023102310231e-05, "loss": 0.5767, "step": 760 }, { "epoch": 0.2970106075216972, "grad_norm": 0.8175253868103027, "learning_rate": 2.307730773077308e-05, "loss": 0.6174, "step": 770 }, { "epoch": 0.3008678881388621, "grad_norm": 0.8517961502075195, "learning_rate": 2.3052305230523052e-05, "loss": 0.6183, "step": 780 }, { "epoch": 0.304725168756027, "grad_norm": 0.8863179087638855, "learning_rate": 2.3027302730273028e-05, "loss": 0.5849, "step": 790 }, { "epoch": 0.3085824493731919, "grad_norm": 0.9195278882980347, "learning_rate": 2.3002300230023003e-05, "loss": 0.6016, "step": 800 }, { "epoch": 0.3085824493731919, "eval_loss": 0.6320463418960571, "eval_runtime": 94.3592, "eval_samples_per_second": 54.95, "eval_steps_per_second": 6.878, "step": 800 }, { "epoch": 0.3124397299903568, "grad_norm": 0.9424280524253845, "learning_rate": 2.297729772977298e-05, "loss": 0.5961, "step": 810 }, { "epoch": 0.3162970106075217, "grad_norm": 1.031079888343811, "learning_rate": 2.295229522952295e-05, "loss": 0.6357, "step": 820 }, { "epoch": 0.3201542912246866, "grad_norm": 0.9320313334465027, "learning_rate": 2.292729272927293e-05, "loss": 0.6228, "step": 830 }, { "epoch": 0.3240115718418515, "grad_norm": 0.9292299747467041, "learning_rate": 2.2902290229022905e-05, "loss": 0.6504, "step": 840 }, { "epoch": 0.32786885245901637, "grad_norm": 0.8377825021743774, "learning_rate": 2.2877287728772877e-05, "loss": 0.5952, "step": 850 }, { "epoch": 0.3317261330761813, "grad_norm": 0.8555241227149963, "learning_rate": 2.2852285228522856e-05, "loss": 0.5852, "step": 860 }, { "epoch": 0.3355834136933462, "grad_norm": 1.0691065788269043, "learning_rate": 2.2827282728272828e-05, "loss": 0.5806, "step": 870 }, { "epoch": 0.3394406943105111, "grad_norm": 1.0052144527435303, "learning_rate": 2.2802280228022803e-05, "loss": 0.6592, "step": 880 }, { "epoch": 0.343297974927676, "grad_norm": 1.000553011894226, "learning_rate": 2.277727772777278e-05, "loss": 0.6347, "step": 890 }, { "epoch": 0.3471552555448409, "grad_norm": 1.13107430934906, "learning_rate": 2.2752275227522754e-05, "loss": 0.5989, "step": 900 }, { "epoch": 0.3471552555448409, "eval_loss": 0.6231358647346497, "eval_runtime": 94.3809, "eval_samples_per_second": 54.937, "eval_steps_per_second": 6.876, "step": 900 }, { "epoch": 0.3510125361620058, "grad_norm": 1.0130326747894287, "learning_rate": 2.272727272727273e-05, "loss": 0.6227, "step": 910 }, { "epoch": 0.3548698167791707, "grad_norm": 1.0335384607315063, "learning_rate": 2.2702270227022705e-05, "loss": 0.5277, "step": 920 }, { "epoch": 0.35872709739633557, "grad_norm": 0.9162185788154602, "learning_rate": 2.2677267726772677e-05, "loss": 0.5633, "step": 930 }, { "epoch": 0.36258437801350046, "grad_norm": 0.9492796063423157, "learning_rate": 2.2652265226522652e-05, "loss": 0.6536, "step": 940 }, { "epoch": 0.36644165863066536, "grad_norm": 1.0065137147903442, "learning_rate": 2.2627262726272628e-05, "loss": 0.6622, "step": 950 }, { "epoch": 0.3702989392478303, "grad_norm": 0.917143702507019, "learning_rate": 2.2602260226022603e-05, "loss": 0.6391, "step": 960 }, { "epoch": 0.3741562198649952, "grad_norm": 0.9580853581428528, "learning_rate": 2.257725772577258e-05, "loss": 0.6354, "step": 970 }, { "epoch": 0.3780135004821601, "grad_norm": 1.1998488903045654, "learning_rate": 2.2552255225522554e-05, "loss": 0.5885, "step": 980 }, { "epoch": 0.381870781099325, "grad_norm": 0.9667923450469971, "learning_rate": 2.252725272527253e-05, "loss": 0.6199, "step": 990 }, { "epoch": 0.3857280617164899, "grad_norm": 0.9675014019012451, "learning_rate": 2.2502250225022505e-05, "loss": 0.5522, "step": 1000 }, { "epoch": 0.3857280617164899, "eval_loss": 0.6154375672340393, "eval_runtime": 94.3972, "eval_samples_per_second": 54.927, "eval_steps_per_second": 6.875, "step": 1000 }, { "epoch": 0.38958534233365477, "grad_norm": 1.035885214805603, "learning_rate": 2.247724772477248e-05, "loss": 0.5868, "step": 1010 }, { "epoch": 0.39344262295081966, "grad_norm": 1.1226266622543335, "learning_rate": 2.2452245224522452e-05, "loss": 0.5787, "step": 1020 }, { "epoch": 0.39729990356798456, "grad_norm": 1.0908483266830444, "learning_rate": 2.2427242724272428e-05, "loss": 0.6161, "step": 1030 }, { "epoch": 0.40115718418514945, "grad_norm": 0.9660767316818237, "learning_rate": 2.2402240224022403e-05, "loss": 0.6277, "step": 1040 }, { "epoch": 0.40501446480231434, "grad_norm": 0.9711313843727112, "learning_rate": 2.237723772377238e-05, "loss": 0.6062, "step": 1050 }, { "epoch": 0.4088717454194793, "grad_norm": 0.9374969601631165, "learning_rate": 2.2352235223522354e-05, "loss": 0.6299, "step": 1060 }, { "epoch": 0.4127290260366442, "grad_norm": 1.0570039749145508, "learning_rate": 2.232723272327233e-05, "loss": 0.5965, "step": 1070 }, { "epoch": 0.4165863066538091, "grad_norm": 1.0144932270050049, "learning_rate": 2.23022302230223e-05, "loss": 0.5479, "step": 1080 }, { "epoch": 0.420443587270974, "grad_norm": 0.9654034972190857, "learning_rate": 2.227722772277228e-05, "loss": 0.5768, "step": 1090 }, { "epoch": 0.42430086788813887, "grad_norm": 0.9580025672912598, "learning_rate": 2.2252225222522252e-05, "loss": 0.6023, "step": 1100 }, { "epoch": 0.42430086788813887, "eval_loss": 0.6073106527328491, "eval_runtime": 94.4156, "eval_samples_per_second": 54.917, "eval_steps_per_second": 6.874, "step": 1100 }, { "epoch": 0.42815814850530376, "grad_norm": 1.0227288007736206, "learning_rate": 2.2227222722272228e-05, "loss": 0.5824, "step": 1110 }, { "epoch": 0.43201542912246865, "grad_norm": 0.977800726890564, "learning_rate": 2.2202220222022203e-05, "loss": 0.5629, "step": 1120 }, { "epoch": 0.43587270973963355, "grad_norm": 0.9433587789535522, "learning_rate": 2.217721772177218e-05, "loss": 0.5774, "step": 1130 }, { "epoch": 0.43972999035679844, "grad_norm": 1.0534788370132446, "learning_rate": 2.2152215221522154e-05, "loss": 0.6191, "step": 1140 }, { "epoch": 0.44358727097396333, "grad_norm": 0.9741374850273132, "learning_rate": 2.212721272127213e-05, "loss": 0.5989, "step": 1150 }, { "epoch": 0.4474445515911283, "grad_norm": 1.1215403079986572, "learning_rate": 2.2102210221022105e-05, "loss": 0.6547, "step": 1160 }, { "epoch": 0.4513018322082932, "grad_norm": 1.1161948442459106, "learning_rate": 2.2077207720772077e-05, "loss": 0.5999, "step": 1170 }, { "epoch": 0.45515911282545807, "grad_norm": 1.1462429761886597, "learning_rate": 2.2052205220522055e-05, "loss": 0.6458, "step": 1180 }, { "epoch": 0.45901639344262296, "grad_norm": 1.0904706716537476, "learning_rate": 2.2027202720272027e-05, "loss": 0.5839, "step": 1190 }, { "epoch": 0.46287367405978785, "grad_norm": 1.0991252660751343, "learning_rate": 2.2002200220022003e-05, "loss": 0.5403, "step": 1200 }, { "epoch": 0.46287367405978785, "eval_loss": 0.6001272797584534, "eval_runtime": 94.5589, "eval_samples_per_second": 54.834, "eval_steps_per_second": 6.863, "step": 1200 }, { "epoch": 0.46673095467695275, "grad_norm": 1.221454381942749, "learning_rate": 2.197719771977198e-05, "loss": 0.6155, "step": 1210 }, { "epoch": 0.47058823529411764, "grad_norm": 1.0147477388381958, "learning_rate": 2.1952195219521954e-05, "loss": 0.62, "step": 1220 }, { "epoch": 0.47444551591128253, "grad_norm": 1.0702507495880127, "learning_rate": 2.1927192719271926e-05, "loss": 0.5605, "step": 1230 }, { "epoch": 0.4783027965284474, "grad_norm": 1.295518398284912, "learning_rate": 2.1902190219021905e-05, "loss": 0.5065, "step": 1240 }, { "epoch": 0.4821600771456123, "grad_norm": 1.1323541402816772, "learning_rate": 2.1877187718771877e-05, "loss": 0.5726, "step": 1250 }, { "epoch": 0.48601735776277727, "grad_norm": 0.9562482833862305, "learning_rate": 2.1852185218521852e-05, "loss": 0.5683, "step": 1260 }, { "epoch": 0.48987463837994216, "grad_norm": 1.129547119140625, "learning_rate": 2.1827182718271827e-05, "loss": 0.5732, "step": 1270 }, { "epoch": 0.49373191899710706, "grad_norm": 1.0175765752792358, "learning_rate": 2.1802180218021803e-05, "loss": 0.5251, "step": 1280 }, { "epoch": 0.49758919961427195, "grad_norm": 1.1538267135620117, "learning_rate": 2.177717771777178e-05, "loss": 0.5798, "step": 1290 }, { "epoch": 0.5014464802314368, "grad_norm": 1.1203854084014893, "learning_rate": 2.1752175217521754e-05, "loss": 0.535, "step": 1300 }, { "epoch": 0.5014464802314368, "eval_loss": 0.593771755695343, "eval_runtime": 94.4579, "eval_samples_per_second": 54.892, "eval_steps_per_second": 6.871, "step": 1300 }, { "epoch": 0.5053037608486017, "grad_norm": 1.158937692642212, "learning_rate": 2.172717271727173e-05, "loss": 0.5667, "step": 1310 }, { "epoch": 0.5091610414657667, "grad_norm": 1.1078110933303833, "learning_rate": 2.17021702170217e-05, "loss": 0.6097, "step": 1320 }, { "epoch": 0.5130183220829315, "grad_norm": 1.1934500932693481, "learning_rate": 2.167716771677168e-05, "loss": 0.5478, "step": 1330 }, { "epoch": 0.5168756027000965, "grad_norm": 1.048662781715393, "learning_rate": 2.1652165216521652e-05, "loss": 0.5753, "step": 1340 }, { "epoch": 0.5207328833172613, "grad_norm": 1.0503116846084595, "learning_rate": 2.1627162716271627e-05, "loss": 0.5762, "step": 1350 }, { "epoch": 0.5245901639344263, "grad_norm": 1.1861109733581543, "learning_rate": 2.1602160216021603e-05, "loss": 0.5808, "step": 1360 }, { "epoch": 0.5284474445515911, "grad_norm": 1.178539752960205, "learning_rate": 2.1577157715771578e-05, "loss": 0.5584, "step": 1370 }, { "epoch": 0.532304725168756, "grad_norm": 1.0662671327590942, "learning_rate": 2.1552155215521554e-05, "loss": 0.5535, "step": 1380 }, { "epoch": 0.5361620057859209, "grad_norm": 1.1202431917190552, "learning_rate": 2.152715271527153e-05, "loss": 0.5555, "step": 1390 }, { "epoch": 0.5400192864030858, "grad_norm": 1.1992982625961304, "learning_rate": 2.15021502150215e-05, "loss": 0.5712, "step": 1400 }, { "epoch": 0.5400192864030858, "eval_loss": 0.5875272750854492, "eval_runtime": 94.4312, "eval_samples_per_second": 54.908, "eval_steps_per_second": 6.873, "step": 1400 }, { "epoch": 0.5438765670202508, "grad_norm": 1.1259962320327759, "learning_rate": 2.147714771477148e-05, "loss": 0.5676, "step": 1410 }, { "epoch": 0.5477338476374156, "grad_norm": 1.0652165412902832, "learning_rate": 2.1452145214521452e-05, "loss": 0.5551, "step": 1420 }, { "epoch": 0.5515911282545806, "grad_norm": 1.1056393384933472, "learning_rate": 2.1427142714271427e-05, "loss": 0.508, "step": 1430 }, { "epoch": 0.5554484088717454, "grad_norm": 1.1506450176239014, "learning_rate": 2.1402140214021403e-05, "loss": 0.582, "step": 1440 }, { "epoch": 0.5593056894889104, "grad_norm": 1.4107190370559692, "learning_rate": 2.1377137713771378e-05, "loss": 0.5821, "step": 1450 }, { "epoch": 0.5631629701060752, "grad_norm": 1.2830005884170532, "learning_rate": 2.1352135213521354e-05, "loss": 0.5451, "step": 1460 }, { "epoch": 0.5670202507232401, "grad_norm": 1.1122502088546753, "learning_rate": 2.132713271327133e-05, "loss": 0.5905, "step": 1470 }, { "epoch": 0.570877531340405, "grad_norm": 1.1104683876037598, "learning_rate": 2.1302130213021305e-05, "loss": 0.6189, "step": 1480 }, { "epoch": 0.5747348119575699, "grad_norm": 1.2569029331207275, "learning_rate": 2.1277127712771277e-05, "loss": 0.5717, "step": 1490 }, { "epoch": 0.5785920925747348, "grad_norm": 1.1278156042099, "learning_rate": 2.1252125212521255e-05, "loss": 0.5686, "step": 1500 }, { "epoch": 0.5785920925747348, "eval_loss": 0.5797137022018433, "eval_runtime": 94.4199, "eval_samples_per_second": 54.914, "eval_steps_per_second": 6.874, "step": 1500 }, { "epoch": 0.5824493731918997, "grad_norm": 1.075393795967102, "learning_rate": 2.1227122712271227e-05, "loss": 0.5849, "step": 1510 }, { "epoch": 0.5863066538090647, "grad_norm": 1.2325960397720337, "learning_rate": 2.1202120212021203e-05, "loss": 0.5706, "step": 1520 }, { "epoch": 0.5901639344262295, "grad_norm": 1.1058759689331055, "learning_rate": 2.1177117711771178e-05, "loss": 0.5706, "step": 1530 }, { "epoch": 0.5940212150433944, "grad_norm": 1.1634057760238647, "learning_rate": 2.1152115211521154e-05, "loss": 0.5518, "step": 1540 }, { "epoch": 0.5978784956605593, "grad_norm": 1.0119497776031494, "learning_rate": 2.1127112711271126e-05, "loss": 0.5104, "step": 1550 }, { "epoch": 0.6017357762777242, "grad_norm": 1.2648943662643433, "learning_rate": 2.1102110211021104e-05, "loss": 0.5261, "step": 1560 }, { "epoch": 0.6055930568948891, "grad_norm": 1.2454555034637451, "learning_rate": 2.1077107710771077e-05, "loss": 0.5633, "step": 1570 }, { "epoch": 0.609450337512054, "grad_norm": 1.1793566942214966, "learning_rate": 2.1052105210521052e-05, "loss": 0.535, "step": 1580 }, { "epoch": 0.6133076181292189, "grad_norm": 1.5229750871658325, "learning_rate": 2.102710271027103e-05, "loss": 0.5559, "step": 1590 }, { "epoch": 0.6171648987463838, "grad_norm": 1.2203059196472168, "learning_rate": 2.1002100210021003e-05, "loss": 0.5315, "step": 1600 }, { "epoch": 0.6171648987463838, "eval_loss": 0.5732572078704834, "eval_runtime": 94.4141, "eval_samples_per_second": 54.918, "eval_steps_per_second": 6.874, "step": 1600 }, { "epoch": 0.6210221793635486, "grad_norm": 1.4130253791809082, "learning_rate": 2.0977097709770978e-05, "loss": 0.5521, "step": 1610 }, { "epoch": 0.6248794599807136, "grad_norm": 1.2830981016159058, "learning_rate": 2.0952095209520954e-05, "loss": 0.5432, "step": 1620 }, { "epoch": 0.6287367405978785, "grad_norm": 1.1956433057785034, "learning_rate": 2.092709270927093e-05, "loss": 0.5746, "step": 1630 }, { "epoch": 0.6325940212150434, "grad_norm": 1.5104076862335205, "learning_rate": 2.09020902090209e-05, "loss": 0.5916, "step": 1640 }, { "epoch": 0.6364513018322083, "grad_norm": 1.2112847566604614, "learning_rate": 2.087708770877088e-05, "loss": 0.5322, "step": 1650 }, { "epoch": 0.6403085824493732, "grad_norm": 1.1859279870986938, "learning_rate": 2.0852085208520852e-05, "loss": 0.599, "step": 1660 }, { "epoch": 0.6441658630665381, "grad_norm": 1.348300576210022, "learning_rate": 2.0827082708270827e-05, "loss": 0.605, "step": 1670 }, { "epoch": 0.648023143683703, "grad_norm": 1.3982155323028564, "learning_rate": 2.0802080208020803e-05, "loss": 0.5367, "step": 1680 }, { "epoch": 0.6518804243008679, "grad_norm": 1.2189476490020752, "learning_rate": 2.0777077707770778e-05, "loss": 0.5855, "step": 1690 }, { "epoch": 0.6557377049180327, "grad_norm": 1.3908072710037231, "learning_rate": 2.0752075207520754e-05, "loss": 0.5876, "step": 1700 }, { "epoch": 0.6557377049180327, "eval_loss": 0.5670270919799805, "eval_runtime": 94.4393, "eval_samples_per_second": 54.903, "eval_steps_per_second": 6.872, "step": 1700 }, { "epoch": 0.6595949855351977, "grad_norm": 1.150038480758667, "learning_rate": 2.072707270727073e-05, "loss": 0.5151, "step": 1710 }, { "epoch": 0.6634522661523626, "grad_norm": 1.2351560592651367, "learning_rate": 2.07020702070207e-05, "loss": 0.5171, "step": 1720 }, { "epoch": 0.6673095467695275, "grad_norm": 1.2720533609390259, "learning_rate": 2.0677067706770676e-05, "loss": 0.5526, "step": 1730 }, { "epoch": 0.6711668273866924, "grad_norm": 1.2330290079116821, "learning_rate": 2.0652065206520655e-05, "loss": 0.5516, "step": 1740 }, { "epoch": 0.6750241080038573, "grad_norm": 1.319873571395874, "learning_rate": 2.0627062706270627e-05, "loss": 0.5512, "step": 1750 }, { "epoch": 0.6788813886210222, "grad_norm": 1.663527250289917, "learning_rate": 2.0602060206020603e-05, "loss": 0.556, "step": 1760 }, { "epoch": 0.682738669238187, "grad_norm": 1.2730813026428223, "learning_rate": 2.0577057705770578e-05, "loss": 0.5362, "step": 1770 }, { "epoch": 0.686595949855352, "grad_norm": 1.2985719442367554, "learning_rate": 2.0552055205520554e-05, "loss": 0.6448, "step": 1780 }, { "epoch": 0.6904532304725168, "grad_norm": 1.384941577911377, "learning_rate": 2.052705270527053e-05, "loss": 0.5767, "step": 1790 }, { "epoch": 0.6943105110896818, "grad_norm": 1.2721012830734253, "learning_rate": 2.0502050205020504e-05, "loss": 0.6248, "step": 1800 }, { "epoch": 0.6943105110896818, "eval_loss": 0.560900092124939, "eval_runtime": 94.4846, "eval_samples_per_second": 54.877, "eval_steps_per_second": 6.869, "step": 1800 }, { "epoch": 0.6981677917068466, "grad_norm": 1.3880654573440552, "learning_rate": 2.0477047704770476e-05, "loss": 0.5389, "step": 1810 }, { "epoch": 0.7020250723240116, "grad_norm": 1.2518627643585205, "learning_rate": 2.0452045204520455e-05, "loss": 0.566, "step": 1820 }, { "epoch": 0.7058823529411765, "grad_norm": 1.4524362087249756, "learning_rate": 2.0427042704270427e-05, "loss": 0.5105, "step": 1830 }, { "epoch": 0.7097396335583414, "grad_norm": 1.2816158533096313, "learning_rate": 2.0402040204020403e-05, "loss": 0.5308, "step": 1840 }, { "epoch": 0.7135969141755063, "grad_norm": 1.286135196685791, "learning_rate": 2.0377037703770378e-05, "loss": 0.5273, "step": 1850 }, { "epoch": 0.7174541947926711, "grad_norm": 1.4501844644546509, "learning_rate": 2.0352035203520354e-05, "loss": 0.5622, "step": 1860 }, { "epoch": 0.7213114754098361, "grad_norm": 1.3340784311294556, "learning_rate": 2.0327032703270326e-05, "loss": 0.6137, "step": 1870 }, { "epoch": 0.7251687560270009, "grad_norm": 1.439643383026123, "learning_rate": 2.0302030203020304e-05, "loss": 0.5846, "step": 1880 }, { "epoch": 0.7290260366441659, "grad_norm": 1.2474430799484253, "learning_rate": 2.027702770277028e-05, "loss": 0.5519, "step": 1890 }, { "epoch": 0.7328833172613307, "grad_norm": 1.0996040105819702, "learning_rate": 2.0252025202520252e-05, "loss": 0.5481, "step": 1900 }, { "epoch": 0.7328833172613307, "eval_loss": 0.5548669695854187, "eval_runtime": 94.5252, "eval_samples_per_second": 54.853, "eval_steps_per_second": 6.866, "step": 1900 }, { "epoch": 0.7367405978784957, "grad_norm": 1.5467498302459717, "learning_rate": 2.022702270227023e-05, "loss": 0.546, "step": 1910 }, { "epoch": 0.7405978784956606, "grad_norm": 1.4486864805221558, "learning_rate": 2.0202020202020203e-05, "loss": 0.5239, "step": 1920 }, { "epoch": 0.7444551591128254, "grad_norm": 1.3535338640213013, "learning_rate": 2.0177017701770178e-05, "loss": 0.5733, "step": 1930 }, { "epoch": 0.7483124397299904, "grad_norm": 1.4148615598678589, "learning_rate": 2.0152015201520154e-05, "loss": 0.5177, "step": 1940 }, { "epoch": 0.7521697203471552, "grad_norm": 1.5134552717208862, "learning_rate": 2.012701270127013e-05, "loss": 0.5643, "step": 1950 }, { "epoch": 0.7560270009643202, "grad_norm": 1.5626767873764038, "learning_rate": 2.01020102010201e-05, "loss": 0.5317, "step": 1960 }, { "epoch": 0.759884281581485, "grad_norm": 1.3729217052459717, "learning_rate": 2.007700770077008e-05, "loss": 0.5859, "step": 1970 }, { "epoch": 0.76374156219865, "grad_norm": 1.5823298692703247, "learning_rate": 2.0052005200520052e-05, "loss": 0.517, "step": 1980 }, { "epoch": 0.7675988428158148, "grad_norm": 1.4126390218734741, "learning_rate": 2.0027002700270027e-05, "loss": 0.578, "step": 1990 }, { "epoch": 0.7714561234329798, "grad_norm": 1.5024161338806152, "learning_rate": 2.0002000200020003e-05, "loss": 0.4779, "step": 2000 }, { "epoch": 0.7714561234329798, "eval_loss": 0.548928439617157, "eval_runtime": 94.5508, "eval_samples_per_second": 54.838, "eval_steps_per_second": 6.864, "step": 2000 }, { "epoch": 0.7753134040501446, "grad_norm": 1.4644631147384644, "learning_rate": 1.9976997699769978e-05, "loss": 0.545, "step": 2010 }, { "epoch": 0.7791706846673095, "grad_norm": 1.394882082939148, "learning_rate": 1.995199519951995e-05, "loss": 0.5502, "step": 2020 }, { "epoch": 0.7830279652844745, "grad_norm": 1.4921457767486572, "learning_rate": 1.992699269926993e-05, "loss": 0.6197, "step": 2030 }, { "epoch": 0.7868852459016393, "grad_norm": 1.3136405944824219, "learning_rate": 1.9901990199019904e-05, "loss": 0.5296, "step": 2040 }, { "epoch": 0.7907425265188043, "grad_norm": 1.5223480463027954, "learning_rate": 1.9876987698769876e-05, "loss": 0.4991, "step": 2050 }, { "epoch": 0.7945998071359691, "grad_norm": 1.4527870416641235, "learning_rate": 1.9851985198519855e-05, "loss": 0.5194, "step": 2060 }, { "epoch": 0.7984570877531341, "grad_norm": 1.4777238368988037, "learning_rate": 1.9826982698269827e-05, "loss": 0.5511, "step": 2070 }, { "epoch": 0.8023143683702989, "grad_norm": 1.8136184215545654, "learning_rate": 1.9801980198019803e-05, "loss": 0.5819, "step": 2080 }, { "epoch": 0.8061716489874639, "grad_norm": 1.7190624475479126, "learning_rate": 1.9776977697769778e-05, "loss": 0.5725, "step": 2090 }, { "epoch": 0.8100289296046287, "grad_norm": 1.2566032409667969, "learning_rate": 1.9751975197519753e-05, "loss": 0.5471, "step": 2100 }, { "epoch": 0.8100289296046287, "eval_loss": 0.5430962443351746, "eval_runtime": 94.4905, "eval_samples_per_second": 54.873, "eval_steps_per_second": 6.868, "step": 2100 }, { "epoch": 0.8138862102217936, "grad_norm": 1.1948508024215698, "learning_rate": 1.9726972697269725e-05, "loss": 0.5449, "step": 2110 }, { "epoch": 0.8177434908389586, "grad_norm": 1.355807900428772, "learning_rate": 1.9701970197019704e-05, "loss": 0.5238, "step": 2120 }, { "epoch": 0.8216007714561234, "grad_norm": 1.4238370656967163, "learning_rate": 1.9676967696769676e-05, "loss": 0.5425, "step": 2130 }, { "epoch": 0.8254580520732884, "grad_norm": 1.5667427778244019, "learning_rate": 1.9651965196519652e-05, "loss": 0.5571, "step": 2140 }, { "epoch": 0.8293153326904532, "grad_norm": 1.5513569116592407, "learning_rate": 1.9626962696269627e-05, "loss": 0.5631, "step": 2150 }, { "epoch": 0.8331726133076182, "grad_norm": 1.3871880769729614, "learning_rate": 1.9601960196019603e-05, "loss": 0.5687, "step": 2160 }, { "epoch": 0.837029893924783, "grad_norm": 1.4342153072357178, "learning_rate": 1.9576957695769578e-05, "loss": 0.5193, "step": 2170 }, { "epoch": 0.840887174541948, "grad_norm": 1.4925063848495483, "learning_rate": 1.9551955195519553e-05, "loss": 0.5548, "step": 2180 }, { "epoch": 0.8447444551591128, "grad_norm": 1.5816041231155396, "learning_rate": 1.952695269526953e-05, "loss": 0.5538, "step": 2190 }, { "epoch": 0.8486017357762777, "grad_norm": 1.803604006767273, "learning_rate": 1.9501950195019504e-05, "loss": 0.4947, "step": 2200 }, { "epoch": 0.8486017357762777, "eval_loss": 0.5378134846687317, "eval_runtime": 94.5121, "eval_samples_per_second": 54.861, "eval_steps_per_second": 6.867, "step": 2200 }, { "epoch": 0.8524590163934426, "grad_norm": 1.5246657133102417, "learning_rate": 1.947694769476948e-05, "loss": 0.5183, "step": 2210 }, { "epoch": 0.8563162970106075, "grad_norm": 1.4470975399017334, "learning_rate": 1.9451945194519452e-05, "loss": 0.5197, "step": 2220 }, { "epoch": 0.8601735776277725, "grad_norm": 1.6767865419387817, "learning_rate": 1.9426942694269427e-05, "loss": 0.5654, "step": 2230 }, { "epoch": 0.8640308582449373, "grad_norm": 1.5155974626541138, "learning_rate": 1.9401940194019403e-05, "loss": 0.6042, "step": 2240 }, { "epoch": 0.8678881388621023, "grad_norm": 1.6148077249526978, "learning_rate": 1.9376937693769378e-05, "loss": 0.5055, "step": 2250 }, { "epoch": 0.8717454194792671, "grad_norm": 1.5768954753875732, "learning_rate": 1.9351935193519353e-05, "loss": 0.4966, "step": 2260 }, { "epoch": 0.875602700096432, "grad_norm": 1.5010885000228882, "learning_rate": 1.932693269326933e-05, "loss": 0.501, "step": 2270 }, { "epoch": 0.8794599807135969, "grad_norm": 1.661967158317566, "learning_rate": 1.93019301930193e-05, "loss": 0.5405, "step": 2280 }, { "epoch": 0.8833172613307618, "grad_norm": 1.5393158197402954, "learning_rate": 1.927692769276928e-05, "loss": 0.5544, "step": 2290 }, { "epoch": 0.8871745419479267, "grad_norm": 1.7475782632827759, "learning_rate": 1.9251925192519252e-05, "loss": 0.6173, "step": 2300 }, { "epoch": 0.8871745419479267, "eval_loss": 0.5320296287536621, "eval_runtime": 94.5369, "eval_samples_per_second": 54.846, "eval_steps_per_second": 6.865, "step": 2300 }, { "epoch": 0.8910318225650916, "grad_norm": 1.3934800624847412, "learning_rate": 1.9226922692269227e-05, "loss": 0.5125, "step": 2310 }, { "epoch": 0.8948891031822566, "grad_norm": 1.6484580039978027, "learning_rate": 1.9201920192019203e-05, "loss": 0.494, "step": 2320 }, { "epoch": 0.8987463837994214, "grad_norm": 1.6516157388687134, "learning_rate": 1.9176917691769178e-05, "loss": 0.5253, "step": 2330 }, { "epoch": 0.9026036644165863, "grad_norm": 1.5073869228363037, "learning_rate": 1.9151915191519153e-05, "loss": 0.5516, "step": 2340 }, { "epoch": 0.9064609450337512, "grad_norm": 1.582481026649475, "learning_rate": 1.912691269126913e-05, "loss": 0.5621, "step": 2350 }, { "epoch": 0.9103182256509161, "grad_norm": 1.4449944496154785, "learning_rate": 1.9101910191019104e-05, "loss": 0.5494, "step": 2360 }, { "epoch": 0.914175506268081, "grad_norm": 1.7907747030258179, "learning_rate": 1.9076907690769076e-05, "loss": 0.5404, "step": 2370 }, { "epoch": 0.9180327868852459, "grad_norm": 1.719509243965149, "learning_rate": 1.9051905190519055e-05, "loss": 0.5283, "step": 2380 }, { "epoch": 0.9218900675024108, "grad_norm": 1.5800633430480957, "learning_rate": 1.9026902690269027e-05, "loss": 0.5292, "step": 2390 }, { "epoch": 0.9257473481195757, "grad_norm": 1.4846770763397217, "learning_rate": 1.9001900190019003e-05, "loss": 0.524, "step": 2400 }, { "epoch": 0.9257473481195757, "eval_loss": 0.5241175889968872, "eval_runtime": 94.4587, "eval_samples_per_second": 54.892, "eval_steps_per_second": 6.871, "step": 2400 }, { "epoch": 0.9296046287367405, "grad_norm": 1.7714641094207764, "learning_rate": 1.8976897689768978e-05, "loss": 0.4915, "step": 2410 }, { "epoch": 0.9334619093539055, "grad_norm": 1.964656114578247, "learning_rate": 1.8951895189518953e-05, "loss": 0.4874, "step": 2420 }, { "epoch": 0.9373191899710704, "grad_norm": 1.6763602495193481, "learning_rate": 1.8926892689268925e-05, "loss": 0.5526, "step": 2430 }, { "epoch": 0.9411764705882353, "grad_norm": 1.6096868515014648, "learning_rate": 1.8901890189018904e-05, "loss": 0.5101, "step": 2440 }, { "epoch": 0.9450337512054002, "grad_norm": 1.5164107084274292, "learning_rate": 1.8876887688768876e-05, "loss": 0.5307, "step": 2450 }, { "epoch": 0.9488910318225651, "grad_norm": 1.4356317520141602, "learning_rate": 1.885188518851885e-05, "loss": 0.4733, "step": 2460 }, { "epoch": 0.95274831243973, "grad_norm": 1.6256446838378906, "learning_rate": 1.8826882688268827e-05, "loss": 0.5726, "step": 2470 }, { "epoch": 0.9566055930568949, "grad_norm": 1.5358326435089111, "learning_rate": 1.8801880188018802e-05, "loss": 0.5134, "step": 2480 }, { "epoch": 0.9604628736740598, "grad_norm": 1.862509846687317, "learning_rate": 1.8776877687768778e-05, "loss": 0.5277, "step": 2490 }, { "epoch": 0.9643201542912246, "grad_norm": 1.7659302949905396, "learning_rate": 1.8751875187518753e-05, "loss": 0.5523, "step": 2500 }, { "epoch": 0.9643201542912246, "eval_loss": 0.5180462002754211, "eval_runtime": 94.4176, "eval_samples_per_second": 54.916, "eval_steps_per_second": 6.874, "step": 2500 }, { "epoch": 0.9681774349083896, "grad_norm": 1.5947084426879883, "learning_rate": 1.872687268726873e-05, "loss": 0.5419, "step": 2510 }, { "epoch": 0.9720347155255545, "grad_norm": 1.829914927482605, "learning_rate": 1.87018701870187e-05, "loss": 0.5897, "step": 2520 }, { "epoch": 0.9758919961427194, "grad_norm": 1.3083444833755493, "learning_rate": 1.867686768676868e-05, "loss": 0.4932, "step": 2530 }, { "epoch": 0.9797492767598843, "grad_norm": 1.5652191638946533, "learning_rate": 1.865186518651865e-05, "loss": 0.4967, "step": 2540 }, { "epoch": 0.9836065573770492, "grad_norm": 1.7959744930267334, "learning_rate": 1.8626862686268627e-05, "loss": 0.4934, "step": 2550 }, { "epoch": 0.9874638379942141, "grad_norm": 1.6218141317367554, "learning_rate": 1.8601860186018602e-05, "loss": 0.4809, "step": 2560 }, { "epoch": 0.991321118611379, "grad_norm": 1.641104817390442, "learning_rate": 1.8576857685768578e-05, "loss": 0.4789, "step": 2570 }, { "epoch": 0.9951783992285439, "grad_norm": 1.732410192489624, "learning_rate": 1.8551855185518553e-05, "loss": 0.4998, "step": 2580 }, { "epoch": 0.9990356798457087, "grad_norm": 1.8680731058120728, "learning_rate": 1.852685268526853e-05, "loss": 0.5097, "step": 2590 }, { "epoch": 1.0028929604628736, "grad_norm": 1.7208608388900757, "learning_rate": 1.85018501850185e-05, "loss": 0.4809, "step": 2600 }, { "epoch": 1.0028929604628736, "eval_loss": 0.5124805569648743, "eval_runtime": 94.4179, "eval_samples_per_second": 54.915, "eval_steps_per_second": 6.874, "step": 2600 }, { "epoch": 1.0067502410800386, "grad_norm": 1.9916785955429077, "learning_rate": 1.847684768476848e-05, "loss": 0.4324, "step": 2610 }, { "epoch": 1.0106075216972035, "grad_norm": 1.5762462615966797, "learning_rate": 1.845184518451845e-05, "loss": 0.4817, "step": 2620 }, { "epoch": 1.0144648023143683, "grad_norm": 2.0109360218048096, "learning_rate": 1.8426842684268427e-05, "loss": 0.441, "step": 2630 }, { "epoch": 1.0183220829315334, "grad_norm": 1.7828129529953003, "learning_rate": 1.8401840184018402e-05, "loss": 0.4551, "step": 2640 }, { "epoch": 1.0221793635486982, "grad_norm": 1.7471317052841187, "learning_rate": 1.8376837683768378e-05, "loss": 0.3956, "step": 2650 }, { "epoch": 1.026036644165863, "grad_norm": 1.9026498794555664, "learning_rate": 1.8351835183518353e-05, "loss": 0.4544, "step": 2660 }, { "epoch": 1.0298939247830279, "grad_norm": 1.9493508338928223, "learning_rate": 1.832683268326833e-05, "loss": 0.4609, "step": 2670 }, { "epoch": 1.033751205400193, "grad_norm": 1.8381072282791138, "learning_rate": 1.8301830183018304e-05, "loss": 0.4321, "step": 2680 }, { "epoch": 1.0376084860173578, "grad_norm": 1.5527135133743286, "learning_rate": 1.8276827682768276e-05, "loss": 0.4112, "step": 2690 }, { "epoch": 1.0414657666345226, "grad_norm": 2.231661319732666, "learning_rate": 1.8251825182518255e-05, "loss": 0.4279, "step": 2700 }, { "epoch": 1.0414657666345226, "eval_loss": 0.5103564262390137, "eval_runtime": 94.417, "eval_samples_per_second": 54.916, "eval_steps_per_second": 6.874, "step": 2700 }, { "epoch": 1.0453230472516875, "grad_norm": 3.195507049560547, "learning_rate": 1.8226822682268227e-05, "loss": 0.4678, "step": 2710 }, { "epoch": 1.0491803278688525, "grad_norm": 1.8608683347702026, "learning_rate": 1.8201820182018202e-05, "loss": 0.4831, "step": 2720 }, { "epoch": 1.0530376084860174, "grad_norm": 2.1820995807647705, "learning_rate": 1.8176817681768178e-05, "loss": 0.4, "step": 2730 }, { "epoch": 1.0568948891031822, "grad_norm": 1.7552732229232788, "learning_rate": 1.8151815181518153e-05, "loss": 0.4431, "step": 2740 }, { "epoch": 1.0607521697203472, "grad_norm": 2.040696859359741, "learning_rate": 1.8126812681268125e-05, "loss": 0.528, "step": 2750 }, { "epoch": 1.064609450337512, "grad_norm": 1.7921245098114014, "learning_rate": 1.8101810181018104e-05, "loss": 0.449, "step": 2760 }, { "epoch": 1.068466730954677, "grad_norm": 2.0593929290771484, "learning_rate": 1.8076807680768076e-05, "loss": 0.41, "step": 2770 }, { "epoch": 1.0723240115718418, "grad_norm": 2.059739112854004, "learning_rate": 1.805180518051805e-05, "loss": 0.4451, "step": 2780 }, { "epoch": 1.0761812921890068, "grad_norm": 2.0607693195343018, "learning_rate": 1.802680268026803e-05, "loss": 0.4387, "step": 2790 }, { "epoch": 1.0800385728061717, "grad_norm": 1.7160958051681519, "learning_rate": 1.8001800180018002e-05, "loss": 0.4501, "step": 2800 }, { "epoch": 1.0800385728061717, "eval_loss": 0.5034841895103455, "eval_runtime": 94.5244, "eval_samples_per_second": 54.854, "eval_steps_per_second": 6.866, "step": 2800 }, { "epoch": 1.0838958534233365, "grad_norm": 1.879629373550415, "learning_rate": 1.7976797679767978e-05, "loss": 0.4553, "step": 2810 }, { "epoch": 1.0877531340405016, "grad_norm": 2.0610523223876953, "learning_rate": 1.7951795179517953e-05, "loss": 0.4842, "step": 2820 }, { "epoch": 1.0916104146576664, "grad_norm": 1.8454833030700684, "learning_rate": 1.792679267926793e-05, "loss": 0.4288, "step": 2830 }, { "epoch": 1.0954676952748312, "grad_norm": 1.7830801010131836, "learning_rate": 1.79017901790179e-05, "loss": 0.4552, "step": 2840 }, { "epoch": 1.099324975891996, "grad_norm": 1.7110368013381958, "learning_rate": 1.787678767876788e-05, "loss": 0.4557, "step": 2850 }, { "epoch": 1.1031822565091611, "grad_norm": 2.69413161277771, "learning_rate": 1.785178517851785e-05, "loss": 0.5252, "step": 2860 }, { "epoch": 1.107039537126326, "grad_norm": 2.2572829723358154, "learning_rate": 1.7826782678267827e-05, "loss": 0.5042, "step": 2870 }, { "epoch": 1.1108968177434908, "grad_norm": 2.144115447998047, "learning_rate": 1.7801780178017802e-05, "loss": 0.4615, "step": 2880 }, { "epoch": 1.1147540983606556, "grad_norm": 1.661698818206787, "learning_rate": 1.7776777677767778e-05, "loss": 0.429, "step": 2890 }, { "epoch": 1.1186113789778207, "grad_norm": 2.2900257110595703, "learning_rate": 1.7751775177517753e-05, "loss": 0.4651, "step": 2900 }, { "epoch": 1.1186113789778207, "eval_loss": 0.4993349611759186, "eval_runtime": 94.4361, "eval_samples_per_second": 54.905, "eval_steps_per_second": 6.872, "step": 2900 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3606948147288474e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }