{ "best_metric": 9.438363075256348, "best_model_checkpoint": "./output/checkpoint-4200", "epoch": 6.083650190114068, "eval_steps": 150, "global_step": 4800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012674271229404309, "grad_norm": 1.1669180393218994, "learning_rate": 1e-05, "loss": 10.3758, "step": 10 }, { "epoch": 0.025348542458808618, "grad_norm": 0.8870534300804138, "learning_rate": 2e-05, "loss": 10.3737, "step": 20 }, { "epoch": 0.03802281368821293, "grad_norm": 1.3315657377243042, "learning_rate": 3e-05, "loss": 10.368, "step": 30 }, { "epoch": 0.050697084917617236, "grad_norm": 1.3620336055755615, "learning_rate": 4e-05, "loss": 10.3623, "step": 40 }, { "epoch": 0.06337135614702155, "grad_norm": 1.9475831985473633, "learning_rate": 5e-05, "loss": 10.3501, "step": 50 }, { "epoch": 0.07604562737642585, "grad_norm": 1.6437187194824219, "learning_rate": 6e-05, "loss": 10.3274, "step": 60 }, { "epoch": 0.08871989860583017, "grad_norm": 1.1295599937438965, "learning_rate": 7e-05, "loss": 10.3138, "step": 70 }, { "epoch": 0.10139416983523447, "grad_norm": 1.2060078382492065, "learning_rate": 8e-05, "loss": 10.2926, "step": 80 }, { "epoch": 0.11406844106463879, "grad_norm": 1.0347387790679932, "learning_rate": 9e-05, "loss": 10.2681, "step": 90 }, { "epoch": 0.1267427122940431, "grad_norm": 1.117771029472351, "learning_rate": 0.0001, "loss": 10.2437, "step": 100 }, { "epoch": 0.1394169835234474, "grad_norm": 1.1378909349441528, "learning_rate": 9.99989723479183e-05, "loss": 10.2028, "step": 110 }, { "epoch": 0.1520912547528517, "grad_norm": 1.100027322769165, "learning_rate": 9.999588943391597e-05, "loss": 10.1804, "step": 120 }, { "epoch": 0.16476552598225602, "grad_norm": 0.8653411865234375, "learning_rate": 9.999075138471951e-05, "loss": 10.156, "step": 130 }, { "epoch": 0.17743979721166034, "grad_norm": 1.123915672302246, "learning_rate": 9.9983558411534e-05, "loss": 10.1186, "step": 140 }, { "epoch": 0.19011406844106463, "grad_norm": 1.0851942300796509, "learning_rate": 9.99743108100344e-05, "loss": 10.0932, "step": 150 }, { "epoch": 0.19011406844106463, "eval_loss": 10.073542594909668, "eval_runtime": 2.5025, "eval_samples_per_second": 199.804, "eval_steps_per_second": 199.804, "step": 150 }, { "epoch": 0.20278833967046894, "grad_norm": 1.0364855527877808, "learning_rate": 9.996300896035339e-05, "loss": 10.0653, "step": 160 }, { "epoch": 0.21546261089987326, "grad_norm": 0.9104143977165222, "learning_rate": 9.994965332706573e-05, "loss": 10.0582, "step": 170 }, { "epoch": 0.22813688212927757, "grad_norm": 0.9514102339744568, "learning_rate": 9.993424445916923e-05, "loss": 10.0115, "step": 180 }, { "epoch": 0.24081115335868186, "grad_norm": 0.847805917263031, "learning_rate": 9.991678299006205e-05, "loss": 9.9946, "step": 190 }, { "epoch": 0.2534854245880862, "grad_norm": 0.872083306312561, "learning_rate": 9.989726963751682e-05, "loss": 9.9719, "step": 200 }, { "epoch": 0.2661596958174905, "grad_norm": 1.1582080125808716, "learning_rate": 9.987570520365104e-05, "loss": 9.9385, "step": 210 }, { "epoch": 0.2788339670468948, "grad_norm": 0.785866916179657, "learning_rate": 9.98520905748941e-05, "loss": 9.9325, "step": 220 }, { "epoch": 0.2915082382762991, "grad_norm": 0.909336268901825, "learning_rate": 9.982642672195092e-05, "loss": 9.9029, "step": 230 }, { "epoch": 0.3041825095057034, "grad_norm": 0.8668178915977478, "learning_rate": 9.979871469976196e-05, "loss": 9.8908, "step": 240 }, { "epoch": 0.31685678073510776, "grad_norm": 0.9314823746681213, "learning_rate": 9.976895564745991e-05, "loss": 9.8654, "step": 250 }, { "epoch": 0.32953105196451205, "grad_norm": 0.7770909667015076, "learning_rate": 9.973715078832288e-05, "loss": 9.8517, "step": 260 }, { "epoch": 0.34220532319391633, "grad_norm": 0.8226707577705383, "learning_rate": 9.970330142972401e-05, "loss": 9.8345, "step": 270 }, { "epoch": 0.3548795944233207, "grad_norm": 1.000295639038086, "learning_rate": 9.966740896307791e-05, "loss": 9.8048, "step": 280 }, { "epoch": 0.36755386565272496, "grad_norm": 0.9318373799324036, "learning_rate": 9.962947486378326e-05, "loss": 9.8118, "step": 290 }, { "epoch": 0.38022813688212925, "grad_norm": 0.7704954743385315, "learning_rate": 9.95895006911623e-05, "loss": 9.7825, "step": 300 }, { "epoch": 0.38022813688212925, "eval_loss": 9.766677856445312, "eval_runtime": 2.5091, "eval_samples_per_second": 199.276, "eval_steps_per_second": 199.276, "step": 300 }, { "epoch": 0.3929024081115336, "grad_norm": 0.7742841839790344, "learning_rate": 9.954748808839674e-05, "loss": 9.766, "step": 310 }, { "epoch": 0.4055766793409379, "grad_norm": 0.7012545466423035, "learning_rate": 9.95034387824601e-05, "loss": 9.7737, "step": 320 }, { "epoch": 0.41825095057034223, "grad_norm": 0.9269613027572632, "learning_rate": 9.945735458404681e-05, "loss": 9.7264, "step": 330 }, { "epoch": 0.4309252217997465, "grad_norm": 0.8523616194725037, "learning_rate": 9.940923738749778e-05, "loss": 9.7115, "step": 340 }, { "epoch": 0.4435994930291508, "grad_norm": 0.7607449889183044, "learning_rate": 9.935908917072252e-05, "loss": 9.7171, "step": 350 }, { "epoch": 0.45627376425855515, "grad_norm": 0.7919751405715942, "learning_rate": 9.930691199511775e-05, "loss": 9.7054, "step": 360 }, { "epoch": 0.46894803548795944, "grad_norm": 0.7790911197662354, "learning_rate": 9.925270800548285e-05, "loss": 9.6927, "step": 370 }, { "epoch": 0.4816223067173637, "grad_norm": 0.9352760910987854, "learning_rate": 9.919647942993148e-05, "loss": 9.6947, "step": 380 }, { "epoch": 0.49429657794676807, "grad_norm": 0.8070693612098694, "learning_rate": 9.91382285798002e-05, "loss": 9.668, "step": 390 }, { "epoch": 0.5069708491761724, "grad_norm": 1.024916648864746, "learning_rate": 9.907795784955327e-05, "loss": 9.6635, "step": 400 }, { "epoch": 0.5196451204055766, "grad_norm": 0.8214429020881653, "learning_rate": 9.901566971668437e-05, "loss": 9.6875, "step": 410 }, { "epoch": 0.532319391634981, "grad_norm": 0.772158145904541, "learning_rate": 9.895136674161465e-05, "loss": 9.6507, "step": 420 }, { "epoch": 0.5449936628643853, "grad_norm": 0.8375211954116821, "learning_rate": 9.888505156758759e-05, "loss": 9.6561, "step": 430 }, { "epoch": 0.5576679340937896, "grad_norm": 0.8345258831977844, "learning_rate": 9.881672692056021e-05, "loss": 9.6464, "step": 440 }, { "epoch": 0.5703422053231939, "grad_norm": 0.8216052055358887, "learning_rate": 9.874639560909117e-05, "loss": 9.6384, "step": 450 }, { "epoch": 0.5703422053231939, "eval_loss": 9.625658988952637, "eval_runtime": 2.4793, "eval_samples_per_second": 201.673, "eval_steps_per_second": 201.673, "step": 450 }, { "epoch": 0.5830164765525983, "grad_norm": 0.7313345074653625, "learning_rate": 9.867406052422524e-05, "loss": 9.6274, "step": 460 }, { "epoch": 0.5956907477820025, "grad_norm": 1.0418285131454468, "learning_rate": 9.859972463937441e-05, "loss": 9.6169, "step": 470 }, { "epoch": 0.6083650190114068, "grad_norm": 1.0138803720474243, "learning_rate": 9.852339101019574e-05, "loss": 9.6184, "step": 480 }, { "epoch": 0.6210392902408112, "grad_norm": 0.9741042852401733, "learning_rate": 9.844506277446577e-05, "loss": 9.6151, "step": 490 }, { "epoch": 0.6337135614702155, "grad_norm": 0.7746536731719971, "learning_rate": 9.836474315195147e-05, "loss": 9.6212, "step": 500 }, { "epoch": 0.6463878326996197, "grad_norm": 0.7604137659072876, "learning_rate": 9.828243544427796e-05, "loss": 9.6016, "step": 510 }, { "epoch": 0.6590621039290241, "grad_norm": 0.8758711814880371, "learning_rate": 9.819814303479267e-05, "loss": 9.6122, "step": 520 }, { "epoch": 0.6717363751584284, "grad_norm": 0.8378990292549133, "learning_rate": 9.811186938842645e-05, "loss": 9.5875, "step": 530 }, { "epoch": 0.6844106463878327, "grad_norm": 0.8295283317565918, "learning_rate": 9.802361805155097e-05, "loss": 9.5996, "step": 540 }, { "epoch": 0.697084917617237, "grad_norm": 0.8388843536376953, "learning_rate": 9.793339265183303e-05, "loss": 9.5976, "step": 550 }, { "epoch": 0.7097591888466414, "grad_norm": 0.8450760841369629, "learning_rate": 9.784119689808544e-05, "loss": 9.5894, "step": 560 }, { "epoch": 0.7224334600760456, "grad_norm": 0.8286111354827881, "learning_rate": 9.774703458011453e-05, "loss": 9.5918, "step": 570 }, { "epoch": 0.7351077313054499, "grad_norm": 0.7997474670410156, "learning_rate": 9.765090956856436e-05, "loss": 9.5999, "step": 580 }, { "epoch": 0.7477820025348543, "grad_norm": 0.9216246008872986, "learning_rate": 9.755282581475769e-05, "loss": 9.5944, "step": 590 }, { "epoch": 0.7604562737642585, "grad_norm": 0.6118740439414978, "learning_rate": 9.745278735053343e-05, "loss": 9.5913, "step": 600 }, { "epoch": 0.7604562737642585, "eval_loss": 9.573010444641113, "eval_runtime": 2.4434, "eval_samples_per_second": 204.629, "eval_steps_per_second": 204.629, "step": 600 }, { "epoch": 0.7731305449936628, "grad_norm": 0.7758316993713379, "learning_rate": 9.735079828808107e-05, "loss": 9.5751, "step": 610 }, { "epoch": 0.7858048162230672, "grad_norm": 0.7930996417999268, "learning_rate": 9.724686281977146e-05, "loss": 9.6014, "step": 620 }, { "epoch": 0.7984790874524715, "grad_norm": 0.9326168894767761, "learning_rate": 9.714098521798465e-05, "loss": 9.5753, "step": 630 }, { "epoch": 0.8111533586818758, "grad_norm": 0.7726701498031616, "learning_rate": 9.703316983493414e-05, "loss": 9.5833, "step": 640 }, { "epoch": 0.8238276299112801, "grad_norm": 0.7833057641983032, "learning_rate": 9.692342110248802e-05, "loss": 9.5825, "step": 650 }, { "epoch": 0.8365019011406845, "grad_norm": 0.8093037009239197, "learning_rate": 9.681174353198687e-05, "loss": 9.5705, "step": 660 }, { "epoch": 0.8491761723700887, "grad_norm": 1.0097565650939941, "learning_rate": 9.669814171405816e-05, "loss": 9.5801, "step": 670 }, { "epoch": 0.861850443599493, "grad_norm": 0.8601319789886475, "learning_rate": 9.65826203184277e-05, "loss": 9.5646, "step": 680 }, { "epoch": 0.8745247148288974, "grad_norm": 0.7913228869438171, "learning_rate": 9.64651840937276e-05, "loss": 9.5771, "step": 690 }, { "epoch": 0.8871989860583016, "grad_norm": 0.730390191078186, "learning_rate": 9.63458378673011e-05, "loss": 9.5433, "step": 700 }, { "epoch": 0.899873257287706, "grad_norm": 0.9280151724815369, "learning_rate": 9.622458654500409e-05, "loss": 9.5334, "step": 710 }, { "epoch": 0.9125475285171103, "grad_norm": 0.8011367917060852, "learning_rate": 9.610143511100354e-05, "loss": 9.543, "step": 720 }, { "epoch": 0.9252217997465145, "grad_norm": 0.7572063207626343, "learning_rate": 9.597638862757255e-05, "loss": 9.5421, "step": 730 }, { "epoch": 0.9378960709759189, "grad_norm": 0.6425439119338989, "learning_rate": 9.584945223488227e-05, "loss": 9.546, "step": 740 }, { "epoch": 0.9505703422053232, "grad_norm": 0.7719993591308594, "learning_rate": 9.572063115079063e-05, "loss": 9.519, "step": 750 }, { "epoch": 0.9505703422053232, "eval_loss": 9.523771286010742, "eval_runtime": 2.4436, "eval_samples_per_second": 204.616, "eval_steps_per_second": 204.616, "step": 750 }, { "epoch": 0.9632446134347274, "grad_norm": 0.8080549240112305, "learning_rate": 9.558993067062785e-05, "loss": 9.5503, "step": 760 }, { "epoch": 0.9759188846641318, "grad_norm": 0.7494069337844849, "learning_rate": 9.545735616697875e-05, "loss": 9.5379, "step": 770 }, { "epoch": 0.9885931558935361, "grad_norm": 0.8330128788948059, "learning_rate": 9.53229130894619e-05, "loss": 9.5348, "step": 780 }, { "epoch": 1.0012674271229405, "grad_norm": 0.7647750377655029, "learning_rate": 9.518660696450568e-05, "loss": 9.5243, "step": 790 }, { "epoch": 1.0139416983523448, "grad_norm": 0.6513275504112244, "learning_rate": 9.504844339512095e-05, "loss": 9.497, "step": 800 }, { "epoch": 1.026615969581749, "grad_norm": 0.645017683506012, "learning_rate": 9.490842806067095e-05, "loss": 9.5162, "step": 810 }, { "epoch": 1.0392902408111533, "grad_norm": 0.7399146556854248, "learning_rate": 9.476656671663765e-05, "loss": 9.504, "step": 820 }, { "epoch": 1.0519645120405576, "grad_norm": 0.8449930548667908, "learning_rate": 9.46228651943853e-05, "loss": 9.5187, "step": 830 }, { "epoch": 1.064638783269962, "grad_norm": 0.9231324195861816, "learning_rate": 9.44773294009206e-05, "loss": 9.5209, "step": 840 }, { "epoch": 1.0773130544993663, "grad_norm": 1.298727035522461, "learning_rate": 9.432996531865002e-05, "loss": 9.5094, "step": 850 }, { "epoch": 1.0899873257287707, "grad_norm": 0.7451304793357849, "learning_rate": 9.418077900513377e-05, "loss": 9.4958, "step": 860 }, { "epoch": 1.102661596958175, "grad_norm": 0.89738929271698, "learning_rate": 9.40297765928369e-05, "loss": 9.5266, "step": 870 }, { "epoch": 1.1153358681875791, "grad_norm": 0.8584068417549133, "learning_rate": 9.387696428887716e-05, "loss": 9.5107, "step": 880 }, { "epoch": 1.1280101394169835, "grad_norm": 0.7765348553657532, "learning_rate": 9.372234837476978e-05, "loss": 9.4897, "step": 890 }, { "epoch": 1.1406844106463878, "grad_norm": 0.9170354008674622, "learning_rate": 9.356593520616948e-05, "loss": 9.4966, "step": 900 }, { "epoch": 1.1406844106463878, "eval_loss": 9.50583267211914, "eval_runtime": 2.4554, "eval_samples_per_second": 203.631, "eval_steps_per_second": 203.631, "step": 900 }, { "epoch": 1.1533586818757922, "grad_norm": 0.8190200328826904, "learning_rate": 9.340773121260893e-05, "loss": 9.5031, "step": 910 }, { "epoch": 1.1660329531051965, "grad_norm": 0.6572229862213135, "learning_rate": 9.324774289723468e-05, "loss": 9.4963, "step": 920 }, { "epoch": 1.1787072243346008, "grad_norm": 1.4209988117218018, "learning_rate": 9.308597683653975e-05, "loss": 9.497, "step": 930 }, { "epoch": 1.1913814955640052, "grad_norm": 0.7789145112037659, "learning_rate": 9.292243968009331e-05, "loss": 9.4981, "step": 940 }, { "epoch": 1.2040557667934093, "grad_norm": 0.744008481502533, "learning_rate": 9.275713815026731e-05, "loss": 9.4749, "step": 950 }, { "epoch": 1.2167300380228137, "grad_norm": 0.8584849834442139, "learning_rate": 9.259007904196023e-05, "loss": 9.5064, "step": 960 }, { "epoch": 1.229404309252218, "grad_norm": 0.8089480400085449, "learning_rate": 9.242126922231763e-05, "loss": 9.4889, "step": 970 }, { "epoch": 1.2420785804816223, "grad_norm": 0.8510288000106812, "learning_rate": 9.225071563045007e-05, "loss": 9.4976, "step": 980 }, { "epoch": 1.2547528517110267, "grad_norm": 0.8175288438796997, "learning_rate": 9.207842527714767e-05, "loss": 9.5021, "step": 990 }, { "epoch": 1.2674271229404308, "grad_norm": 0.8296633958816528, "learning_rate": 9.190440524459203e-05, "loss": 9.4761, "step": 1000 }, { "epoch": 1.2801013941698351, "grad_norm": 0.6615609526634216, "learning_rate": 9.172866268606513e-05, "loss": 9.4957, "step": 1010 }, { "epoch": 1.2927756653992395, "grad_norm": 0.9342919588088989, "learning_rate": 9.155120482565521e-05, "loss": 9.4997, "step": 1020 }, { "epoch": 1.3054499366286438, "grad_norm": 0.7679300904273987, "learning_rate": 9.137203895795983e-05, "loss": 9.5052, "step": 1030 }, { "epoch": 1.3181242078580482, "grad_norm": 0.7991816401481628, "learning_rate": 9.119117244778607e-05, "loss": 9.4666, "step": 1040 }, { "epoch": 1.3307984790874525, "grad_norm": 0.962862491607666, "learning_rate": 9.10086127298478e-05, "loss": 9.4864, "step": 1050 }, { "epoch": 1.3307984790874525, "eval_loss": 9.489490509033203, "eval_runtime": 2.4608, "eval_samples_per_second": 203.186, "eval_steps_per_second": 203.186, "step": 1050 }, { "epoch": 1.3434727503168569, "grad_norm": 0.8132615685462952, "learning_rate": 9.082436730845993e-05, "loss": 9.4913, "step": 1060 }, { "epoch": 1.3561470215462612, "grad_norm": 0.8389043211936951, "learning_rate": 9.063844375723014e-05, "loss": 9.4917, "step": 1070 }, { "epoch": 1.3688212927756653, "grad_norm": 1.0954796075820923, "learning_rate": 9.045084971874738e-05, "loss": 9.4598, "step": 1080 }, { "epoch": 1.3814955640050697, "grad_norm": 0.8693401217460632, "learning_rate": 9.02615929042678e-05, "loss": 9.4697, "step": 1090 }, { "epoch": 1.394169835234474, "grad_norm": 0.6581054329872131, "learning_rate": 9.007068109339784e-05, "loss": 9.4678, "step": 1100 }, { "epoch": 1.4068441064638784, "grad_norm": 0.8068575263023376, "learning_rate": 8.987812213377424e-05, "loss": 9.478, "step": 1110 }, { "epoch": 1.4195183776932827, "grad_norm": 0.7853858470916748, "learning_rate": 8.968392394074164e-05, "loss": 9.4699, "step": 1120 }, { "epoch": 1.4321926489226868, "grad_norm": 0.7116039395332336, "learning_rate": 8.948809449702711e-05, "loss": 9.4858, "step": 1130 }, { "epoch": 1.4448669201520912, "grad_norm": 0.8717799782752991, "learning_rate": 8.929064185241213e-05, "loss": 9.4768, "step": 1140 }, { "epoch": 1.4575411913814955, "grad_norm": 0.7932648062705994, "learning_rate": 8.90915741234015e-05, "loss": 9.4621, "step": 1150 }, { "epoch": 1.4702154626108999, "grad_norm": 0.7295988202095032, "learning_rate": 8.889089949288986e-05, "loss": 9.4573, "step": 1160 }, { "epoch": 1.4828897338403042, "grad_norm": 0.7738338112831116, "learning_rate": 8.868862620982534e-05, "loss": 9.4682, "step": 1170 }, { "epoch": 1.4955640050697085, "grad_norm": 0.6926583647727966, "learning_rate": 8.848476258887031e-05, "loss": 9.4759, "step": 1180 }, { "epoch": 1.508238276299113, "grad_norm": 0.8051027059555054, "learning_rate": 8.827931701005974e-05, "loss": 9.4608, "step": 1190 }, { "epoch": 1.5209125475285172, "grad_norm": 0.8707593083381653, "learning_rate": 8.807229791845673e-05, "loss": 9.4667, "step": 1200 }, { "epoch": 1.5209125475285172, "eval_loss": 9.460935592651367, "eval_runtime": 2.465, "eval_samples_per_second": 202.839, "eval_steps_per_second": 202.839, "step": 1200 }, { "epoch": 1.5335868187579216, "grad_norm": 0.8105939030647278, "learning_rate": 8.786371382380528e-05, "loss": 9.4458, "step": 1210 }, { "epoch": 1.5462610899873257, "grad_norm": 0.8065782189369202, "learning_rate": 8.765357330018056e-05, "loss": 9.4795, "step": 1220 }, { "epoch": 1.55893536121673, "grad_norm": 0.8195183873176575, "learning_rate": 8.744188498563641e-05, "loss": 9.4579, "step": 1230 }, { "epoch": 1.5716096324461344, "grad_norm": 0.9439378380775452, "learning_rate": 8.722865758185035e-05, "loss": 9.45, "step": 1240 }, { "epoch": 1.5842839036755385, "grad_norm": 0.6892300248146057, "learning_rate": 8.701389985376578e-05, "loss": 9.458, "step": 1250 }, { "epoch": 1.5969581749049429, "grad_norm": 0.7570083737373352, "learning_rate": 8.679762062923175e-05, "loss": 9.4433, "step": 1260 }, { "epoch": 1.6096324461343472, "grad_norm": 0.7365748286247253, "learning_rate": 8.657982879864007e-05, "loss": 9.4578, "step": 1270 }, { "epoch": 1.6223067173637515, "grad_norm": 0.8592304587364197, "learning_rate": 8.636053331455987e-05, "loss": 9.4566, "step": 1280 }, { "epoch": 1.6349809885931559, "grad_norm": 0.7177086472511292, "learning_rate": 8.613974319136958e-05, "loss": 9.432, "step": 1290 }, { "epoch": 1.6476552598225602, "grad_norm": 0.7533496022224426, "learning_rate": 8.591746750488639e-05, "loss": 9.4502, "step": 1300 }, { "epoch": 1.6603295310519646, "grad_norm": 0.7828896045684814, "learning_rate": 8.569371539199316e-05, "loss": 9.4429, "step": 1310 }, { "epoch": 1.673003802281369, "grad_norm": 0.7749277949333191, "learning_rate": 8.54684960502629e-05, "loss": 9.4343, "step": 1320 }, { "epoch": 1.6856780735107733, "grad_norm": 0.7745513319969177, "learning_rate": 8.524181873758059e-05, "loss": 9.4443, "step": 1330 }, { "epoch": 1.6983523447401776, "grad_norm": 0.8417680859565735, "learning_rate": 8.501369277176276e-05, "loss": 9.4383, "step": 1340 }, { "epoch": 1.7110266159695817, "grad_norm": 0.7989394664764404, "learning_rate": 8.478412753017433e-05, "loss": 9.4436, "step": 1350 }, { "epoch": 1.7110266159695817, "eval_loss": 9.445840835571289, "eval_runtime": 2.4521, "eval_samples_per_second": 203.907, "eval_steps_per_second": 203.907, "step": 1350 }, { "epoch": 1.723700887198986, "grad_norm": 0.7097709774971008, "learning_rate": 8.455313244934324e-05, "loss": 9.4431, "step": 1360 }, { "epoch": 1.7363751584283904, "grad_norm": 0.8070104718208313, "learning_rate": 8.432071702457252e-05, "loss": 9.4454, "step": 1370 }, { "epoch": 1.7490494296577945, "grad_norm": 0.7669633626937866, "learning_rate": 8.408689080954998e-05, "loss": 9.4415, "step": 1380 }, { "epoch": 1.7617237008871989, "grad_norm": 0.8000221848487854, "learning_rate": 8.385166341595548e-05, "loss": 9.4428, "step": 1390 }, { "epoch": 1.7743979721166032, "grad_norm": 0.7370336651802063, "learning_rate": 8.361504451306585e-05, "loss": 9.4579, "step": 1400 }, { "epoch": 1.7870722433460076, "grad_norm": 0.7398391962051392, "learning_rate": 8.33770438273574e-05, "loss": 9.4406, "step": 1410 }, { "epoch": 1.799746514575412, "grad_norm": 0.7623130679130554, "learning_rate": 8.313767114210615e-05, "loss": 9.432, "step": 1420 }, { "epoch": 1.8124207858048162, "grad_norm": 0.6531655788421631, "learning_rate": 8.289693629698564e-05, "loss": 9.4524, "step": 1430 }, { "epoch": 1.8250950570342206, "grad_norm": 0.7224334478378296, "learning_rate": 8.265484918766243e-05, "loss": 9.4409, "step": 1440 }, { "epoch": 1.837769328263625, "grad_norm": 0.752308189868927, "learning_rate": 8.241141976538943e-05, "loss": 9.4547, "step": 1450 }, { "epoch": 1.8504435994930293, "grad_norm": 0.738632321357727, "learning_rate": 8.216665803659671e-05, "loss": 9.4455, "step": 1460 }, { "epoch": 1.8631178707224336, "grad_norm": 0.826473593711853, "learning_rate": 8.192057406248028e-05, "loss": 9.4281, "step": 1470 }, { "epoch": 1.8757921419518377, "grad_norm": 0.8795267343521118, "learning_rate": 8.167317795858851e-05, "loss": 9.4425, "step": 1480 }, { "epoch": 1.888466413181242, "grad_norm": 0.8274568915367126, "learning_rate": 8.142447989440618e-05, "loss": 9.4429, "step": 1490 }, { "epoch": 1.9011406844106464, "grad_norm": 0.7531217336654663, "learning_rate": 8.117449009293668e-05, "loss": 9.4536, "step": 1500 }, { "epoch": 1.9011406844106464, "eval_loss": 9.442911148071289, "eval_runtime": 2.4762, "eval_samples_per_second": 201.924, "eval_steps_per_second": 201.924, "step": 1500 }, { "epoch": 1.9138149556400506, "grad_norm": 0.8622466325759888, "learning_rate": 8.092321883028158e-05, "loss": 9.4458, "step": 1510 }, { "epoch": 1.926489226869455, "grad_norm": 0.7276505827903748, "learning_rate": 8.067067643521834e-05, "loss": 9.4444, "step": 1520 }, { "epoch": 1.9391634980988592, "grad_norm": 0.7629052996635437, "learning_rate": 8.041687328877567e-05, "loss": 9.4455, "step": 1530 }, { "epoch": 1.9518377693282636, "grad_norm": 0.7246270775794983, "learning_rate": 8.016181982380682e-05, "loss": 9.4464, "step": 1540 }, { "epoch": 1.964512040557668, "grad_norm": 0.9381201863288879, "learning_rate": 7.990552652456081e-05, "loss": 9.4281, "step": 1550 }, { "epoch": 1.9771863117870723, "grad_norm": 0.8291416168212891, "learning_rate": 7.964800392625129e-05, "loss": 9.4377, "step": 1560 }, { "epoch": 1.9898605830164766, "grad_norm": 0.7957122921943665, "learning_rate": 7.938926261462366e-05, "loss": 9.4532, "step": 1570 }, { "epoch": 2.002534854245881, "grad_norm": 0.6859972476959229, "learning_rate": 7.91293132255198e-05, "loss": 9.4341, "step": 1580 }, { "epoch": 2.0152091254752853, "grad_norm": 0.682180643081665, "learning_rate": 7.886816644444098e-05, "loss": 9.4314, "step": 1590 }, { "epoch": 2.0278833967046896, "grad_norm": 0.8989324569702148, "learning_rate": 7.860583300610849e-05, "loss": 9.443, "step": 1600 }, { "epoch": 2.040557667934094, "grad_norm": 0.730932891368866, "learning_rate": 7.83423236940225e-05, "loss": 9.4309, "step": 1610 }, { "epoch": 2.053231939163498, "grad_norm": 0.8413243889808655, "learning_rate": 7.807764934001874e-05, "loss": 9.4434, "step": 1620 }, { "epoch": 2.0659062103929022, "grad_norm": 0.8659064769744873, "learning_rate": 7.781182082382325e-05, "loss": 9.4305, "step": 1630 }, { "epoch": 2.0785804816223066, "grad_norm": 0.7855962514877319, "learning_rate": 7.754484907260513e-05, "loss": 9.4274, "step": 1640 }, { "epoch": 2.091254752851711, "grad_norm": 0.8000444173812866, "learning_rate": 7.727674506052743e-05, "loss": 9.429, "step": 1650 }, { "epoch": 2.091254752851711, "eval_loss": 9.441214561462402, "eval_runtime": 2.4608, "eval_samples_per_second": 203.186, "eval_steps_per_second": 203.186, "step": 1650 }, { "epoch": 2.1039290240811153, "grad_norm": 0.7676946520805359, "learning_rate": 7.700751980829602e-05, "loss": 9.4297, "step": 1660 }, { "epoch": 2.1166032953105196, "grad_norm": 18.00615882873535, "learning_rate": 7.673718438270648e-05, "loss": 9.44, "step": 1670 }, { "epoch": 2.129277566539924, "grad_norm": 0.6926226615905762, "learning_rate": 7.646574989618938e-05, "loss": 9.4336, "step": 1680 }, { "epoch": 2.1419518377693283, "grad_norm": 0.6653842329978943, "learning_rate": 7.619322750635327e-05, "loss": 9.4637, "step": 1690 }, { "epoch": 2.1546261089987326, "grad_norm": 0.7826524376869202, "learning_rate": 7.591962841552627e-05, "loss": 9.4438, "step": 1700 }, { "epoch": 2.167300380228137, "grad_norm": 0.743466317653656, "learning_rate": 7.564496387029532e-05, "loss": 9.4449, "step": 1710 }, { "epoch": 2.1799746514575413, "grad_norm": 0.8151763081550598, "learning_rate": 7.536924516104411e-05, "loss": 9.4519, "step": 1720 }, { "epoch": 2.1926489226869457, "grad_norm": 0.7397652864456177, "learning_rate": 7.509248362148889e-05, "loss": 9.4285, "step": 1730 }, { "epoch": 2.20532319391635, "grad_norm": 0.811447024345398, "learning_rate": 7.481469062821252e-05, "loss": 9.442, "step": 1740 }, { "epoch": 2.2179974651457544, "grad_norm": 0.8720759153366089, "learning_rate": 7.45358776001969e-05, "loss": 9.4355, "step": 1750 }, { "epoch": 2.2306717363751583, "grad_norm": 0.7164329290390015, "learning_rate": 7.425605599835361e-05, "loss": 9.4289, "step": 1760 }, { "epoch": 2.2433460076045626, "grad_norm": 0.7464995384216309, "learning_rate": 7.39752373250527e-05, "loss": 9.4392, "step": 1770 }, { "epoch": 2.256020278833967, "grad_norm": 0.6373656392097473, "learning_rate": 7.369343312364993e-05, "loss": 9.4287, "step": 1780 }, { "epoch": 2.2686945500633713, "grad_norm": 0.7979937791824341, "learning_rate": 7.34106549780123e-05, "loss": 9.4373, "step": 1790 }, { "epoch": 2.2813688212927756, "grad_norm": 0.7435820698738098, "learning_rate": 7.312691451204178e-05, "loss": 9.4268, "step": 1800 }, { "epoch": 2.2813688212927756, "eval_loss": 9.440364837646484, "eval_runtime": 2.4843, "eval_samples_per_second": 201.265, "eval_steps_per_second": 201.265, "step": 1800 }, { "epoch": 2.29404309252218, "grad_norm": 0.8188759684562683, "learning_rate": 7.284222338919758e-05, "loss": 9.4326, "step": 1810 }, { "epoch": 2.3067173637515843, "grad_norm": 0.8077417016029358, "learning_rate": 7.255659331201673e-05, "loss": 9.426, "step": 1820 }, { "epoch": 2.3193916349809887, "grad_norm": 0.9484453201293945, "learning_rate": 7.227003602163295e-05, "loss": 9.434, "step": 1830 }, { "epoch": 2.332065906210393, "grad_norm": 0.8113880753517151, "learning_rate": 7.198256329729412e-05, "loss": 9.4314, "step": 1840 }, { "epoch": 2.3447401774397973, "grad_norm": 0.7225760221481323, "learning_rate": 7.169418695587791e-05, "loss": 9.441, "step": 1850 }, { "epoch": 2.3574144486692017, "grad_norm": 0.8036431670188904, "learning_rate": 7.14049188514063e-05, "loss": 9.4608, "step": 1860 }, { "epoch": 2.3700887198986056, "grad_norm": 0.8197149038314819, "learning_rate": 7.1114770874558e-05, "loss": 9.4485, "step": 1870 }, { "epoch": 2.3827629911280104, "grad_norm": 0.7283074259757996, "learning_rate": 7.082375495217995e-05, "loss": 9.435, "step": 1880 }, { "epoch": 2.3954372623574143, "grad_norm": 0.7838678956031799, "learning_rate": 7.05318830467969e-05, "loss": 9.4389, "step": 1890 }, { "epoch": 2.4081115335868186, "grad_norm": 0.5950124859809875, "learning_rate": 7.023916715611969e-05, "loss": 9.4208, "step": 1900 }, { "epoch": 2.420785804816223, "grad_norm": 0.711269736289978, "learning_rate": 6.99456193125521e-05, "loss": 9.4412, "step": 1910 }, { "epoch": 2.4334600760456273, "grad_norm": 0.7352544665336609, "learning_rate": 6.965125158269619e-05, "loss": 9.4239, "step": 1920 }, { "epoch": 2.4461343472750317, "grad_norm": 0.7017738223075867, "learning_rate": 6.935607606685642e-05, "loss": 9.428, "step": 1930 }, { "epoch": 2.458808618504436, "grad_norm": 0.7372350096702576, "learning_rate": 6.906010489854209e-05, "loss": 9.4492, "step": 1940 }, { "epoch": 2.4714828897338403, "grad_norm": 0.7527620196342468, "learning_rate": 6.876335024396872e-05, "loss": 9.4337, "step": 1950 }, { "epoch": 2.4714828897338403, "eval_loss": 9.439714431762695, "eval_runtime": 2.4651, "eval_samples_per_second": 202.83, "eval_steps_per_second": 202.83, "step": 1950 }, { "epoch": 2.4841571609632447, "grad_norm": 0.6684580445289612, "learning_rate": 6.846582430155783e-05, "loss": 9.4216, "step": 1960 }, { "epoch": 2.496831432192649, "grad_norm": 0.7620694041252136, "learning_rate": 6.816753930143558e-05, "loss": 9.4288, "step": 1970 }, { "epoch": 2.5095057034220534, "grad_norm": 0.7074058055877686, "learning_rate": 6.786850750493006e-05, "loss": 9.437, "step": 1980 }, { "epoch": 2.5221799746514577, "grad_norm": 0.7757852077484131, "learning_rate": 6.756874120406714e-05, "loss": 9.4384, "step": 1990 }, { "epoch": 2.5348542458808616, "grad_norm": 0.8488853573799133, "learning_rate": 6.726825272106538e-05, "loss": 9.435, "step": 2000 }, { "epoch": 2.5475285171102664, "grad_norm": 0.7243012189865112, "learning_rate": 6.696705440782938e-05, "loss": 9.4408, "step": 2010 }, { "epoch": 2.5602027883396703, "grad_norm": 0.8224584460258484, "learning_rate": 6.666515864544209e-05, "loss": 9.4505, "step": 2020 }, { "epoch": 2.5728770595690746, "grad_norm": 0.8101847171783447, "learning_rate": 6.636257784365584e-05, "loss": 9.4352, "step": 2030 }, { "epoch": 2.585551330798479, "grad_norm": 0.7522319555282593, "learning_rate": 6.605932444038229e-05, "loss": 9.4427, "step": 2040 }, { "epoch": 2.5982256020278833, "grad_norm": 0.7391648292541504, "learning_rate": 6.575541090118105e-05, "loss": 9.4396, "step": 2050 }, { "epoch": 2.6108998732572877, "grad_norm": 0.6625525951385498, "learning_rate": 6.545084971874738e-05, "loss": 9.4374, "step": 2060 }, { "epoch": 2.623574144486692, "grad_norm": 0.9081099033355713, "learning_rate": 6.514565341239861e-05, "loss": 9.4484, "step": 2070 }, { "epoch": 2.6362484157160964, "grad_norm": 0.7026821374893188, "learning_rate": 6.483983452755953e-05, "loss": 9.4287, "step": 2080 }, { "epoch": 2.6489226869455007, "grad_norm": 0.8642821907997131, "learning_rate": 6.453340563524669e-05, "loss": 9.4255, "step": 2090 }, { "epoch": 2.661596958174905, "grad_norm": 0.7191464304924011, "learning_rate": 6.422637933155162e-05, "loss": 9.4548, "step": 2100 }, { "epoch": 2.661596958174905, "eval_loss": 9.439363479614258, "eval_runtime": 2.4725, "eval_samples_per_second": 202.228, "eval_steps_per_second": 202.228, "step": 2100 }, { "epoch": 2.6742712294043094, "grad_norm": 0.8825898766517639, "learning_rate": 6.391876823712317e-05, "loss": 9.4327, "step": 2110 }, { "epoch": 2.6869455006337137, "grad_norm": 0.7532851099967957, "learning_rate": 6.361058499664856e-05, "loss": 9.4264, "step": 2120 }, { "epoch": 2.6996197718631176, "grad_norm": 0.7267948985099792, "learning_rate": 6.330184227833376e-05, "loss": 9.4313, "step": 2130 }, { "epoch": 2.7122940430925224, "grad_norm": 0.9392057061195374, "learning_rate": 6.299255277338265e-05, "loss": 9.4202, "step": 2140 }, { "epoch": 2.7249683143219263, "grad_norm": 0.7596332430839539, "learning_rate": 6.268272919547537e-05, "loss": 9.4382, "step": 2150 }, { "epoch": 2.7376425855513307, "grad_norm": 0.6376082897186279, "learning_rate": 6.237238428024572e-05, "loss": 9.4259, "step": 2160 }, { "epoch": 2.750316856780735, "grad_norm": 0.7612680196762085, "learning_rate": 6.206153078475763e-05, "loss": 9.4397, "step": 2170 }, { "epoch": 2.7629911280101394, "grad_norm": 0.6870391964912415, "learning_rate": 6.175018148698077e-05, "loss": 9.4531, "step": 2180 }, { "epoch": 2.7756653992395437, "grad_norm": 0.7528817653656006, "learning_rate": 6.143834918526527e-05, "loss": 9.4412, "step": 2190 }, { "epoch": 2.788339670468948, "grad_norm": 0.8387569785118103, "learning_rate": 6.112604669781572e-05, "loss": 9.434, "step": 2200 }, { "epoch": 2.8010139416983524, "grad_norm": 0.9195103049278259, "learning_rate": 6.081328686216418e-05, "loss": 9.4159, "step": 2210 }, { "epoch": 2.8136882129277567, "grad_norm": 0.7632887959480286, "learning_rate": 6.0500082534642464e-05, "loss": 9.4423, "step": 2220 }, { "epoch": 2.826362484157161, "grad_norm": 0.9032052159309387, "learning_rate": 6.0186446589853784e-05, "loss": 9.4262, "step": 2230 }, { "epoch": 2.8390367553865654, "grad_norm": 0.8594799041748047, "learning_rate": 5.987239192014336e-05, "loss": 9.4479, "step": 2240 }, { "epoch": 2.8517110266159698, "grad_norm": 0.826501727104187, "learning_rate": 5.955793143506863e-05, "loss": 9.4406, "step": 2250 }, { "epoch": 2.8517110266159698, "eval_loss": 9.438972473144531, "eval_runtime": 2.4547, "eval_samples_per_second": 203.688, "eval_steps_per_second": 203.688, "step": 2250 }, { "epoch": 2.8643852978453737, "grad_norm": 0.7650038599967957, "learning_rate": 5.924307806086844e-05, "loss": 9.4417, "step": 2260 }, { "epoch": 2.8770595690747784, "grad_norm": 0.7463059425354004, "learning_rate": 5.8927844739931834e-05, "loss": 9.4331, "step": 2270 }, { "epoch": 2.8897338403041823, "grad_norm": 0.6988371014595032, "learning_rate": 5.861224443026595e-05, "loss": 9.4348, "step": 2280 }, { "epoch": 2.9024081115335867, "grad_norm": 0.845314621925354, "learning_rate": 5.82962901049634e-05, "loss": 9.436, "step": 2290 }, { "epoch": 2.915082382762991, "grad_norm": 0.8427614569664001, "learning_rate": 5.7979994751668964e-05, "loss": 9.4348, "step": 2300 }, { "epoch": 2.9277566539923954, "grad_norm": 0.7990124821662903, "learning_rate": 5.766337137204579e-05, "loss": 9.4428, "step": 2310 }, { "epoch": 2.9404309252217997, "grad_norm": 0.8618285655975342, "learning_rate": 5.7346432981240904e-05, "loss": 9.4394, "step": 2320 }, { "epoch": 2.953105196451204, "grad_norm": 0.7441295981407166, "learning_rate": 5.7029192607350146e-05, "loss": 9.418, "step": 2330 }, { "epoch": 2.9657794676806084, "grad_norm": 0.7739105820655823, "learning_rate": 5.6711663290882776e-05, "loss": 9.4206, "step": 2340 }, { "epoch": 2.9784537389100127, "grad_norm": 0.7851083874702454, "learning_rate": 5.6393858084225305e-05, "loss": 9.4344, "step": 2350 }, { "epoch": 2.991128010139417, "grad_norm": 0.7880497574806213, "learning_rate": 5.6075790051105023e-05, "loss": 9.4423, "step": 2360 }, { "epoch": 3.0038022813688214, "grad_norm": 0.7760215997695923, "learning_rate": 5.575747226605298e-05, "loss": 9.4213, "step": 2370 }, { "epoch": 3.016476552598226, "grad_norm": 0.8748324513435364, "learning_rate": 5.5438917813866554e-05, "loss": 9.415, "step": 2380 }, { "epoch": 3.02915082382763, "grad_norm": 0.7627739906311035, "learning_rate": 5.512013978907157e-05, "loss": 9.4218, "step": 2390 }, { "epoch": 3.041825095057034, "grad_norm": 0.9664641618728638, "learning_rate": 5.480115129538409e-05, "loss": 9.4492, "step": 2400 }, { "epoch": 3.041825095057034, "eval_loss": 9.439058303833008, "eval_runtime": 2.4754, "eval_samples_per_second": 201.987, "eval_steps_per_second": 201.987, "step": 2400 }, { "epoch": 3.0544993662864384, "grad_norm": 0.8026870489120483, "learning_rate": 5.448196544517168e-05, "loss": 9.4205, "step": 2410 }, { "epoch": 3.0671736375158427, "grad_norm": 0.9264091849327087, "learning_rate": 5.416259535891447e-05, "loss": 9.4248, "step": 2420 }, { "epoch": 3.079847908745247, "grad_norm": 0.963367760181427, "learning_rate": 5.384305416466584e-05, "loss": 9.4257, "step": 2430 }, { "epoch": 3.0925221799746514, "grad_norm": 0.72805255651474, "learning_rate": 5.35233549975127e-05, "loss": 9.4387, "step": 2440 }, { "epoch": 3.1051964512040557, "grad_norm": 0.724453866481781, "learning_rate": 5.320351099903565e-05, "loss": 9.4273, "step": 2450 }, { "epoch": 3.11787072243346, "grad_norm": 0.840308427810669, "learning_rate": 5.288353531676873e-05, "loss": 9.4235, "step": 2460 }, { "epoch": 3.1305449936628644, "grad_norm": 0.7261815071105957, "learning_rate": 5.256344110365896e-05, "loss": 9.4174, "step": 2470 }, { "epoch": 3.1432192648922688, "grad_norm": 0.694277286529541, "learning_rate": 5.2243241517525754e-05, "loss": 9.434, "step": 2480 }, { "epoch": 3.155893536121673, "grad_norm": 0.9190409183502197, "learning_rate": 5.192294972051992e-05, "loss": 9.4217, "step": 2490 }, { "epoch": 3.1685678073510775, "grad_norm": 0.738180935382843, "learning_rate": 5.1602578878582776e-05, "loss": 9.4488, "step": 2500 }, { "epoch": 3.181242078580482, "grad_norm": 0.7736183404922485, "learning_rate": 5.128214216090478e-05, "loss": 9.4333, "step": 2510 }, { "epoch": 3.1939163498098857, "grad_norm": 0.7926467657089233, "learning_rate": 5.0961652739384356e-05, "loss": 9.4545, "step": 2520 }, { "epoch": 3.20659062103929, "grad_norm": 0.8107408881187439, "learning_rate": 5.064112378808637e-05, "loss": 9.4382, "step": 2530 }, { "epoch": 3.2192648922686944, "grad_norm": 0.8251469731330872, "learning_rate": 5.0320568482700556e-05, "loss": 9.4286, "step": 2540 }, { "epoch": 3.2319391634980987, "grad_norm": 0.9335879683494568, "learning_rate": 5e-05, "loss": 9.4376, "step": 2550 }, { "epoch": 3.2319391634980987, "eval_loss": 9.4386625289917, "eval_runtime": 2.4627, "eval_samples_per_second": 203.033, "eval_steps_per_second": 203.033, "step": 2550 }, { "epoch": 3.244613434727503, "grad_norm": 0.6870019435882568, "learning_rate": 4.967943151729945e-05, "loss": 9.436, "step": 2560 }, { "epoch": 3.2572877059569074, "grad_norm": 0.9441744089126587, "learning_rate": 4.935887621191364e-05, "loss": 9.4266, "step": 2570 }, { "epoch": 3.2699619771863118, "grad_norm": 0.7505266070365906, "learning_rate": 4.903834726061565e-05, "loss": 9.404, "step": 2580 }, { "epoch": 3.282636248415716, "grad_norm": 0.8613854646682739, "learning_rate": 4.871785783909523e-05, "loss": 9.4373, "step": 2590 }, { "epoch": 3.2953105196451205, "grad_norm": 0.8340327739715576, "learning_rate": 4.839742112141724e-05, "loss": 9.4249, "step": 2600 }, { "epoch": 3.307984790874525, "grad_norm": 0.8505858778953552, "learning_rate": 4.807705027948008e-05, "loss": 9.4303, "step": 2610 }, { "epoch": 3.320659062103929, "grad_norm": 0.7492672801017761, "learning_rate": 4.775675848247427e-05, "loss": 9.4405, "step": 2620 }, { "epoch": 3.3333333333333335, "grad_norm": 0.8370112180709839, "learning_rate": 4.743655889634105e-05, "loss": 9.4239, "step": 2630 }, { "epoch": 3.346007604562738, "grad_norm": 0.7401330471038818, "learning_rate": 4.711646468323129e-05, "loss": 9.4304, "step": 2640 }, { "epoch": 3.3586818757921417, "grad_norm": 0.8236503005027771, "learning_rate": 4.679648900096436e-05, "loss": 9.449, "step": 2650 }, { "epoch": 3.371356147021546, "grad_norm": 0.8208128213882446, "learning_rate": 4.64766450024873e-05, "loss": 9.4284, "step": 2660 }, { "epoch": 3.3840304182509504, "grad_norm": 0.6585504412651062, "learning_rate": 4.6156945835334184e-05, "loss": 9.4524, "step": 2670 }, { "epoch": 3.3967046894803548, "grad_norm": 0.8434840440750122, "learning_rate": 4.583740464108554e-05, "loss": 9.4498, "step": 2680 }, { "epoch": 3.409378960709759, "grad_norm": 0.6719241738319397, "learning_rate": 4.551803455482833e-05, "loss": 9.4304, "step": 2690 }, { "epoch": 3.4220532319391634, "grad_norm": 0.8027125000953674, "learning_rate": 4.5198848704615914e-05, "loss": 9.4315, "step": 2700 }, { "epoch": 3.4220532319391634, "eval_loss": 9.438660621643066, "eval_runtime": 2.4627, "eval_samples_per_second": 203.031, "eval_steps_per_second": 203.031, "step": 2700 }, { "epoch": 3.434727503168568, "grad_norm": 0.8233821988105774, "learning_rate": 4.487986021092844e-05, "loss": 9.424, "step": 2710 }, { "epoch": 3.447401774397972, "grad_norm": 0.7581255435943604, "learning_rate": 4.4561082186133464e-05, "loss": 9.4458, "step": 2720 }, { "epoch": 3.4600760456273765, "grad_norm": 0.6852894425392151, "learning_rate": 4.424252773394704e-05, "loss": 9.4192, "step": 2730 }, { "epoch": 3.472750316856781, "grad_norm": 0.704776406288147, "learning_rate": 4.392420994889498e-05, "loss": 9.4212, "step": 2740 }, { "epoch": 3.485424588086185, "grad_norm": 0.8197017908096313, "learning_rate": 4.3606141915774693e-05, "loss": 9.4381, "step": 2750 }, { "epoch": 3.4980988593155895, "grad_norm": 0.6381480693817139, "learning_rate": 4.328833670911724e-05, "loss": 9.4338, "step": 2760 }, { "epoch": 3.510773130544994, "grad_norm": 0.8601919412612915, "learning_rate": 4.297080739264987e-05, "loss": 9.4415, "step": 2770 }, { "epoch": 3.5234474017743977, "grad_norm": 0.7440756559371948, "learning_rate": 4.265356701875911e-05, "loss": 9.4418, "step": 2780 }, { "epoch": 3.5361216730038025, "grad_norm": 0.6482778787612915, "learning_rate": 4.23366286279542e-05, "loss": 9.4425, "step": 2790 }, { "epoch": 3.5487959442332064, "grad_norm": 0.7173515558242798, "learning_rate": 4.2020005248331054e-05, "loss": 9.4301, "step": 2800 }, { "epoch": 3.5614702154626108, "grad_norm": 0.7305528521537781, "learning_rate": 4.1703709895036625e-05, "loss": 9.4254, "step": 2810 }, { "epoch": 3.574144486692015, "grad_norm": 0.7351227402687073, "learning_rate": 4.138775556973406e-05, "loss": 9.4452, "step": 2820 }, { "epoch": 3.5868187579214195, "grad_norm": 0.7636406421661377, "learning_rate": 4.107215526006817e-05, "loss": 9.4468, "step": 2830 }, { "epoch": 3.599493029150824, "grad_norm": 0.8805913329124451, "learning_rate": 4.0756921939131565e-05, "loss": 9.4249, "step": 2840 }, { "epoch": 3.612167300380228, "grad_norm": 0.8383622169494629, "learning_rate": 4.04420685649314e-05, "loss": 9.4399, "step": 2850 }, { "epoch": 3.612167300380228, "eval_loss": 9.438502311706543, "eval_runtime": 2.4546, "eval_samples_per_second": 203.7, "eval_steps_per_second": 203.7, "step": 2850 }, { "epoch": 3.6248415716096325, "grad_norm": 0.8202491402626038, "learning_rate": 4.012760807985665e-05, "loss": 9.4113, "step": 2860 }, { "epoch": 3.637515842839037, "grad_norm": 0.7821328043937683, "learning_rate": 3.981355341014623e-05, "loss": 9.4378, "step": 2870 }, { "epoch": 3.650190114068441, "grad_norm": 0.7407823801040649, "learning_rate": 3.9499917465357534e-05, "loss": 9.4323, "step": 2880 }, { "epoch": 3.6628643852978455, "grad_norm": 0.8170826435089111, "learning_rate": 3.9186713137835826e-05, "loss": 9.4274, "step": 2890 }, { "epoch": 3.67553865652725, "grad_norm": 0.8330857753753662, "learning_rate": 3.887395330218429e-05, "loss": 9.4356, "step": 2900 }, { "epoch": 3.6882129277566538, "grad_norm": 0.8886080980300903, "learning_rate": 3.856165081473474e-05, "loss": 9.4366, "step": 2910 }, { "epoch": 3.7008871989860586, "grad_norm": 0.7460224032402039, "learning_rate": 3.8249818513019244e-05, "loss": 9.4365, "step": 2920 }, { "epoch": 3.7135614702154625, "grad_norm": 0.6430843472480774, "learning_rate": 3.793846921524237e-05, "loss": 9.438, "step": 2930 }, { "epoch": 3.726235741444867, "grad_norm": 0.7738505601882935, "learning_rate": 3.762761571975429e-05, "loss": 9.4286, "step": 2940 }, { "epoch": 3.738910012674271, "grad_norm": 0.6874530911445618, "learning_rate": 3.731727080452464e-05, "loss": 9.4318, "step": 2950 }, { "epoch": 3.7515842839036755, "grad_norm": 0.852977454662323, "learning_rate": 3.7007447226617366e-05, "loss": 9.4341, "step": 2960 }, { "epoch": 3.76425855513308, "grad_norm": 0.8847272396087646, "learning_rate": 3.6698157721666246e-05, "loss": 9.4332, "step": 2970 }, { "epoch": 3.776932826362484, "grad_norm": 0.6835851073265076, "learning_rate": 3.638941500335145e-05, "loss": 9.4292, "step": 2980 }, { "epoch": 3.7896070975918885, "grad_norm": 0.7002390623092651, "learning_rate": 3.608123176287685e-05, "loss": 9.4427, "step": 2990 }, { "epoch": 3.802281368821293, "grad_norm": 0.8122102618217468, "learning_rate": 3.5773620668448384e-05, "loss": 9.4564, "step": 3000 }, { "epoch": 3.802281368821293, "eval_loss": 9.438515663146973, "eval_runtime": 2.4578, "eval_samples_per_second": 203.432, "eval_steps_per_second": 203.432, "step": 3000 }, { "epoch": 3.814955640050697, "grad_norm": 0.7593154311180115, "learning_rate": 3.5466594364753326e-05, "loss": 9.4533, "step": 3010 }, { "epoch": 3.8276299112801015, "grad_norm": 0.7191845178604126, "learning_rate": 3.5160165472440473e-05, "loss": 9.4465, "step": 3020 }, { "epoch": 3.840304182509506, "grad_norm": 0.6512830853462219, "learning_rate": 3.48543465876014e-05, "loss": 9.4483, "step": 3030 }, { "epoch": 3.85297845373891, "grad_norm": 0.8649701476097107, "learning_rate": 3.4549150281252636e-05, "loss": 9.4296, "step": 3040 }, { "epoch": 3.8656527249683146, "grad_norm": 0.6761658787727356, "learning_rate": 3.424458909881897e-05, "loss": 9.4397, "step": 3050 }, { "epoch": 3.8783269961977185, "grad_norm": 0.7854108810424805, "learning_rate": 3.3940675559617724e-05, "loss": 9.4281, "step": 3060 }, { "epoch": 3.891001267427123, "grad_norm": 0.673559308052063, "learning_rate": 3.363742215634415e-05, "loss": 9.4328, "step": 3070 }, { "epoch": 3.903675538656527, "grad_norm": 0.6618309617042542, "learning_rate": 3.333484135455792e-05, "loss": 9.43, "step": 3080 }, { "epoch": 3.9163498098859315, "grad_norm": 0.8688169121742249, "learning_rate": 3.303294559217063e-05, "loss": 9.4536, "step": 3090 }, { "epoch": 3.929024081115336, "grad_norm": 0.7605611085891724, "learning_rate": 3.273174727893463e-05, "loss": 9.4277, "step": 3100 }, { "epoch": 3.94169835234474, "grad_norm": 0.6806541681289673, "learning_rate": 3.243125879593286e-05, "loss": 9.4447, "step": 3110 }, { "epoch": 3.9543726235741445, "grad_norm": 0.6423962116241455, "learning_rate": 3.213149249506997e-05, "loss": 9.4405, "step": 3120 }, { "epoch": 3.967046894803549, "grad_norm": 0.7948635816574097, "learning_rate": 3.183246069856443e-05, "loss": 9.4281, "step": 3130 }, { "epoch": 3.9797211660329532, "grad_norm": 0.931813657283783, "learning_rate": 3.153417569844219e-05, "loss": 9.4348, "step": 3140 }, { "epoch": 3.9923954372623576, "grad_norm": 0.7128949165344238, "learning_rate": 3.12366497560313e-05, "loss": 9.4384, "step": 3150 }, { "epoch": 3.9923954372623576, "eval_loss": 9.438448905944824, "eval_runtime": 2.4563, "eval_samples_per_second": 203.561, "eval_steps_per_second": 203.561, "step": 3150 }, { "epoch": 4.005069708491762, "grad_norm": 0.7056128978729248, "learning_rate": 3.0939895101457916e-05, "loss": 9.418, "step": 3160 }, { "epoch": 4.017743979721166, "grad_norm": 0.8243343830108643, "learning_rate": 3.06439239331436e-05, "loss": 9.4273, "step": 3170 }, { "epoch": 4.030418250950571, "grad_norm": 0.8485990762710571, "learning_rate": 3.0348748417303823e-05, "loss": 9.4146, "step": 3180 }, { "epoch": 4.0430925221799745, "grad_norm": 0.855148196220398, "learning_rate": 3.005438068744792e-05, "loss": 9.4398, "step": 3190 }, { "epoch": 4.055766793409379, "grad_norm": 0.8174501657485962, "learning_rate": 2.976083284388031e-05, "loss": 9.4305, "step": 3200 }, { "epoch": 4.068441064638783, "grad_norm": 0.7104635834693909, "learning_rate": 2.9468116953203107e-05, "loss": 9.4331, "step": 3210 }, { "epoch": 4.081115335868188, "grad_norm": 0.739596962928772, "learning_rate": 2.917624504782006e-05, "loss": 9.4149, "step": 3220 }, { "epoch": 4.093789607097592, "grad_norm": 0.8063517212867737, "learning_rate": 2.888522912544202e-05, "loss": 9.4309, "step": 3230 }, { "epoch": 4.106463878326996, "grad_norm": 0.7823915481567383, "learning_rate": 2.8595081148593738e-05, "loss": 9.4206, "step": 3240 }, { "epoch": 4.119138149556401, "grad_norm": 0.7477112412452698, "learning_rate": 2.8305813044122097e-05, "loss": 9.4477, "step": 3250 }, { "epoch": 4.1318124207858045, "grad_norm": 1.0264842510223389, "learning_rate": 2.80174367027059e-05, "loss": 9.4018, "step": 3260 }, { "epoch": 4.144486692015209, "grad_norm": 0.8169686794281006, "learning_rate": 2.772996397836704e-05, "loss": 9.4326, "step": 3270 }, { "epoch": 4.157160963244613, "grad_norm": 0.776114821434021, "learning_rate": 2.7443406687983265e-05, "loss": 9.4308, "step": 3280 }, { "epoch": 4.169835234474018, "grad_norm": 0.7463147640228271, "learning_rate": 2.7157776610802415e-05, "loss": 9.4305, "step": 3290 }, { "epoch": 4.182509505703422, "grad_norm": 0.7685773968696594, "learning_rate": 2.687308548795825e-05, "loss": 9.4509, "step": 3300 }, { "epoch": 4.182509505703422, "eval_loss": 9.438457489013672, "eval_runtime": 2.4938, "eval_samples_per_second": 200.497, "eval_steps_per_second": 200.497, "step": 3300 }, { "epoch": 4.195183776932827, "grad_norm": 0.898151159286499, "learning_rate": 2.658934502198772e-05, "loss": 9.442, "step": 3310 }, { "epoch": 4.2078580481622305, "grad_norm": 0.8025258779525757, "learning_rate": 2.630656687635007e-05, "loss": 9.4293, "step": 3320 }, { "epoch": 4.220532319391635, "grad_norm": 0.7758668661117554, "learning_rate": 2.6024762674947313e-05, "loss": 9.4298, "step": 3330 }, { "epoch": 4.233206590621039, "grad_norm": 0.8270909786224365, "learning_rate": 2.574394400164639e-05, "loss": 9.4215, "step": 3340 }, { "epoch": 4.245880861850444, "grad_norm": 0.7470284700393677, "learning_rate": 2.5464122399803125e-05, "loss": 9.4312, "step": 3350 }, { "epoch": 4.258555133079848, "grad_norm": 0.6872633099555969, "learning_rate": 2.5185309371787513e-05, "loss": 9.4422, "step": 3360 }, { "epoch": 4.271229404309253, "grad_norm": 0.8576005697250366, "learning_rate": 2.4907516378511135e-05, "loss": 9.4475, "step": 3370 }, { "epoch": 4.283903675538657, "grad_norm": 0.7631237506866455, "learning_rate": 2.46307548389559e-05, "loss": 9.4141, "step": 3380 }, { "epoch": 4.2965779467680605, "grad_norm": 0.7020424604415894, "learning_rate": 2.43550361297047e-05, "loss": 9.4375, "step": 3390 }, { "epoch": 4.309252217997465, "grad_norm": 0.8000229597091675, "learning_rate": 2.4080371584473748e-05, "loss": 9.4402, "step": 3400 }, { "epoch": 4.321926489226869, "grad_norm": 0.8294202089309692, "learning_rate": 2.3806772493646723e-05, "loss": 9.4495, "step": 3410 }, { "epoch": 4.334600760456274, "grad_norm": 0.7895727753639221, "learning_rate": 2.353425010381063e-05, "loss": 9.4349, "step": 3420 }, { "epoch": 4.347275031685678, "grad_norm": 0.6364753842353821, "learning_rate": 2.3262815617293517e-05, "loss": 9.4388, "step": 3430 }, { "epoch": 4.359949302915083, "grad_norm": 0.8058129549026489, "learning_rate": 2.2992480191704002e-05, "loss": 9.4428, "step": 3440 }, { "epoch": 4.3726235741444865, "grad_norm": 0.7713651657104492, "learning_rate": 2.272325493947257e-05, "loss": 9.4267, "step": 3450 }, { "epoch": 4.3726235741444865, "eval_loss": 9.438471794128418, "eval_runtime": 2.4549, "eval_samples_per_second": 203.676, "eval_steps_per_second": 203.676, "step": 3450 }, { "epoch": 4.385297845373891, "grad_norm": 0.7729995250701904, "learning_rate": 2.245515092739488e-05, "loss": 9.4352, "step": 3460 }, { "epoch": 4.397972116603295, "grad_norm": 0.7999477386474609, "learning_rate": 2.2188179176176766e-05, "loss": 9.4277, "step": 3470 }, { "epoch": 4.4106463878327, "grad_norm": 0.9356645941734314, "learning_rate": 2.192235065998126e-05, "loss": 9.4355, "step": 3480 }, { "epoch": 4.423320659062104, "grad_norm": 0.7459669709205627, "learning_rate": 2.165767630597752e-05, "loss": 9.4374, "step": 3490 }, { "epoch": 4.435994930291509, "grad_norm": 0.6684827208518982, "learning_rate": 2.139416699389153e-05, "loss": 9.4295, "step": 3500 }, { "epoch": 4.448669201520913, "grad_norm": 0.7681196331977844, "learning_rate": 2.1131833555559037e-05, "loss": 9.4395, "step": 3510 }, { "epoch": 4.4613434727503165, "grad_norm": 0.7017407417297363, "learning_rate": 2.0870686774480196e-05, "loss": 9.445, "step": 3520 }, { "epoch": 4.474017743979721, "grad_norm": 0.7644656896591187, "learning_rate": 2.061073738537635e-05, "loss": 9.4428, "step": 3530 }, { "epoch": 4.486692015209125, "grad_norm": 0.7086924910545349, "learning_rate": 2.0351996073748713e-05, "loss": 9.4362, "step": 3540 }, { "epoch": 4.49936628643853, "grad_norm": 0.8612850308418274, "learning_rate": 2.0094473475439202e-05, "loss": 9.4334, "step": 3550 }, { "epoch": 4.512040557667934, "grad_norm": 0.8251783847808838, "learning_rate": 1.9838180176193178e-05, "loss": 9.4317, "step": 3560 }, { "epoch": 4.524714828897339, "grad_norm": 0.6674248576164246, "learning_rate": 1.9583126711224343e-05, "loss": 9.4475, "step": 3570 }, { "epoch": 4.537389100126743, "grad_norm": 0.8897398710250854, "learning_rate": 1.9329323564781682e-05, "loss": 9.4198, "step": 3580 }, { "epoch": 4.550063371356147, "grad_norm": 0.9129829406738281, "learning_rate": 1.9076781169718428e-05, "loss": 9.4156, "step": 3590 }, { "epoch": 4.562737642585551, "grad_norm": 0.7653076648712158, "learning_rate": 1.8825509907063327e-05, "loss": 9.4323, "step": 3600 }, { "epoch": 4.562737642585551, "eval_loss": 9.438435554504395, "eval_runtime": 2.4601, "eval_samples_per_second": 203.245, "eval_steps_per_second": 203.245, "step": 3600 }, { "epoch": 4.575411913814955, "grad_norm": 0.8860560059547424, "learning_rate": 1.8575520105593817e-05, "loss": 9.4432, "step": 3610 }, { "epoch": 4.58808618504436, "grad_norm": 0.7835094928741455, "learning_rate": 1.8326822041411524e-05, "loss": 9.4469, "step": 3620 }, { "epoch": 4.600760456273765, "grad_norm": 0.6383728384971619, "learning_rate": 1.807942593751973e-05, "loss": 9.4299, "step": 3630 }, { "epoch": 4.613434727503169, "grad_norm": 0.7200093269348145, "learning_rate": 1.783334196340331e-05, "loss": 9.4383, "step": 3640 }, { "epoch": 4.6261089987325725, "grad_norm": 0.8070472478866577, "learning_rate": 1.758858023461059e-05, "loss": 9.4506, "step": 3650 }, { "epoch": 4.638783269961977, "grad_norm": 0.9274407029151917, "learning_rate": 1.7345150812337564e-05, "loss": 9.4436, "step": 3660 }, { "epoch": 4.651457541191381, "grad_norm": 0.7327669262886047, "learning_rate": 1.7103063703014372e-05, "loss": 9.4443, "step": 3670 }, { "epoch": 4.664131812420786, "grad_norm": 0.6654913425445557, "learning_rate": 1.6862328857893854e-05, "loss": 9.4395, "step": 3680 }, { "epoch": 4.67680608365019, "grad_norm": 0.7350926995277405, "learning_rate": 1.66229561726426e-05, "loss": 9.4501, "step": 3690 }, { "epoch": 4.689480354879595, "grad_norm": 0.8466333150863647, "learning_rate": 1.6384955486934156e-05, "loss": 9.4099, "step": 3700 }, { "epoch": 4.702154626108999, "grad_norm": 0.7918135523796082, "learning_rate": 1.614833658404454e-05, "loss": 9.4164, "step": 3710 }, { "epoch": 4.714828897338403, "grad_norm": 0.711807370185852, "learning_rate": 1.5913109190450032e-05, "loss": 9.4373, "step": 3720 }, { "epoch": 4.727503168567807, "grad_norm": 0.8387446999549866, "learning_rate": 1.567928297542749e-05, "loss": 9.4319, "step": 3730 }, { "epoch": 4.740177439797211, "grad_norm": 0.8547424077987671, "learning_rate": 1.544686755065677e-05, "loss": 9.4306, "step": 3740 }, { "epoch": 4.752851711026616, "grad_norm": 0.8681782484054565, "learning_rate": 1.5215872469825682e-05, "loss": 9.4338, "step": 3750 }, { "epoch": 4.752851711026616, "eval_loss": 9.438488960266113, "eval_runtime": 2.458, "eval_samples_per_second": 203.415, "eval_steps_per_second": 203.415, "step": 3750 }, { "epoch": 4.765525982256021, "grad_norm": 0.8183345794677734, "learning_rate": 1.4986307228237268e-05, "loss": 9.4172, "step": 3760 }, { "epoch": 4.778200253485425, "grad_norm": 0.8300184011459351, "learning_rate": 1.4758181262419423e-05, "loss": 9.4118, "step": 3770 }, { "epoch": 4.7908745247148286, "grad_norm": 0.9591904878616333, "learning_rate": 1.4531503949737108e-05, "loss": 9.4409, "step": 3780 }, { "epoch": 4.803548795944233, "grad_norm": 0.8495684862136841, "learning_rate": 1.4306284608006836e-05, "loss": 9.4467, "step": 3790 }, { "epoch": 4.816223067173637, "grad_norm": 0.7406283020973206, "learning_rate": 1.4082532495113626e-05, "loss": 9.4377, "step": 3800 }, { "epoch": 4.828897338403042, "grad_norm": 0.7795657515525818, "learning_rate": 1.3860256808630428e-05, "loss": 9.4302, "step": 3810 }, { "epoch": 4.841571609632446, "grad_norm": 0.7132335305213928, "learning_rate": 1.3639466685440132e-05, "loss": 9.4175, "step": 3820 }, { "epoch": 4.854245880861851, "grad_norm": 0.7238666415214539, "learning_rate": 1.3420171201359933e-05, "loss": 9.4401, "step": 3830 }, { "epoch": 4.866920152091255, "grad_norm": 0.7022533416748047, "learning_rate": 1.3202379370768252e-05, "loss": 9.4472, "step": 3840 }, { "epoch": 4.879594423320659, "grad_norm": 0.8752282857894897, "learning_rate": 1.2986100146234232e-05, "loss": 9.4256, "step": 3850 }, { "epoch": 4.892268694550063, "grad_norm": 0.8227030634880066, "learning_rate": 1.2771342418149657e-05, "loss": 9.458, "step": 3860 }, { "epoch": 4.904942965779467, "grad_norm": 0.7343210577964783, "learning_rate": 1.2558115014363592e-05, "loss": 9.458, "step": 3870 }, { "epoch": 4.917617237008872, "grad_norm": 0.6779001355171204, "learning_rate": 1.2346426699819458e-05, "loss": 9.4274, "step": 3880 }, { "epoch": 4.930291508238277, "grad_norm": 0.8285155296325684, "learning_rate": 1.2136286176194745e-05, "loss": 9.4535, "step": 3890 }, { "epoch": 4.942965779467681, "grad_norm": 0.903825044631958, "learning_rate": 1.1927702081543279e-05, "loss": 9.4218, "step": 3900 }, { "epoch": 4.942965779467681, "eval_loss": 9.438400268554688, "eval_runtime": 2.4551, "eval_samples_per_second": 203.66, "eval_steps_per_second": 203.66, "step": 3900 }, { "epoch": 4.955640050697085, "grad_norm": 0.7940249443054199, "learning_rate": 1.1720682989940262e-05, "loss": 9.4317, "step": 3910 }, { "epoch": 4.968314321926489, "grad_norm": 0.7732440233230591, "learning_rate": 1.1515237411129698e-05, "loss": 9.4373, "step": 3920 }, { "epoch": 4.980988593155893, "grad_norm": 0.7014748454093933, "learning_rate": 1.1311373790174657e-05, "loss": 9.4445, "step": 3930 }, { "epoch": 4.993662864385298, "grad_norm": 0.840886652469635, "learning_rate": 1.1109100507110132e-05, "loss": 9.4246, "step": 3940 }, { "epoch": 5.006337135614702, "grad_norm": 0.7634750008583069, "learning_rate": 1.090842587659851e-05, "loss": 9.4296, "step": 3950 }, { "epoch": 5.019011406844107, "grad_norm": 0.7690792083740234, "learning_rate": 1.0709358147587884e-05, "loss": 9.4392, "step": 3960 }, { "epoch": 5.031685678073511, "grad_norm": 0.6004303097724915, "learning_rate": 1.0511905502972886e-05, "loss": 9.4415, "step": 3970 }, { "epoch": 5.044359949302915, "grad_norm": 0.8244099020957947, "learning_rate": 1.031607605925839e-05, "loss": 9.4223, "step": 3980 }, { "epoch": 5.057034220532319, "grad_norm": 0.7591854333877563, "learning_rate": 1.0121877866225781e-05, "loss": 9.4332, "step": 3990 }, { "epoch": 5.069708491761724, "grad_norm": 0.7974857687950134, "learning_rate": 9.929318906602175e-06, "loss": 9.4315, "step": 4000 }, { "epoch": 5.082382762991128, "grad_norm": 0.8457611799240112, "learning_rate": 9.738407095732195e-06, "loss": 9.4383, "step": 4010 }, { "epoch": 5.095057034220532, "grad_norm": 0.6683225035667419, "learning_rate": 9.549150281252633e-06, "loss": 9.4374, "step": 4020 }, { "epoch": 5.107731305449937, "grad_norm": 0.8315187692642212, "learning_rate": 9.36155624276987e-06, "loss": 9.4536, "step": 4030 }, { "epoch": 5.120405576679341, "grad_norm": 0.8504251837730408, "learning_rate": 9.175632691540065e-06, "loss": 9.4555, "step": 4040 }, { "epoch": 5.133079847908745, "grad_norm": 0.8719484210014343, "learning_rate": 8.991387270152201e-06, "loss": 9.4306, "step": 4050 }, { "epoch": 5.133079847908745, "eval_loss": 9.438368797302246, "eval_runtime": 2.4656, "eval_samples_per_second": 202.787, "eval_steps_per_second": 202.787, "step": 4050 }, { "epoch": 5.145754119138149, "grad_norm": 0.8396516442298889, "learning_rate": 8.808827552213916e-06, "loss": 9.433, "step": 4060 }, { "epoch": 5.158428390367554, "grad_norm": 1.0201669931411743, "learning_rate": 8.627961042040184e-06, "loss": 9.4265, "step": 4070 }, { "epoch": 5.171102661596958, "grad_norm": 0.7764204144477844, "learning_rate": 8.448795174344804e-06, "loss": 9.4363, "step": 4080 }, { "epoch": 5.183776932826363, "grad_norm": 0.7810952067375183, "learning_rate": 8.271337313934869e-06, "loss": 9.4256, "step": 4090 }, { "epoch": 5.196451204055767, "grad_norm": 0.6855703592300415, "learning_rate": 8.09559475540797e-06, "loss": 9.4148, "step": 4100 }, { "epoch": 5.2091254752851714, "grad_norm": 0.7424513697624207, "learning_rate": 7.921574722852343e-06, "loss": 9.4451, "step": 4110 }, { "epoch": 5.221799746514575, "grad_norm": 0.721655547618866, "learning_rate": 7.749284369549953e-06, "loss": 9.418, "step": 4120 }, { "epoch": 5.23447401774398, "grad_norm": 0.5920586585998535, "learning_rate": 7.578730777682386e-06, "loss": 9.4412, "step": 4130 }, { "epoch": 5.247148288973384, "grad_norm": 0.7419296503067017, "learning_rate": 7.409920958039795e-06, "loss": 9.4447, "step": 4140 }, { "epoch": 5.259822560202788, "grad_norm": 0.6374722123146057, "learning_rate": 7.242861849732696e-06, "loss": 9.4362, "step": 4150 }, { "epoch": 5.272496831432193, "grad_norm": 0.8588663339614868, "learning_rate": 7.077560319906695e-06, "loss": 9.4211, "step": 4160 }, { "epoch": 5.285171102661597, "grad_norm": 0.7311620712280273, "learning_rate": 6.9140231634602485e-06, "loss": 9.4374, "step": 4170 }, { "epoch": 5.297845373891001, "grad_norm": 0.8202149271965027, "learning_rate": 6.752257102765325e-06, "loss": 9.4303, "step": 4180 }, { "epoch": 5.310519645120405, "grad_norm": 0.930683970451355, "learning_rate": 6.592268787391076e-06, "loss": 9.4358, "step": 4190 }, { "epoch": 5.32319391634981, "grad_norm": 0.9200685620307922, "learning_rate": 6.43406479383053e-06, "loss": 9.4353, "step": 4200 }, { "epoch": 5.32319391634981, "eval_loss": 9.438363075256348, "eval_runtime": 2.4682, "eval_samples_per_second": 202.574, "eval_steps_per_second": 202.574, "step": 4200 }, { "epoch": 5.335868187579214, "grad_norm": 0.7806163430213928, "learning_rate": 6.277651625230219e-06, "loss": 9.4314, "step": 4210 }, { "epoch": 5.348542458808619, "grad_norm": 0.8083299398422241, "learning_rate": 6.12303571112286e-06, "loss": 9.434, "step": 4220 }, { "epoch": 5.361216730038023, "grad_norm": 0.7003039717674255, "learning_rate": 5.9702234071631e-06, "loss": 9.4263, "step": 4230 }, { "epoch": 5.3738910012674275, "grad_norm": 0.8378300070762634, "learning_rate": 5.819220994866237e-06, "loss": 9.4447, "step": 4240 }, { "epoch": 5.386565272496831, "grad_norm": 0.719078779220581, "learning_rate": 5.670034681349995e-06, "loss": 9.4331, "step": 4250 }, { "epoch": 5.399239543726236, "grad_norm": 0.9497711062431335, "learning_rate": 5.5226705990794155e-06, "loss": 9.4173, "step": 4260 }, { "epoch": 5.41191381495564, "grad_norm": 0.8119667768478394, "learning_rate": 5.377134805614714e-06, "loss": 9.4432, "step": 4270 }, { "epoch": 5.424588086185044, "grad_norm": 0.764972984790802, "learning_rate": 5.233433283362349e-06, "loss": 9.4333, "step": 4280 }, { "epoch": 5.437262357414449, "grad_norm": 0.8632189631462097, "learning_rate": 5.091571939329048e-06, "loss": 9.4423, "step": 4290 }, { "epoch": 5.449936628643853, "grad_norm": 0.7099254131317139, "learning_rate": 4.951556604879048e-06, "loss": 9.4395, "step": 4300 }, { "epoch": 5.462610899873257, "grad_norm": 0.7150112390518188, "learning_rate": 4.813393035494329e-06, "loss": 9.4369, "step": 4310 }, { "epoch": 5.475285171102661, "grad_norm": 0.7645243406295776, "learning_rate": 4.677086910538092e-06, "loss": 9.4517, "step": 4320 }, { "epoch": 5.487959442332066, "grad_norm": 0.8079973459243774, "learning_rate": 4.542643833021254e-06, "loss": 9.4395, "step": 4330 }, { "epoch": 5.50063371356147, "grad_norm": 0.7471894025802612, "learning_rate": 4.410069329372152e-06, "loss": 9.431, "step": 4340 }, { "epoch": 5.513307984790875, "grad_norm": 0.7337321639060974, "learning_rate": 4.279368849209381e-06, "loss": 9.4312, "step": 4350 }, { "epoch": 5.513307984790875, "eval_loss": 9.438379287719727, "eval_runtime": 2.466, "eval_samples_per_second": 202.761, "eval_steps_per_second": 202.761, "step": 4350 }, { "epoch": 5.525982256020279, "grad_norm": 0.6859441995620728, "learning_rate": 4.150547765117746e-06, "loss": 9.4248, "step": 4360 }, { "epoch": 5.5386565272496835, "grad_norm": 0.7204738855361938, "learning_rate": 4.023611372427471e-06, "loss": 9.4358, "step": 4370 }, { "epoch": 5.551330798479087, "grad_norm": 0.6800116300582886, "learning_rate": 3.898564888996476e-06, "loss": 9.429, "step": 4380 }, { "epoch": 5.564005069708491, "grad_norm": 0.7014439702033997, "learning_rate": 3.7754134549959297e-06, "loss": 9.4435, "step": 4390 }, { "epoch": 5.576679340937896, "grad_norm": 0.636579155921936, "learning_rate": 3.654162132698918e-06, "loss": 9.4212, "step": 4400 }, { "epoch": 5.589353612167301, "grad_norm": 0.8641238808631897, "learning_rate": 3.534815906272404e-06, "loss": 9.4396, "step": 4410 }, { "epoch": 5.602027883396705, "grad_norm": 0.6646918058395386, "learning_rate": 3.417379681572297e-06, "loss": 9.4336, "step": 4420 }, { "epoch": 5.614702154626109, "grad_norm": 0.7937387824058533, "learning_rate": 3.3018582859418446e-06, "loss": 9.4448, "step": 4430 }, { "epoch": 5.6273764258555135, "grad_norm": 0.7018795609474182, "learning_rate": 3.18825646801314e-06, "loss": 9.4394, "step": 4440 }, { "epoch": 5.640050697084917, "grad_norm": 0.7133225798606873, "learning_rate": 3.076578897511978e-06, "loss": 9.4361, "step": 4450 }, { "epoch": 5.652724968314322, "grad_norm": 0.7572789788246155, "learning_rate": 2.966830165065876e-06, "loss": 9.415, "step": 4460 }, { "epoch": 5.665399239543726, "grad_norm": 0.7813080549240112, "learning_rate": 2.8590147820153513e-06, "loss": 9.4267, "step": 4470 }, { "epoch": 5.678073510773131, "grad_norm": 0.8485764265060425, "learning_rate": 2.753137180228543e-06, "loss": 9.4291, "step": 4480 }, { "epoch": 5.690747782002535, "grad_norm": 0.7320175766944885, "learning_rate": 2.6492017119189417e-06, "loss": 9.4434, "step": 4490 }, { "epoch": 5.7034220532319395, "grad_norm": 0.7832790613174438, "learning_rate": 2.547212649466568e-06, "loss": 9.4319, "step": 4500 }, { "epoch": 5.7034220532319395, "eval_loss": 9.438411712646484, "eval_runtime": 2.4617, "eval_samples_per_second": 203.111, "eval_steps_per_second": 203.111, "step": 4500 }, { "epoch": 5.716096324461343, "grad_norm": 0.8835005760192871, "learning_rate": 2.4471741852423237e-06, "loss": 9.4289, "step": 4510 }, { "epoch": 5.728770595690747, "grad_norm": 0.9277816414833069, "learning_rate": 2.349090431435641e-06, "loss": 9.4264, "step": 4520 }, { "epoch": 5.741444866920152, "grad_norm": 0.7548145651817322, "learning_rate": 2.2529654198854835e-06, "loss": 9.439, "step": 4530 }, { "epoch": 5.754119138149557, "grad_norm": 0.7034807205200195, "learning_rate": 2.1588031019145636e-06, "loss": 9.422, "step": 4540 }, { "epoch": 5.766793409378961, "grad_norm": 0.7236075401306152, "learning_rate": 2.066607348166971e-06, "loss": 9.4403, "step": 4550 }, { "epoch": 5.779467680608365, "grad_norm": 0.7205497622489929, "learning_rate": 1.9763819484490355e-06, "loss": 9.4308, "step": 4560 }, { "epoch": 5.7921419518377695, "grad_norm": 0.67926424741745, "learning_rate": 1.888130611573563e-06, "loss": 9.4504, "step": 4570 }, { "epoch": 5.804816223067173, "grad_norm": 0.7654796838760376, "learning_rate": 1.8018569652073381e-06, "loss": 9.4239, "step": 4580 }, { "epoch": 5.817490494296578, "grad_norm": 0.7510716915130615, "learning_rate": 1.7175645557220566e-06, "loss": 9.4393, "step": 4590 }, { "epoch": 5.830164765525982, "grad_norm": 0.9133705496788025, "learning_rate": 1.6352568480485276e-06, "loss": 9.4242, "step": 4600 }, { "epoch": 5.842839036755387, "grad_norm": 0.8332166075706482, "learning_rate": 1.5549372255342366e-06, "loss": 9.4175, "step": 4610 }, { "epoch": 5.855513307984791, "grad_norm": 0.753537654876709, "learning_rate": 1.4766089898042678e-06, "loss": 9.4436, "step": 4620 }, { "epoch": 5.8681875792141955, "grad_norm": 0.7988256812095642, "learning_rate": 1.400275360625608e-06, "loss": 9.4301, "step": 4630 }, { "epoch": 5.880861850443599, "grad_norm": 0.8596470355987549, "learning_rate": 1.325939475774768e-06, "loss": 9.4153, "step": 4640 }, { "epoch": 5.893536121673003, "grad_norm": 0.9785693883895874, "learning_rate": 1.2536043909088191e-06, "loss": 9.4205, "step": 4650 }, { "epoch": 5.893536121673003, "eval_loss": 9.438403129577637, "eval_runtime": 2.4678, "eval_samples_per_second": 202.61, "eval_steps_per_second": 202.61, "step": 4650 }, { "epoch": 5.906210392902408, "grad_norm": 0.8406782746315002, "learning_rate": 1.183273079439795e-06, "loss": 9.4192, "step": 4660 }, { "epoch": 5.918884664131813, "grad_norm": 0.7711849212646484, "learning_rate": 1.1149484324124327e-06, "loss": 9.4368, "step": 4670 }, { "epoch": 5.931558935361217, "grad_norm": 0.8307945728302002, "learning_rate": 1.0486332583853563e-06, "loss": 9.4324, "step": 4680 }, { "epoch": 5.944233206590621, "grad_norm": 0.8221229314804077, "learning_rate": 9.843302833156376e-07, "loss": 9.4499, "step": 4690 }, { "epoch": 5.9569074778200255, "grad_norm": 0.5712777376174927, "learning_rate": 9.220421504467281e-07, "loss": 9.429, "step": 4700 }, { "epoch": 5.969581749049429, "grad_norm": 0.629254162311554, "learning_rate": 8.617714201998084e-07, "loss": 9.4374, "step": 4710 }, { "epoch": 5.982256020278834, "grad_norm": 0.5955209136009216, "learning_rate": 8.035205700685167e-07, "loss": 9.4252, "step": 4720 }, { "epoch": 5.994930291508238, "grad_norm": 0.8553429841995239, "learning_rate": 7.472919945171631e-07, "loss": 9.4295, "step": 4730 }, { "epoch": 6.007604562737643, "grad_norm": 0.8097705841064453, "learning_rate": 6.93088004882253e-07, "loss": 9.4376, "step": 4740 }, { "epoch": 6.020278833967047, "grad_norm": 0.7857598066329956, "learning_rate": 6.409108292774913e-07, "loss": 9.4366, "step": 4750 }, { "epoch": 6.032953105196452, "grad_norm": 0.7502981424331665, "learning_rate": 5.907626125022159e-07, "loss": 9.4301, "step": 4760 }, { "epoch": 6.0456273764258555, "grad_norm": 0.677966833114624, "learning_rate": 5.426454159531913e-07, "loss": 9.4219, "step": 4770 }, { "epoch": 6.05830164765526, "grad_norm": 0.8806309700012207, "learning_rate": 4.965612175399092e-07, "loss": 9.4297, "step": 4780 }, { "epoch": 6.070975918884664, "grad_norm": 0.8555340766906738, "learning_rate": 4.52511911603265e-07, "loss": 9.4228, "step": 4790 }, { "epoch": 6.083650190114068, "grad_norm": 0.7141437530517578, "learning_rate": 4.104993088376974e-07, "loss": 9.4226, "step": 4800 }, { "epoch": 6.083650190114068, "eval_loss": 9.438398361206055, "eval_runtime": 2.4899, "eval_samples_per_second": 200.807, "eval_steps_per_second": 200.807, "step": 4800 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 20847067690368.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }