{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02620407735443635, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002620407735443635, "grad_norm": 1.2820378541946411, "learning_rate": 4.999563265377426e-05, "loss": 0.6609, "step": 10 }, { "epoch": 0.000524081547088727, "grad_norm": 1.3106062412261963, "learning_rate": 4.999126530754852e-05, "loss": 0.8066, "step": 20 }, { "epoch": 0.0007861223206330905, "grad_norm": 1.6985974311828613, "learning_rate": 4.9986897961322784e-05, "loss": 0.6863, "step": 30 }, { "epoch": 0.001048163094177454, "grad_norm": 1.2552741765975952, "learning_rate": 4.9982530615097044e-05, "loss": 0.5674, "step": 40 }, { "epoch": 0.0013102038677218176, "grad_norm": 1.3928155899047852, "learning_rate": 4.9978163268871304e-05, "loss": 0.5992, "step": 50 }, { "epoch": 0.001572244641266181, "grad_norm": 1.5794411897659302, "learning_rate": 4.9973795922645565e-05, "loss": 0.7939, "step": 60 }, { "epoch": 0.0018342854148105446, "grad_norm": 1.4024512767791748, "learning_rate": 4.9969428576419825e-05, "loss": 0.6178, "step": 70 }, { "epoch": 0.002096326188354908, "grad_norm": 1.2232664823532104, "learning_rate": 4.9965061230194085e-05, "loss": 0.7544, "step": 80 }, { "epoch": 0.0023583669618992714, "grad_norm": 1.5368870496749878, "learning_rate": 4.9960693883968346e-05, "loss": 0.6645, "step": 90 }, { "epoch": 0.002620407735443635, "grad_norm": 1.6177372932434082, "learning_rate": 4.9956326537742606e-05, "loss": 0.6329, "step": 100 }, { "epoch": 0.0028824485089879986, "grad_norm": 1.3803173303604126, "learning_rate": 4.995195919151687e-05, "loss": 0.7114, "step": 110 }, { "epoch": 0.003144489282532362, "grad_norm": 1.5191670656204224, "learning_rate": 4.9947591845291134e-05, "loss": 0.8717, "step": 120 }, { "epoch": 0.0034065300560767254, "grad_norm": 1.2967548370361328, "learning_rate": 4.994322449906539e-05, "loss": 0.7618, "step": 130 }, { "epoch": 0.003668570829621089, "grad_norm": 1.8742738962173462, "learning_rate": 4.9938857152839655e-05, "loss": 0.5896, "step": 140 }, { "epoch": 0.003930611603165452, "grad_norm": 1.697966456413269, "learning_rate": 4.993448980661391e-05, "loss": 0.7493, "step": 150 }, { "epoch": 0.004192652376709816, "grad_norm": 1.5282775163650513, "learning_rate": 4.9930122460388175e-05, "loss": 0.6359, "step": 160 }, { "epoch": 0.00445469315025418, "grad_norm": 1.2776225805282593, "learning_rate": 4.992575511416243e-05, "loss": 0.7677, "step": 170 }, { "epoch": 0.004716733923798543, "grad_norm": 1.2036848068237305, "learning_rate": 4.9921387767936696e-05, "loss": 0.6234, "step": 180 }, { "epoch": 0.0049787746973429066, "grad_norm": 1.1325912475585938, "learning_rate": 4.991702042171095e-05, "loss": 0.624, "step": 190 }, { "epoch": 0.00524081547088727, "grad_norm": 1.3846409320831299, "learning_rate": 4.991265307548522e-05, "loss": 0.5979, "step": 200 }, { "epoch": 0.005502856244431633, "grad_norm": 1.9792439937591553, "learning_rate": 4.990828572925948e-05, "loss": 0.7897, "step": 210 }, { "epoch": 0.005764897017975997, "grad_norm": 1.8546253442764282, "learning_rate": 4.990391838303374e-05, "loss": 0.7035, "step": 220 }, { "epoch": 0.006026937791520361, "grad_norm": 1.5434975624084473, "learning_rate": 4.9899551036808e-05, "loss": 0.7092, "step": 230 }, { "epoch": 0.006288978565064724, "grad_norm": 1.0328209400177002, "learning_rate": 4.989518369058226e-05, "loss": 0.6239, "step": 240 }, { "epoch": 0.006551019338609088, "grad_norm": 0.9389006495475769, "learning_rate": 4.989081634435652e-05, "loss": 0.6957, "step": 250 }, { "epoch": 0.006813060112153451, "grad_norm": 1.0274962186813354, "learning_rate": 4.988644899813078e-05, "loss": 0.5302, "step": 260 }, { "epoch": 0.0070751008856978145, "grad_norm": 1.2626285552978516, "learning_rate": 4.988208165190504e-05, "loss": 0.6541, "step": 270 }, { "epoch": 0.007337141659242178, "grad_norm": 1.4558316469192505, "learning_rate": 4.98777143056793e-05, "loss": 0.7284, "step": 280 }, { "epoch": 0.007599182432786541, "grad_norm": 1.3997328281402588, "learning_rate": 4.987334695945356e-05, "loss": 0.6888, "step": 290 }, { "epoch": 0.007861223206330904, "grad_norm": 1.7742432355880737, "learning_rate": 4.986897961322782e-05, "loss": 0.6869, "step": 300 }, { "epoch": 0.008123263979875268, "grad_norm": 1.4925923347473145, "learning_rate": 4.986461226700208e-05, "loss": 0.731, "step": 310 }, { "epoch": 0.008385304753419632, "grad_norm": 1.5941082239151, "learning_rate": 4.986024492077634e-05, "loss": 0.7149, "step": 320 }, { "epoch": 0.008647345526963996, "grad_norm": 1.507450819015503, "learning_rate": 4.98558775745506e-05, "loss": 0.6443, "step": 330 }, { "epoch": 0.00890938630050836, "grad_norm": 0.9866214990615845, "learning_rate": 4.985151022832487e-05, "loss": 0.6003, "step": 340 }, { "epoch": 0.009171427074052723, "grad_norm": 0.9682250022888184, "learning_rate": 4.984714288209912e-05, "loss": 0.6602, "step": 350 }, { "epoch": 0.009433467847597085, "grad_norm": 1.7567181587219238, "learning_rate": 4.984277553587338e-05, "loss": 0.5647, "step": 360 }, { "epoch": 0.00969550862114145, "grad_norm": 1.4297257661819458, "learning_rate": 4.9838408189647643e-05, "loss": 0.6628, "step": 370 }, { "epoch": 0.009957549394685813, "grad_norm": 1.515763521194458, "learning_rate": 4.9834040843421904e-05, "loss": 0.568, "step": 380 }, { "epoch": 0.010219590168230177, "grad_norm": 1.1806342601776123, "learning_rate": 4.9829673497196164e-05, "loss": 0.6729, "step": 390 }, { "epoch": 0.01048163094177454, "grad_norm": 1.1328020095825195, "learning_rate": 4.9825306150970425e-05, "loss": 0.6963, "step": 400 }, { "epoch": 0.010743671715318903, "grad_norm": 0.7740004658699036, "learning_rate": 4.9820938804744685e-05, "loss": 0.6654, "step": 410 }, { "epoch": 0.011005712488863267, "grad_norm": 0.9519413113594055, "learning_rate": 4.9816571458518945e-05, "loss": 0.6487, "step": 420 }, { "epoch": 0.01126775326240763, "grad_norm": 0.8964847922325134, "learning_rate": 4.981220411229321e-05, "loss": 0.5667, "step": 430 }, { "epoch": 0.011529794035951994, "grad_norm": 1.428072452545166, "learning_rate": 4.9807836766067466e-05, "loss": 0.8164, "step": 440 }, { "epoch": 0.011791834809496358, "grad_norm": 1.4375147819519043, "learning_rate": 4.980346941984173e-05, "loss": 0.5476, "step": 450 }, { "epoch": 0.012053875583040722, "grad_norm": 1.1702146530151367, "learning_rate": 4.979910207361599e-05, "loss": 0.7342, "step": 460 }, { "epoch": 0.012315916356585084, "grad_norm": 1.2703320980072021, "learning_rate": 4.9794734727390254e-05, "loss": 0.5767, "step": 470 }, { "epoch": 0.012577957130129448, "grad_norm": 1.2520267963409424, "learning_rate": 4.979036738116451e-05, "loss": 0.5969, "step": 480 }, { "epoch": 0.012839997903673812, "grad_norm": 1.413979172706604, "learning_rate": 4.9786000034938775e-05, "loss": 0.7011, "step": 490 }, { "epoch": 0.013102038677218176, "grad_norm": 1.3918565511703491, "learning_rate": 4.978163268871303e-05, "loss": 0.6041, "step": 500 }, { "epoch": 0.01336407945076254, "grad_norm": 0.9175894260406494, "learning_rate": 4.9777265342487296e-05, "loss": 0.5052, "step": 510 }, { "epoch": 0.013626120224306901, "grad_norm": 1.296505331993103, "learning_rate": 4.9772897996261556e-05, "loss": 0.6076, "step": 520 }, { "epoch": 0.013888160997851265, "grad_norm": 1.2490183115005493, "learning_rate": 4.9768530650035816e-05, "loss": 0.6287, "step": 530 }, { "epoch": 0.014150201771395629, "grad_norm": 1.398285984992981, "learning_rate": 4.976416330381008e-05, "loss": 0.7905, "step": 540 }, { "epoch": 0.014412242544939993, "grad_norm": 1.3094829320907593, "learning_rate": 4.975979595758434e-05, "loss": 0.5133, "step": 550 }, { "epoch": 0.014674283318484357, "grad_norm": 1.7128199338912964, "learning_rate": 4.97554286113586e-05, "loss": 0.824, "step": 560 }, { "epoch": 0.014936324092028719, "grad_norm": 1.1319103240966797, "learning_rate": 4.975106126513286e-05, "loss": 0.5893, "step": 570 }, { "epoch": 0.015198364865573083, "grad_norm": 1.6176029443740845, "learning_rate": 4.974669391890712e-05, "loss": 0.581, "step": 580 }, { "epoch": 0.015460405639117446, "grad_norm": 1.590836524963379, "learning_rate": 4.974232657268138e-05, "loss": 0.5814, "step": 590 }, { "epoch": 0.01572244641266181, "grad_norm": 1.6922227144241333, "learning_rate": 4.973795922645564e-05, "loss": 0.5917, "step": 600 }, { "epoch": 0.015984487186206174, "grad_norm": 1.3479337692260742, "learning_rate": 4.97335918802299e-05, "loss": 0.7356, "step": 610 }, { "epoch": 0.016246527959750536, "grad_norm": 2.2019124031066895, "learning_rate": 4.972922453400416e-05, "loss": 0.6197, "step": 620 }, { "epoch": 0.016508568733294902, "grad_norm": 1.6211423873901367, "learning_rate": 4.972485718777842e-05, "loss": 0.6299, "step": 630 }, { "epoch": 0.016770609506839264, "grad_norm": 1.157416582107544, "learning_rate": 4.972048984155268e-05, "loss": 0.6343, "step": 640 }, { "epoch": 0.01703265028038363, "grad_norm": 1.2576712369918823, "learning_rate": 4.971612249532695e-05, "loss": 0.6126, "step": 650 }, { "epoch": 0.01729469105392799, "grad_norm": 1.3852715492248535, "learning_rate": 4.97117551491012e-05, "loss": 0.6352, "step": 660 }, { "epoch": 0.017556731827472354, "grad_norm": 1.0178048610687256, "learning_rate": 4.970738780287547e-05, "loss": 0.5923, "step": 670 }, { "epoch": 0.01781877260101672, "grad_norm": 0.8760583996772766, "learning_rate": 4.970302045664972e-05, "loss": 0.6158, "step": 680 }, { "epoch": 0.01808081337456108, "grad_norm": 0.8956984281539917, "learning_rate": 4.969865311042398e-05, "loss": 0.5746, "step": 690 }, { "epoch": 0.018342854148105447, "grad_norm": 1.1126501560211182, "learning_rate": 4.969428576419824e-05, "loss": 0.6254, "step": 700 }, { "epoch": 0.01860489492164981, "grad_norm": 1.168455958366394, "learning_rate": 4.96899184179725e-05, "loss": 0.652, "step": 710 }, { "epoch": 0.01886693569519417, "grad_norm": 1.3628567457199097, "learning_rate": 4.9685551071746764e-05, "loss": 0.5789, "step": 720 }, { "epoch": 0.019128976468738536, "grad_norm": 1.1971865892410278, "learning_rate": 4.9681183725521024e-05, "loss": 0.6408, "step": 730 }, { "epoch": 0.0193910172422829, "grad_norm": 1.1916868686676025, "learning_rate": 4.9676816379295285e-05, "loss": 0.6461, "step": 740 }, { "epoch": 0.019653058015827264, "grad_norm": 1.1797837018966675, "learning_rate": 4.9672449033069545e-05, "loss": 0.5843, "step": 750 }, { "epoch": 0.019915098789371626, "grad_norm": 0.7941935658454895, "learning_rate": 4.966808168684381e-05, "loss": 0.6165, "step": 760 }, { "epoch": 0.02017713956291599, "grad_norm": 1.4876329898834229, "learning_rate": 4.9663714340618066e-05, "loss": 0.6347, "step": 770 }, { "epoch": 0.020439180336460354, "grad_norm": 1.1482038497924805, "learning_rate": 4.965934699439233e-05, "loss": 0.5662, "step": 780 }, { "epoch": 0.020701221110004716, "grad_norm": 1.3942419290542603, "learning_rate": 4.9654979648166586e-05, "loss": 0.6189, "step": 790 }, { "epoch": 0.02096326188354908, "grad_norm": 0.8826277256011963, "learning_rate": 4.9650612301940854e-05, "loss": 0.6801, "step": 800 }, { "epoch": 0.021225302657093444, "grad_norm": 1.3729712963104248, "learning_rate": 4.964624495571511e-05, "loss": 0.5789, "step": 810 }, { "epoch": 0.021487343430637806, "grad_norm": 0.747199296951294, "learning_rate": 4.9641877609489374e-05, "loss": 0.651, "step": 820 }, { "epoch": 0.02174938420418217, "grad_norm": 0.7911145091056824, "learning_rate": 4.963751026326363e-05, "loss": 0.6834, "step": 830 }, { "epoch": 0.022011424977726533, "grad_norm": 1.1725844144821167, "learning_rate": 4.9633142917037895e-05, "loss": 0.6687, "step": 840 }, { "epoch": 0.0222734657512709, "grad_norm": 1.2759829759597778, "learning_rate": 4.9628775570812156e-05, "loss": 0.6612, "step": 850 }, { "epoch": 0.02253550652481526, "grad_norm": 1.497684359550476, "learning_rate": 4.9624408224586416e-05, "loss": 0.686, "step": 860 }, { "epoch": 0.022797547298359623, "grad_norm": 1.4431102275848389, "learning_rate": 4.9620040878360676e-05, "loss": 0.5838, "step": 870 }, { "epoch": 0.02305958807190399, "grad_norm": 0.8864196538925171, "learning_rate": 4.961567353213494e-05, "loss": 0.6076, "step": 880 }, { "epoch": 0.02332162884544835, "grad_norm": 1.4421597719192505, "learning_rate": 4.96113061859092e-05, "loss": 0.6669, "step": 890 }, { "epoch": 0.023583669618992716, "grad_norm": 1.541601300239563, "learning_rate": 4.960693883968346e-05, "loss": 0.477, "step": 900 }, { "epoch": 0.02384571039253708, "grad_norm": 1.0725853443145752, "learning_rate": 4.960257149345772e-05, "loss": 0.5399, "step": 910 }, { "epoch": 0.024107751166081444, "grad_norm": 1.4579834938049316, "learning_rate": 4.959820414723198e-05, "loss": 0.636, "step": 920 }, { "epoch": 0.024369791939625806, "grad_norm": 1.1018449068069458, "learning_rate": 4.959383680100624e-05, "loss": 0.7731, "step": 930 }, { "epoch": 0.024631832713170168, "grad_norm": 1.3531861305236816, "learning_rate": 4.95894694547805e-05, "loss": 0.7017, "step": 940 }, { "epoch": 0.024893873486714534, "grad_norm": 1.1225773096084595, "learning_rate": 4.958510210855476e-05, "loss": 0.7228, "step": 950 }, { "epoch": 0.025155914260258896, "grad_norm": 0.6442508697509766, "learning_rate": 4.958073476232902e-05, "loss": 0.4421, "step": 960 }, { "epoch": 0.02541795503380326, "grad_norm": 1.1338638067245483, "learning_rate": 4.957636741610328e-05, "loss": 0.6533, "step": 970 }, { "epoch": 0.025679995807347623, "grad_norm": 1.0796573162078857, "learning_rate": 4.957200006987755e-05, "loss": 0.6407, "step": 980 }, { "epoch": 0.025942036580891985, "grad_norm": 1.001578450202942, "learning_rate": 4.95676327236518e-05, "loss": 0.652, "step": 990 }, { "epoch": 0.02620407735443635, "grad_norm": 1.521545648574829, "learning_rate": 4.956326537742607e-05, "loss": 0.4812, "step": 1000 } ], "logging_steps": 10, "max_steps": 114486, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3739037466624000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }