{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 9130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.054764512595837894, "grad_norm": 4.206851005554199, "learning_rate": 1.095290251916758e-05, "loss": 2.3255, "step": 100 }, { "epoch": 0.10952902519167579, "grad_norm": 2.8459718227386475, "learning_rate": 2.190580503833516e-05, "loss": 1.8525, "step": 200 }, { "epoch": 0.16429353778751368, "grad_norm": 3.025334119796753, "learning_rate": 3.285870755750274e-05, "loss": 1.7615, "step": 300 }, { "epoch": 0.21905805038335158, "grad_norm": 3.115264654159546, "learning_rate": 4.381161007667032e-05, "loss": 1.7006, "step": 400 }, { "epoch": 0.2738225629791895, "grad_norm": 2.6407604217529297, "learning_rate": 5.47645125958379e-05, "loss": 1.6693, "step": 500 }, { "epoch": 0.32858707557502737, "grad_norm": 2.2635467052459717, "learning_rate": 6.571741511500547e-05, "loss": 1.6554, "step": 600 }, { "epoch": 0.3833515881708653, "grad_norm": 2.4853227138519287, "learning_rate": 7.667031763417306e-05, "loss": 1.589, "step": 700 }, { "epoch": 0.43811610076670315, "grad_norm": 2.712339162826538, "learning_rate": 8.762322015334064e-05, "loss": 1.5892, "step": 800 }, { "epoch": 0.4928806133625411, "grad_norm": 2.6400351524353027, "learning_rate": 9.857612267250822e-05, "loss": 1.5559, "step": 900 }, { "epoch": 0.547645125958379, "grad_norm": 3.203749656677246, "learning_rate": 9.997234258138696e-05, "loss": 1.5673, "step": 1000 }, { "epoch": 0.6024096385542169, "grad_norm": 2.3736095428466797, "learning_rate": 9.987226456522884e-05, "loss": 1.5324, "step": 1100 }, { "epoch": 0.6571741511500547, "grad_norm": 2.253758192062378, "learning_rate": 9.969929463456831e-05, "loss": 1.52, "step": 1200 }, { "epoch": 0.7119386637458927, "grad_norm": 2.2543797492980957, "learning_rate": 9.945368559744425e-05, "loss": 1.463, "step": 1300 }, { "epoch": 0.7667031763417306, "grad_norm": 2.094783067703247, "learning_rate": 9.913579642919276e-05, "loss": 1.4819, "step": 1400 }, { "epoch": 0.8214676889375685, "grad_norm": 2.028780937194824, "learning_rate": 9.874609174777887e-05, "loss": 1.4738, "step": 1500 }, { "epoch": 0.8762322015334063, "grad_norm": 2.2184860706329346, "learning_rate": 9.82851411347238e-05, "loss": 1.4481, "step": 1600 }, { "epoch": 0.9309967141292442, "grad_norm": 2.685318946838379, "learning_rate": 9.775361830262055e-05, "loss": 1.4143, "step": 1700 }, { "epoch": 0.9857612267250822, "grad_norm": 2.3410556316375732, "learning_rate": 9.715230011045415e-05, "loss": 1.3997, "step": 1800 }, { "epoch": 1.04052573932092, "grad_norm": 2.4343554973602295, "learning_rate": 9.648206542816636e-05, "loss": 1.2183, "step": 1900 }, { "epoch": 1.095290251916758, "grad_norm": 2.7375845909118652, "learning_rate": 9.574389385212366e-05, "loss": 1.1348, "step": 2000 }, { "epoch": 1.1500547645125958, "grad_norm": 2.488781213760376, "learning_rate": 9.493886427336657e-05, "loss": 1.1794, "step": 2100 }, { "epoch": 1.2048192771084336, "grad_norm": 2.216630697250366, "learning_rate": 9.406815330073244e-05, "loss": 1.1533, "step": 2200 }, { "epoch": 1.2595837897042717, "grad_norm": 2.5205495357513428, "learning_rate": 9.313303354115677e-05, "loss": 1.1383, "step": 2300 }, { "epoch": 1.3143483023001095, "grad_norm": 2.7046151161193848, "learning_rate": 9.213487173966623e-05, "loss": 1.1639, "step": 2400 }, { "epoch": 1.3691128148959475, "grad_norm": 2.2721831798553467, "learning_rate": 9.107512678178223e-05, "loss": 1.1572, "step": 2500 }, { "epoch": 1.4238773274917853, "grad_norm": 2.1899495124816895, "learning_rate": 8.99553475612544e-05, "loss": 1.0975, "step": 2600 }, { "epoch": 1.4786418400876231, "grad_norm": 2.541968822479248, "learning_rate": 8.877717071624055e-05, "loss": 1.1102, "step": 2700 }, { "epoch": 1.5334063526834611, "grad_norm": 1.9248064756393433, "learning_rate": 8.754231823724187e-05, "loss": 1.1012, "step": 2800 }, { "epoch": 1.588170865279299, "grad_norm": 2.002230644226074, "learning_rate": 8.62525949502896e-05, "loss": 1.0992, "step": 2900 }, { "epoch": 1.642935377875137, "grad_norm": 2.2088050842285156, "learning_rate": 8.490988587906137e-05, "loss": 1.1102, "step": 3000 }, { "epoch": 1.6976998904709748, "grad_norm": 2.2829463481903076, "learning_rate": 8.351615348978318e-05, "loss": 1.1058, "step": 3100 }, { "epoch": 1.7524644030668126, "grad_norm": 2.018911600112915, "learning_rate": 8.207343482294323e-05, "loss": 1.0693, "step": 3200 }, { "epoch": 1.8072289156626506, "grad_norm": 1.948006272315979, "learning_rate": 8.058383851601027e-05, "loss": 1.0797, "step": 3300 }, { "epoch": 1.8619934282584885, "grad_norm": 1.9828593730926514, "learning_rate": 7.904954172150776e-05, "loss": 1.0454, "step": 3400 }, { "epoch": 1.9167579408543265, "grad_norm": 2.2209079265594482, "learning_rate": 7.747278692494825e-05, "loss": 1.0665, "step": 3500 }, { "epoch": 1.9715224534501643, "grad_norm": 2.0689291954040527, "learning_rate": 7.585587866727898e-05, "loss": 1.0571, "step": 3600 }, { "epoch": 2.026286966046002, "grad_norm": 1.5938904285430908, "learning_rate": 7.420118017662894e-05, "loss": 0.8671, "step": 3700 }, { "epoch": 2.08105147864184, "grad_norm": 2.3023173809051514, "learning_rate": 7.251110991428034e-05, "loss": 0.6557, "step": 3800 }, { "epoch": 2.135815991237678, "grad_norm": 1.6988605260849, "learning_rate": 7.07881380399129e-05, "loss": 0.6836, "step": 3900 }, { "epoch": 2.190580503833516, "grad_norm": 1.5740976333618164, "learning_rate": 6.903478280128721e-05, "loss": 0.6712, "step": 4000 }, { "epoch": 2.245345016429354, "grad_norm": 2.228999137878418, "learning_rate": 6.725360685364384e-05, "loss": 0.668, "step": 4100 }, { "epoch": 2.3001095290251916, "grad_norm": 2.125016450881958, "learning_rate": 6.54472135141977e-05, "loss": 0.6735, "step": 4200 }, { "epoch": 2.3548740416210294, "grad_norm": 2.3794703483581543, "learning_rate": 6.361824295720199e-05, "loss": 0.6594, "step": 4300 }, { "epoch": 2.4096385542168672, "grad_norm": 2.231340169906616, "learning_rate": 6.176936835514312e-05, "loss": 0.6691, "step": 4400 }, { "epoch": 2.4644030668127055, "grad_norm": 2.1866331100463867, "learning_rate": 5.9903291971706e-05, "loss": 0.687, "step": 4500 }, { "epoch": 2.5191675794085433, "grad_norm": 1.7319729328155518, "learning_rate": 5.8022741212220623e-05, "loss": 0.6605, "step": 4600 }, { "epoch": 2.573932092004381, "grad_norm": 2.021991729736328, "learning_rate": 5.6130464637362466e-05, "loss": 0.6499, "step": 4700 }, { "epoch": 2.628696604600219, "grad_norm": 2.0492424964904785, "learning_rate": 5.4229227945932446e-05, "loss": 0.6634, "step": 4800 }, { "epoch": 2.6834611171960567, "grad_norm": 1.9110430479049683, "learning_rate": 5.2321809932588664e-05, "loss": 0.6407, "step": 4900 }, { "epoch": 2.738225629791895, "grad_norm": 2.151268482208252, "learning_rate": 5.041099842643736e-05, "loss": 0.6336, "step": 5000 }, { "epoch": 2.792990142387733, "grad_norm": 1.8292102813720703, "learning_rate": 4.849958621641945e-05, "loss": 0.6421, "step": 5100 }, { "epoch": 2.8477546549835706, "grad_norm": 1.5233772993087769, "learning_rate": 4.659036696944793e-05, "loss": 0.6052, "step": 5200 }, { "epoch": 2.9025191675794084, "grad_norm": 2.1103007793426514, "learning_rate": 4.4686131147261994e-05, "loss": 0.6316, "step": 5300 }, { "epoch": 2.9572836801752462, "grad_norm": 1.618807315826416, "learning_rate": 4.2789661927965795e-05, "loss": 0.635, "step": 5400 }, { "epoch": 3.0120481927710845, "grad_norm": 1.661723256111145, "learning_rate": 4.090373113821281e-05, "loss": 0.553, "step": 5500 }, { "epoch": 3.0668127053669223, "grad_norm": 1.5551363229751587, "learning_rate": 3.9031095201980976e-05, "loss": 0.3049, "step": 5600 }, { "epoch": 3.12157721796276, "grad_norm": 1.7245244979858398, "learning_rate": 3.717449111186025e-05, "loss": 0.3081, "step": 5700 }, { "epoch": 3.176341730558598, "grad_norm": 1.4479619264602661, "learning_rate": 3.5336632428740265e-05, "loss": 0.3103, "step": 5800 }, { "epoch": 3.2311062431544357, "grad_norm": 2.026216745376587, "learning_rate": 3.352020531574527e-05, "loss": 0.3035, "step": 5900 }, { "epoch": 3.285870755750274, "grad_norm": 1.2143455743789673, "learning_rate": 3.172786461221279e-05, "loss": 0.3146, "step": 6000 }, { "epoch": 3.340635268346112, "grad_norm": 1.7206776142120361, "learning_rate": 2.996222995345437e-05, "loss": 0.3138, "step": 6100 }, { "epoch": 3.3953997809419496, "grad_norm": 1.966302514076233, "learning_rate": 2.822588194196941e-05, "loss": 0.3059, "step": 6200 }, { "epoch": 3.4501642935377874, "grad_norm": 1.6685985326766968, "learning_rate": 2.6521358375708428e-05, "loss": 0.3006, "step": 6300 }, { "epoch": 3.5049288061336252, "grad_norm": 1.6338154077529907, "learning_rate": 2.4851150538898028e-05, "loss": 0.3017, "step": 6400 }, { "epoch": 3.5596933187294635, "grad_norm": 1.6607939004898071, "learning_rate": 2.321769956084937e-05, "loss": 0.2991, "step": 6500 }, { "epoch": 3.6144578313253013, "grad_norm": 1.7949954271316528, "learning_rate": 2.1623392848071354e-05, "loss": 0.3045, "step": 6600 }, { "epoch": 3.669222343921139, "grad_norm": 1.6750141382217407, "learning_rate": 2.007056059490364e-05, "loss": 0.3077, "step": 6700 }, { "epoch": 3.723986856516977, "grad_norm": 1.425621509552002, "learning_rate": 1.856147237776956e-05, "loss": 0.2989, "step": 6800 }, { "epoch": 3.7787513691128147, "grad_norm": 1.534443736076355, "learning_rate": 1.7098333838026275e-05, "loss": 0.2975, "step": 6900 }, { "epoch": 3.833515881708653, "grad_norm": 1.511762261390686, "learning_rate": 1.5683283458260718e-05, "loss": 0.2968, "step": 7000 }, { "epoch": 3.888280394304491, "grad_norm": 1.968000054359436, "learning_rate": 1.4318389436742962e-05, "loss": 0.2907, "step": 7100 }, { "epoch": 3.9430449069003286, "grad_norm": 1.5500705242156982, "learning_rate": 1.3005646664605165e-05, "loss": 0.2922, "step": 7200 }, { "epoch": 3.9978094194961664, "grad_norm": 1.7522929906845093, "learning_rate": 1.1746973810164147e-05, "loss": 0.2815, "step": 7300 }, { "epoch": 4.052573932092004, "grad_norm": 0.8427908420562744, "learning_rate": 1.0544210514649233e-05, "loss": 0.1758, "step": 7400 }, { "epoch": 4.1073384446878425, "grad_norm": 0.8239675760269165, "learning_rate": 9.399114703433688e-06, "loss": 0.17, "step": 7500 }, { "epoch": 4.16210295728368, "grad_norm": 0.85968416929245, "learning_rate": 8.313360016700011e-06, "loss": 0.1679, "step": 7600 }, { "epoch": 4.216867469879518, "grad_norm": 0.9793341159820557, "learning_rate": 7.288533363293959e-06, "loss": 0.1719, "step": 7700 }, { "epoch": 4.271631982475356, "grad_norm": 0.9244991540908813, "learning_rate": 6.32613260134271e-06, "loss": 0.1718, "step": 7800 }, { "epoch": 4.326396495071194, "grad_norm": 1.2018598318099976, "learning_rate": 5.427564349027098e-06, "loss": 0.1669, "step": 7900 }, { "epoch": 4.381161007667032, "grad_norm": 0.9171732664108276, "learning_rate": 4.594141928707629e-06, "loss": 0.1685, "step": 8000 }, { "epoch": 4.435925520262869, "grad_norm": 0.9013499021530151, "learning_rate": 3.8270834474090466e-06, "loss": 0.1684, "step": 8100 }, { "epoch": 4.490690032858708, "grad_norm": 0.8273316621780396, "learning_rate": 3.1275100164689543e-06, "loss": 0.1666, "step": 8200 }, { "epoch": 4.545454545454545, "grad_norm": 0.8490225076675415, "learning_rate": 2.496444112952734e-06, "loss": 0.1709, "step": 8300 }, { "epoch": 4.600219058050383, "grad_norm": 0.9616093635559082, "learning_rate": 1.9348080852294783e-06, "loss": 0.1676, "step": 8400 }, { "epoch": 4.6549835706462215, "grad_norm": 0.6257395148277283, "learning_rate": 1.4434228048932796e-06, "loss": 0.1657, "step": 8500 }, { "epoch": 4.709748083242059, "grad_norm": 1.020652174949646, "learning_rate": 1.023006467000115e-06, "loss": 0.1708, "step": 8600 }, { "epoch": 4.764512595837897, "grad_norm": 1.2859959602355957, "learning_rate": 6.741735403739901e-07, "loss": 0.1712, "step": 8700 }, { "epoch": 4.8192771084337345, "grad_norm": 0.8666063547134399, "learning_rate": 3.974338695163393e-07, "loss": 0.1657, "step": 8800 }, { "epoch": 4.874041621029573, "grad_norm": 0.6807364821434021, "learning_rate": 1.9319192943152986e-07, "loss": 0.1675, "step": 8900 }, { "epoch": 4.928806133625411, "grad_norm": 1.1967036724090576, "learning_rate": 6.174623445742155e-08, "loss": 0.1721, "step": 9000 }, { "epoch": 4.983570646221248, "grad_norm": 0.9348780512809753, "learning_rate": 3.2889019651372033e-09, "loss": 0.1666, "step": 9100 } ], "logging_steps": 100, "max_steps": 9130, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.3866120063130337e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }