{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.849372384937238, "eval_steps": 500, "global_step": 708, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17, "grad_norm": 5.769251346588135, "learning_rate": 4.9975392245612254e-05, "loss": 2.1374, "step": 10 }, { "epoch": 0.33, "grad_norm": 5.128970623016357, "learning_rate": 4.9901617425775067e-05, "loss": 0.4576, "step": 20 }, { "epoch": 0.5, "grad_norm": 4.582453727722168, "learning_rate": 4.9778820775100185e-05, "loss": 0.2497, "step": 30 }, { "epoch": 0.67, "grad_norm": 1.4966882467269897, "learning_rate": 4.9607244033573156e-05, "loss": 0.1604, "step": 40 }, { "epoch": 0.84, "grad_norm": 3.9401819705963135, "learning_rate": 4.93872249706591e-05, "loss": 0.1206, "step": 50 }, { "epoch": 1.0, "grad_norm": 1.5695924758911133, "learning_rate": 4.91191967203629e-05, "loss": 0.1109, "step": 60 }, { "epoch": 1.17, "grad_norm": 1.77476167678833, "learning_rate": 4.8803686928552736e-05, "loss": 0.057, "step": 70 }, { "epoch": 1.34, "grad_norm": 2.8595950603485107, "learning_rate": 4.84413167142257e-05, "loss": 0.0569, "step": 80 }, { "epoch": 1.51, "grad_norm": 3.9191925525665283, "learning_rate": 4.803279944676032e-05, "loss": 0.0634, "step": 90 }, { "epoch": 1.67, "grad_norm": 2.3463213443756104, "learning_rate": 4.7578939341563095e-05, "loss": 0.0634, "step": 100 }, { "epoch": 1.84, "grad_norm": 0.22609496116638184, "learning_rate": 4.70806298768736e-05, "loss": 0.0437, "step": 110 }, { "epoch": 2.01, "grad_norm": 2.9491143226623535, "learning_rate": 4.653885203484515e-05, "loss": 0.0627, "step": 120 }, { "epoch": 2.18, "grad_norm": 1.0097993612289429, "learning_rate": 4.595467237036329e-05, "loss": 0.0361, "step": 130 }, { "epoch": 2.34, "grad_norm": 2.7911064624786377, "learning_rate": 4.532924091140417e-05, "loss": 0.0267, "step": 140 }, { "epoch": 2.51, "grad_norm": 1.8213822841644287, "learning_rate": 4.466378889506607e-05, "loss": 0.0362, "step": 150 }, { "epoch": 2.68, "grad_norm": 0.4782440960407257, "learning_rate": 4.395962634373097e-05, "loss": 0.0403, "step": 160 }, { "epoch": 2.85, "grad_norm": 1.6825015544891357, "learning_rate": 4.3218139486127854e-05, "loss": 0.0302, "step": 170 }, { "epoch": 3.01, "grad_norm": 0.773872971534729, "learning_rate": 4.2440788028374624e-05, "loss": 0.048, "step": 180 }, { "epoch": 3.18, "grad_norm": 0.590453565120697, "learning_rate": 4.1629102280370904e-05, "loss": 0.0219, "step": 190 }, { "epoch": 3.35, "grad_norm": 0.30445072054862976, "learning_rate": 4.0784680143198836e-05, "loss": 0.0215, "step": 200 }, { "epoch": 3.51, "grad_norm": 0.2872461676597595, "learning_rate": 3.990918396346254e-05, "loss": 0.0264, "step": 210 }, { "epoch": 3.68, "grad_norm": 0.9185941219329834, "learning_rate": 3.900433726075865e-05, "loss": 0.029, "step": 220 }, { "epoch": 3.85, "grad_norm": 1.8721823692321777, "learning_rate": 3.8071921334720696e-05, "loss": 0.031, "step": 230 }, { "epoch": 4.02, "grad_norm": 0.34600716829299927, "learning_rate": 3.711377175831626e-05, "loss": 0.0205, "step": 240 }, { "epoch": 4.18, "grad_norm": 0.4261312484741211, "learning_rate": 3.613177476430079e-05, "loss": 0.0172, "step": 250 }, { "epoch": 4.35, "grad_norm": 1.4647105932235718, "learning_rate": 3.512786353194134e-05, "loss": 0.0162, "step": 260 }, { "epoch": 4.52, "grad_norm": 0.12821082770824432, "learning_rate": 3.410401438132056e-05, "loss": 0.0136, "step": 270 }, { "epoch": 4.69, "grad_norm": 0.6188161373138428, "learning_rate": 3.3062242882712724e-05, "loss": 0.0186, "step": 280 }, { "epoch": 4.85, "grad_norm": 0.3787616789340973, "learning_rate": 3.200459988869111e-05, "loss": 0.0147, "step": 290 }, { "epoch": 5.02, "grad_norm": 0.27198734879493713, "learning_rate": 3.093316749677788e-05, "loss": 0.0117, "step": 300 }, { "epoch": 5.19, "grad_norm": 0.38122686743736267, "learning_rate": 2.985005495058446e-05, "loss": 0.0099, "step": 310 }, { "epoch": 5.36, "grad_norm": 0.4904286861419678, "learning_rate": 2.875739448751176e-05, "loss": 0.013, "step": 320 }, { "epoch": 5.52, "grad_norm": 5.21450662612915, "learning_rate": 2.7657337141184138e-05, "loss": 0.0178, "step": 330 }, { "epoch": 5.69, "grad_norm": 0.09181027114391327, "learning_rate": 2.655204850688085e-05, "loss": 0.006, "step": 340 }, { "epoch": 5.86, "grad_norm": 0.7764429450035095, "learning_rate": 2.5443704478301154e-05, "loss": 0.0132, "step": 350 }, { "epoch": 6.03, "grad_norm": 0.3942791819572449, "learning_rate": 2.433448696405563e-05, "loss": 0.0086, "step": 360 }, { "epoch": 6.19, "grad_norm": 0.32931485772132874, "learning_rate": 2.3226579592316538e-05, "loss": 0.0073, "step": 370 }, { "epoch": 6.36, "grad_norm": 0.0025393522810190916, "learning_rate": 2.2122163412082927e-05, "loss": 0.0068, "step": 380 }, { "epoch": 6.53, "grad_norm": 0.17522938549518585, "learning_rate": 2.1023412599523204e-05, "loss": 0.0055, "step": 390 }, { "epoch": 6.69, "grad_norm": 0.07497023046016693, "learning_rate": 1.993249017784766e-05, "loss": 0.0104, "step": 400 }, { "epoch": 6.86, "grad_norm": 0.34479033946990967, "learning_rate": 1.8851543759137007e-05, "loss": 0.0086, "step": 410 }, { "epoch": 7.03, "grad_norm": 0.4811187982559204, "learning_rate": 1.778270131650948e-05, "loss": 0.0077, "step": 420 }, { "epoch": 7.2, "grad_norm": 0.33907610177993774, "learning_rate": 1.672806699494966e-05, "loss": 0.0028, "step": 430 }, { "epoch": 7.36, "grad_norm": 0.12760841846466064, "learning_rate": 1.5689716969045848e-05, "loss": 0.0049, "step": 440 }, { "epoch": 7.53, "grad_norm": 0.10119038820266724, "learning_rate": 1.4669695355790552e-05, "loss": 0.0041, "step": 450 }, { "epoch": 7.7, "grad_norm": 0.1975838541984558, "learning_rate": 1.3670010190490073e-05, "loss": 0.0051, "step": 460 }, { "epoch": 7.87, "grad_norm": 0.0027128455694764853, "learning_rate": 1.2692629473705453e-05, "loss": 0.0062, "step": 470 }, { "epoch": 8.03, "grad_norm": 0.45006152987480164, "learning_rate": 1.173947729700644e-05, "loss": 0.0036, "step": 480 }, { "epoch": 8.2, "grad_norm": 0.0031273181084543467, "learning_rate": 1.081243005516571e-05, "loss": 0.0027, "step": 490 }, { "epoch": 8.37, "grad_norm": 0.11013814806938171, "learning_rate": 9.913312752249903e-06, "loss": 0.0025, "step": 500 }, { "epoch": 8.54, "grad_norm": 0.16879281401634216, "learning_rate": 9.043895408879505e-06, "loss": 0.0029, "step": 510 }, { "epoch": 8.7, "grad_norm": 0.4711211323738098, "learning_rate": 8.20588957773018e-06, "loss": 0.0026, "step": 520 }, { "epoch": 8.87, "grad_norm": 0.4484475553035736, "learning_rate": 7.400944974135427e-06, "loss": 0.0029, "step": 530 }, { "epoch": 9.04, "grad_norm": 0.0020081661641597748, "learning_rate": 6.6306462284233234e-06, "loss": 0.0022, "step": 540 }, { "epoch": 9.21, "grad_norm": 0.051797155290842056, "learning_rate": 5.896509766381028e-06, "loss": 0.0011, "step": 550 }, { "epoch": 9.37, "grad_norm": 0.05627438426017761, "learning_rate": 5.199980823988157e-06, "loss": 0.0016, "step": 560 }, { "epoch": 9.54, "grad_norm": 0.054639093577861786, "learning_rate": 4.542430602295774e-06, "loss": 0.0012, "step": 570 }, { "epoch": 9.71, "grad_norm": 0.09268685430288315, "learning_rate": 3.925153568052123e-06, "loss": 0.0011, "step": 580 }, { "epoch": 9.87, "grad_norm": 0.3072189688682556, "learning_rate": 3.3493649053890326e-06, "loss": 0.0018, "step": 590 }, { "epoch": 10.04, "grad_norm": 0.002750764600932598, "learning_rate": 2.8161981235857143e-06, "loss": 0.001, "step": 600 }, { "epoch": 10.21, "grad_norm": 0.14409799873828888, "learning_rate": 2.3267028256193036e-06, "loss": 0.0011, "step": 610 }, { "epoch": 10.38, "grad_norm": 0.0033888304606080055, "learning_rate": 1.881842641895104e-06, "loss": 0.0014, "step": 620 }, { "epoch": 10.54, "grad_norm": 0.05172213539481163, "learning_rate": 1.4824933332241692e-06, "loss": 0.0004, "step": 630 }, { "epoch": 10.71, "grad_norm": 0.26695069670677185, "learning_rate": 1.129441066782702e-06, "loss": 0.001, "step": 640 }, { "epoch": 10.88, "grad_norm": 0.06269415467977524, "learning_rate": 8.233808684473959e-07, "loss": 0.0008, "step": 650 }, { "epoch": 11.05, "grad_norm": 0.1800147444009781, "learning_rate": 5.649152545533332e-07, "loss": 0.0013, "step": 660 }, { "epoch": 11.21, "grad_norm": 0.033635422587394714, "learning_rate": 3.5455304576806347e-07, "loss": 0.0008, "step": 670 }, { "epoch": 11.38, "grad_norm": 0.011946323327720165, "learning_rate": 1.927083654168854e-07, "loss": 0.0004, "step": 680 }, { "epoch": 11.55, "grad_norm": 0.1268051713705063, "learning_rate": 7.969982423124689e-08, "loss": 0.0007, "step": 690 }, { "epoch": 11.72, "grad_norm": 0.20600733160972595, "learning_rate": 1.5749893125160954e-08, "loss": 0.0011, "step": 700 }, { "epoch": 11.85, "step": 708, "total_flos": 2.9001892500996096e+16, "train_loss": 0.05958240834198955, "train_runtime": 2189.2208, "train_samples_per_second": 2.62, "train_steps_per_second": 0.323 } ], "logging_steps": 10, "max_steps": 708, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 1000, "total_flos": 2.9001892500996096e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }