{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989550679205852, "eval_steps": 500, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020898641588296763, "grad_norm": 216.79754638671875, "learning_rate": 6.2499999999999995e-06, "loss": 57.9838, "step": 1 }, { "epoch": 0.01044932079414838, "grad_norm": 184.4412841796875, "learning_rate": 3.125e-05, "loss": 60.093, "step": 5 }, { "epoch": 0.02089864158829676, "grad_norm": 107.91060638427734, "learning_rate": 6.25e-05, "loss": 48.3094, "step": 10 }, { "epoch": 0.03134796238244514, "grad_norm": 17.1436710357666, "learning_rate": 9.374999999999999e-05, "loss": 33.2668, "step": 15 }, { "epoch": 0.04179728317659352, "grad_norm": 12.335116386413574, "learning_rate": 0.000125, "loss": 27.698, "step": 20 }, { "epoch": 0.0522466039707419, "grad_norm": 6.2943196296691895, "learning_rate": 0.00015625, "loss": 25.9692, "step": 25 }, { "epoch": 0.06269592476489028, "grad_norm": 5.466517448425293, "learning_rate": 0.00018749999999999998, "loss": 25.2691, "step": 30 }, { "epoch": 0.07314524555903866, "grad_norm": 9.744288444519043, "learning_rate": 0.00021874999999999998, "loss": 23.7082, "step": 35 }, { "epoch": 0.08359456635318704, "grad_norm": 19.27219581604004, "learning_rate": 0.00025, "loss": 21.3655, "step": 40 }, { "epoch": 0.09404388714733543, "grad_norm": 41.77222442626953, "learning_rate": 0.00028125, "loss": 16.1707, "step": 45 }, { "epoch": 0.1044932079414838, "grad_norm": 18.60293960571289, "learning_rate": 0.0002999839868651235, "loss": 8.0969, "step": 50 }, { "epoch": 0.11494252873563218, "grad_norm": 11.452897071838379, "learning_rate": 0.00029980387835984494, "loss": 4.1367, "step": 55 }, { "epoch": 0.12539184952978055, "grad_norm": 8.422245979309082, "learning_rate": 0.000299423886051382, "loss": 3.1254, "step": 60 }, { "epoch": 0.13584117032392895, "grad_norm": 2.444629669189453, "learning_rate": 0.0002988445169647103, "loss": 2.4463, "step": 65 }, { "epoch": 0.14629049111807732, "grad_norm": 1.307098627090454, "learning_rate": 0.0002980665441538907, "loss": 2.1685, "step": 70 }, { "epoch": 0.15673981191222572, "grad_norm": 2.10964298248291, "learning_rate": 0.0002970910056705806, "loss": 2.0392, "step": 75 }, { "epoch": 0.1671891327063741, "grad_norm": 1.1905853748321533, "learning_rate": 0.0002959192031789579, "loss": 1.9225, "step": 80 }, { "epoch": 0.17763845350052246, "grad_norm": 0.8916841745376587, "learning_rate": 0.0002945527002189068, "loss": 1.8422, "step": 85 }, { "epoch": 0.18808777429467086, "grad_norm": 3.186051845550537, "learning_rate": 0.00029299332011978107, "loss": 1.748, "step": 90 }, { "epoch": 0.19853709508881923, "grad_norm": 3.865817070007324, "learning_rate": 0.00029124314356752967, "loss": 1.7184, "step": 95 }, { "epoch": 0.2089864158829676, "grad_norm": 2.8790738582611084, "learning_rate": 0.0002893045058284311, "loss": 1.6432, "step": 100 }, { "epoch": 0.219435736677116, "grad_norm": 1.6771491765975952, "learning_rate": 0.00028717999363313967, "loss": 1.6567, "step": 105 }, { "epoch": 0.22988505747126436, "grad_norm": 2.725285530090332, "learning_rate": 0.00028487244172520246, "loss": 1.6157, "step": 110 }, { "epoch": 0.24033437826541273, "grad_norm": 2.289280652999878, "learning_rate": 0.0002823849290786517, "loss": 1.6148, "step": 115 }, { "epoch": 0.2507836990595611, "grad_norm": 2.0211188793182373, "learning_rate": 0.0002797207747897198, "loss": 1.5858, "step": 120 }, { "epoch": 0.2612330198537095, "grad_norm": 2.0264103412628174, "learning_rate": 0.00027688353364815834, "loss": 1.5708, "step": 125 }, { "epoch": 0.2716823406478579, "grad_norm": 0.9253348112106323, "learning_rate": 0.0002738769913940706, "loss": 1.5481, "step": 130 }, { "epoch": 0.28213166144200624, "grad_norm": 3.3143184185028076, "learning_rate": 0.00027070515966658604, "loss": 1.5535, "step": 135 }, { "epoch": 0.29258098223615464, "grad_norm": 4.024845600128174, "learning_rate": 0.0002673722706511174, "loss": 1.5542, "step": 140 }, { "epoch": 0.30303030303030304, "grad_norm": 3.718261241912842, "learning_rate": 0.00026388277143234146, "loss": 1.5507, "step": 145 }, { "epoch": 0.31347962382445144, "grad_norm": 1.9526076316833496, "learning_rate": 0.0002602413180604401, "loss": 1.5251, "step": 150 }, { "epoch": 0.3239289446185998, "grad_norm": 1.5725075006484985, "learning_rate": 0.00025645276933851667, "loss": 1.4937, "step": 155 }, { "epoch": 0.3343782654127482, "grad_norm": 4.266882419586182, "learning_rate": 0.00025252218033947993, "loss": 1.4944, "step": 160 }, { "epoch": 0.3448275862068966, "grad_norm": 2.6647915840148926, "learning_rate": 0.0002484547956610429, "loss": 1.4798, "step": 165 }, { "epoch": 0.3552769070010449, "grad_norm": 2.0770153999328613, "learning_rate": 0.0002442560424278399, "loss": 1.4708, "step": 170 }, { "epoch": 0.3657262277951933, "grad_norm": 1.8132774829864502, "learning_rate": 0.00023993152304999582, "loss": 1.4554, "step": 175 }, { "epoch": 0.3761755485893417, "grad_norm": 1.9493850469589233, "learning_rate": 0.00023548700774781242, "loss": 1.485, "step": 180 }, { "epoch": 0.38662486938349006, "grad_norm": 3.6726951599121094, "learning_rate": 0.00023092842685254442, "loss": 1.4584, "step": 185 }, { "epoch": 0.39707419017763845, "grad_norm": 2.253319501876831, "learning_rate": 0.00022626186289353913, "loss": 1.4569, "step": 190 }, { "epoch": 0.40752351097178685, "grad_norm": 3.336820125579834, "learning_rate": 0.00022149354248229784, "loss": 1.4334, "step": 195 }, { "epoch": 0.4179728317659352, "grad_norm": 3.0895018577575684, "learning_rate": 0.0002166298280042877, "loss": 1.4203, "step": 200 }, { "epoch": 0.4284221525600836, "grad_norm": 1.8486225605010986, "learning_rate": 0.00021167720912959004, "loss": 1.414, "step": 205 }, { "epoch": 0.438871473354232, "grad_norm": 0.7216203808784485, "learning_rate": 0.00020664229415371266, "loss": 1.3897, "step": 210 }, { "epoch": 0.44932079414838033, "grad_norm": 2.909454107284546, "learning_rate": 0.0002015318011801192, "loss": 1.3713, "step": 215 }, { "epoch": 0.45977011494252873, "grad_norm": 1.5531753301620483, "learning_rate": 0.0001963525491562421, "loss": 1.4055, "step": 220 }, { "epoch": 0.4702194357366771, "grad_norm": 4.848015308380127, "learning_rate": 0.00019111144877493873, "loss": 1.435, "step": 225 }, { "epoch": 0.48066875653082547, "grad_norm": 4.833097457885742, "learning_rate": 0.00018581549325353126, "loss": 1.417, "step": 230 }, { "epoch": 0.49111807732497387, "grad_norm": 1.415703296661377, "learning_rate": 0.00018047174900273435, "loss": 1.4449, "step": 235 }, { "epoch": 0.5015673981191222, "grad_norm": 0.9621894359588623, "learning_rate": 0.00017508734619791966, "loss": 1.3907, "step": 240 }, { "epoch": 0.5120167189132706, "grad_norm": 2.091428279876709, "learning_rate": 0.0001696694692653004, "loss": 1.3581, "step": 245 }, { "epoch": 0.522466039707419, "grad_norm": 1.3531287908554077, "learning_rate": 0.00016422534729572738, "loss": 1.3717, "step": 250 }, { "epoch": 0.5329153605015674, "grad_norm": 1.8569897413253784, "learning_rate": 0.0001587622443988899, "loss": 1.3811, "step": 255 }, { "epoch": 0.5433646812957158, "grad_norm": 4.248292446136475, "learning_rate": 0.0001532874500107902, "loss": 1.3797, "step": 260 }, { "epoch": 0.5538140020898642, "grad_norm": 2.5460174083709717, "learning_rate": 0.0001478082691674256, "loss": 1.3576, "step": 265 }, { "epoch": 0.5642633228840125, "grad_norm": 1.3485275506973267, "learning_rate": 0.00014233201275765494, "loss": 1.383, "step": 270 }, { "epoch": 0.5747126436781609, "grad_norm": 1.1686965227127075, "learning_rate": 0.00013686598776825563, "loss": 1.3715, "step": 275 }, { "epoch": 0.5851619644723093, "grad_norm": 1.8593087196350098, "learning_rate": 0.0001314174875341878, "loss": 1.3671, "step": 280 }, { "epoch": 0.5956112852664577, "grad_norm": 1.5989689826965332, "learning_rate": 0.0001259937820070732, "loss": 1.3379, "step": 285 }, { "epoch": 0.6060606060606061, "grad_norm": 3.129467248916626, "learning_rate": 0.00012060210805487529, "loss": 1.3436, "step": 290 }, { "epoch": 0.6165099268547545, "grad_norm": 1.071311593055725, "learning_rate": 0.00011524965980572284, "loss": 1.3711, "step": 295 }, { "epoch": 0.6269592476489029, "grad_norm": 2.8161048889160156, "learning_rate": 0.00010994357904876106, "loss": 1.3242, "step": 300 }, { "epoch": 0.6374085684430512, "grad_norm": 0.9445050954818726, "learning_rate": 0.00010469094570483928, "loss": 1.3217, "step": 305 }, { "epoch": 0.6478578892371996, "grad_norm": 1.53034508228302, "learning_rate": 9.949876837974944e-05, "loss": 1.314, "step": 310 }, { "epoch": 0.658307210031348, "grad_norm": 1.8168761730194092, "learning_rate": 9.437397501262026e-05, "loss": 1.3365, "step": 315 }, { "epoch": 0.6687565308254964, "grad_norm": 1.4955302476882935, "learning_rate": 8.932340363194595e-05, "loss": 1.3154, "step": 320 }, { "epoch": 0.6792058516196448, "grad_norm": 1.2552021741867065, "learning_rate": 8.435379323158218e-05, "loss": 1.3366, "step": 325 }, { "epoch": 0.6896551724137931, "grad_norm": 2.914289712905884, "learning_rate": 7.947177477888472e-05, "loss": 1.3233, "step": 330 }, { "epoch": 0.7001044932079414, "grad_norm": 1.3406000137329102, "learning_rate": 7.46838623669881e-05, "loss": 1.3264, "step": 335 }, { "epoch": 0.7105538140020898, "grad_norm": 0.9025297164916992, "learning_rate": 6.999644452302975e-05, "loss": 1.3197, "step": 340 }, { "epoch": 0.7210031347962382, "grad_norm": 1.2824598550796509, "learning_rate": 6.541577568391758e-05, "loss": 1.3201, "step": 345 }, { "epoch": 0.7314524555903866, "grad_norm": 0.9296241998672485, "learning_rate": 6.0947967851014405e-05, "loss": 1.3097, "step": 350 }, { "epoch": 0.741901776384535, "grad_norm": 0.8738858699798584, "learning_rate": 5.659898243487463e-05, "loss": 1.3044, "step": 355 }, { "epoch": 0.7523510971786834, "grad_norm": 1.8482000827789307, "learning_rate": 5.237462230091467e-05, "loss": 1.3108, "step": 360 }, { "epoch": 0.7628004179728317, "grad_norm": 2.537909746170044, "learning_rate": 4.8280524026630565e-05, "loss": 1.3164, "step": 365 }, { "epoch": 0.7732497387669801, "grad_norm": 1.3068586587905884, "learning_rate": 4.432215038069449e-05, "loss": 1.2782, "step": 370 }, { "epoch": 0.7836990595611285, "grad_norm": 1.3742858171463013, "learning_rate": 4.0504783033964645e-05, "loss": 1.3179, "step": 375 }, { "epoch": 0.7941483803552769, "grad_norm": 1.2923156023025513, "learning_rate": 3.6833515512134606e-05, "loss": 1.2904, "step": 380 }, { "epoch": 0.8045977011494253, "grad_norm": 0.7867398262023926, "learning_rate": 3.331324639942526e-05, "loss": 1.3029, "step": 385 }, { "epoch": 0.8150470219435737, "grad_norm": 1.1442195177078247, "learning_rate": 2.9948672802388135e-05, "loss": 1.3069, "step": 390 }, { "epoch": 0.8254963427377221, "grad_norm": 1.4821033477783203, "learning_rate": 2.67442840825406e-05, "loss": 1.3177, "step": 395 }, { "epoch": 0.8359456635318704, "grad_norm": 0.9633380770683289, "learning_rate": 2.3704355866196373e-05, "loss": 1.3249, "step": 400 }, { "epoch": 0.8463949843260188, "grad_norm": 1.2908155918121338, "learning_rate": 2.083294433948324e-05, "loss": 1.3449, "step": 405 }, { "epoch": 0.8568443051201672, "grad_norm": 1.1834619045257568, "learning_rate": 1.813388083616068e-05, "loss": 1.3086, "step": 410 }, { "epoch": 0.8672936259143156, "grad_norm": 1.1399352550506592, "learning_rate": 1.5610766725458834e-05, "loss": 1.315, "step": 415 }, { "epoch": 0.877742946708464, "grad_norm": 1.2300066947937012, "learning_rate": 1.326696860675981e-05, "loss": 1.2894, "step": 420 }, { "epoch": 0.8881922675026124, "grad_norm": 0.9975532293319702, "learning_rate": 1.1105613817532976e-05, "loss": 1.2953, "step": 425 }, { "epoch": 0.8986415882967607, "grad_norm": 0.9357336163520813, "learning_rate": 9.129586260518634e-06, "loss": 1.3159, "step": 430 }, { "epoch": 0.9090909090909091, "grad_norm": 0.7603440880775452, "learning_rate": 7.34152255572697e-06, "loss": 1.2897, "step": 435 }, { "epoch": 0.9195402298850575, "grad_norm": 0.8711851835250854, "learning_rate": 5.743808522387544e-06, "loss": 1.275, "step": 440 }, { "epoch": 0.9299895506792059, "grad_norm": 0.9144044518470764, "learning_rate": 4.33857599554282e-06, "loss": 1.328, "step": 445 }, { "epoch": 0.9404388714733543, "grad_norm": 0.862479567527771, "learning_rate": 3.1276999815337544e-06, "loss": 1.2879, "step": 450 }, { "epoch": 0.9508881922675027, "grad_norm": 0.7352892756462097, "learning_rate": 2.1127961561727193e-06, "loss": 1.2873, "step": 455 }, { "epoch": 0.9613375130616509, "grad_norm": 2.582821846008301, "learning_rate": 1.2952187089419642e-06, "loss": 1.3191, "step": 460 }, { "epoch": 0.9717868338557993, "grad_norm": 0.7060139179229736, "learning_rate": 6.760585360942872e-07, "loss": 1.3047, "step": 465 }, { "epoch": 0.9822361546499477, "grad_norm": 0.8089200258255005, "learning_rate": 2.5614178506644934e-07, "loss": 1.2743, "step": 470 }, { "epoch": 0.9926854754440961, "grad_norm": 1.2739328145980835, "learning_rate": 3.6028752148081766e-08, "loss": 1.3004, "step": 475 }, { "epoch": 0.9989550679205852, "eval_loss": 1.9203195571899414, "eval_runtime": 0.8302, "eval_samples_per_second": 2.409, "eval_steps_per_second": 1.205, "step": 478 }, { "epoch": 0.9989550679205852, "step": 478, "total_flos": 3.643767570437243e+17, "train_loss": 4.360991338805674, "train_runtime": 2613.4355, "train_samples_per_second": 2.928, "train_steps_per_second": 0.183 } ], "logging_steps": 5, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.643767570437243e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }