{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9973206311402203, "eval_steps": 500, "global_step": 1257, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023816612086930634, "grad_norm": 10.820662608896704, "learning_rate": 5e-06, "loss": 0.7962, "step": 10 }, { "epoch": 0.04763322417386127, "grad_norm": 1.3002290223711226, "learning_rate": 5e-06, "loss": 0.6506, "step": 20 }, { "epoch": 0.0714498362607919, "grad_norm": 2.274364660793754, "learning_rate": 5e-06, "loss": 0.5997, "step": 30 }, { "epoch": 0.09526644834772253, "grad_norm": 0.91227766008184, "learning_rate": 5e-06, "loss": 0.5732, "step": 40 }, { "epoch": 0.11908306043465317, "grad_norm": 0.8965545265977676, "learning_rate": 5e-06, "loss": 0.5522, "step": 50 }, { "epoch": 0.1428996725215838, "grad_norm": 0.7367789232478065, "learning_rate": 5e-06, "loss": 0.5413, "step": 60 }, { "epoch": 0.16671628460851443, "grad_norm": 1.959786214629393, "learning_rate": 5e-06, "loss": 0.5254, "step": 70 }, { "epoch": 0.19053289669544507, "grad_norm": 1.1527597962719172, "learning_rate": 5e-06, "loss": 0.5263, "step": 80 }, { "epoch": 0.21434950878237571, "grad_norm": 0.8809156142645043, "learning_rate": 5e-06, "loss": 0.5139, "step": 90 }, { "epoch": 0.23816612086930633, "grad_norm": 0.6829627408546641, "learning_rate": 5e-06, "loss": 0.5047, "step": 100 }, { "epoch": 0.261982732956237, "grad_norm": 0.5878115863496259, "learning_rate": 5e-06, "loss": 0.4974, "step": 110 }, { "epoch": 0.2857993450431676, "grad_norm": 0.7963943568799848, "learning_rate": 5e-06, "loss": 0.4997, "step": 120 }, { "epoch": 0.30961595713009826, "grad_norm": 0.8168312496362183, "learning_rate": 5e-06, "loss": 0.4909, "step": 130 }, { "epoch": 0.33343256921702885, "grad_norm": 0.5860607753114995, "learning_rate": 5e-06, "loss": 0.4881, "step": 140 }, { "epoch": 0.3572491813039595, "grad_norm": 0.6645019696194568, "learning_rate": 5e-06, "loss": 0.4891, "step": 150 }, { "epoch": 0.38106579339089014, "grad_norm": 1.1349953393684589, "learning_rate": 5e-06, "loss": 0.4886, "step": 160 }, { "epoch": 0.4048824054778208, "grad_norm": 0.842239450444899, "learning_rate": 5e-06, "loss": 0.486, "step": 170 }, { "epoch": 0.42869901756475143, "grad_norm": 0.8029347648800014, "learning_rate": 5e-06, "loss": 0.4794, "step": 180 }, { "epoch": 0.45251562965168207, "grad_norm": 0.7453080201307035, "learning_rate": 5e-06, "loss": 0.4779, "step": 190 }, { "epoch": 0.47633224173861266, "grad_norm": 0.7560106227106046, "learning_rate": 5e-06, "loss": 0.4755, "step": 200 }, { "epoch": 0.5001488538255433, "grad_norm": 0.6803104426449306, "learning_rate": 5e-06, "loss": 0.4738, "step": 210 }, { "epoch": 0.523965465912474, "grad_norm": 0.5983251461773578, "learning_rate": 5e-06, "loss": 0.4765, "step": 220 }, { "epoch": 0.5477820779994046, "grad_norm": 0.9213163769902233, "learning_rate": 5e-06, "loss": 0.466, "step": 230 }, { "epoch": 0.5715986900863352, "grad_norm": 0.7194449280323925, "learning_rate": 5e-06, "loss": 0.4771, "step": 240 }, { "epoch": 0.5954153021732659, "grad_norm": 0.5686308646341993, "learning_rate": 5e-06, "loss": 0.472, "step": 250 }, { "epoch": 0.6192319142601965, "grad_norm": 1.0813640818776593, "learning_rate": 5e-06, "loss": 0.4661, "step": 260 }, { "epoch": 0.6430485263471272, "grad_norm": 0.7358980285755272, "learning_rate": 5e-06, "loss": 0.4659, "step": 270 }, { "epoch": 0.6668651384340577, "grad_norm": 0.6604585093191498, "learning_rate": 5e-06, "loss": 0.4679, "step": 280 }, { "epoch": 0.6906817505209883, "grad_norm": 0.7044805358584626, "learning_rate": 5e-06, "loss": 0.4627, "step": 290 }, { "epoch": 0.714498362607919, "grad_norm": 0.7424204761656356, "learning_rate": 5e-06, "loss": 0.4597, "step": 300 }, { "epoch": 0.7383149746948496, "grad_norm": 0.6489446693829575, "learning_rate": 5e-06, "loss": 0.4596, "step": 310 }, { "epoch": 0.7621315867817803, "grad_norm": 0.7368542344749955, "learning_rate": 5e-06, "loss": 0.4645, "step": 320 }, { "epoch": 0.7859481988687109, "grad_norm": 0.7920469169588614, "learning_rate": 5e-06, "loss": 0.4532, "step": 330 }, { "epoch": 0.8097648109556416, "grad_norm": 0.4836289408504863, "learning_rate": 5e-06, "loss": 0.4546, "step": 340 }, { "epoch": 0.8335814230425722, "grad_norm": 0.4793593414506298, "learning_rate": 5e-06, "loss": 0.455, "step": 350 }, { "epoch": 0.8573980351295029, "grad_norm": 0.6323515901480585, "learning_rate": 5e-06, "loss": 0.4562, "step": 360 }, { "epoch": 0.8812146472164335, "grad_norm": 0.6239832843962634, "learning_rate": 5e-06, "loss": 0.4613, "step": 370 }, { "epoch": 0.9050312593033641, "grad_norm": 0.545469740506038, "learning_rate": 5e-06, "loss": 0.4505, "step": 380 }, { "epoch": 0.9288478713902947, "grad_norm": 0.7122693018169797, "learning_rate": 5e-06, "loss": 0.4548, "step": 390 }, { "epoch": 0.9526644834772253, "grad_norm": 0.7239717244861085, "learning_rate": 5e-06, "loss": 0.4482, "step": 400 }, { "epoch": 0.976481095564156, "grad_norm": 0.8780808883400644, "learning_rate": 5e-06, "loss": 0.4531, "step": 410 }, { "epoch": 0.9979160464423936, "eval_loss": 0.45424818992614746, "eval_runtime": 286.0837, "eval_samples_per_second": 39.551, "eval_steps_per_second": 0.619, "step": 419 }, { "epoch": 1.0020839535576065, "grad_norm": 1.5666613895216177, "learning_rate": 5e-06, "loss": 0.4859, "step": 420 }, { "epoch": 1.025900565644537, "grad_norm": 0.8242103660038579, "learning_rate": 5e-06, "loss": 0.4102, "step": 430 }, { "epoch": 1.0497171777314678, "grad_norm": 0.7452119219276494, "learning_rate": 5e-06, "loss": 0.4092, "step": 440 }, { "epoch": 1.0735337898183983, "grad_norm": 0.6691470793599864, "learning_rate": 5e-06, "loss": 0.4052, "step": 450 }, { "epoch": 1.097350401905329, "grad_norm": 0.7840952133237613, "learning_rate": 5e-06, "loss": 0.4105, "step": 460 }, { "epoch": 1.1211670139922596, "grad_norm": 0.6028706220060683, "learning_rate": 5e-06, "loss": 0.4084, "step": 470 }, { "epoch": 1.1449836260791901, "grad_norm": 0.6295729577533348, "learning_rate": 5e-06, "loss": 0.411, "step": 480 }, { "epoch": 1.1688002381661209, "grad_norm": 0.5614428880763974, "learning_rate": 5e-06, "loss": 0.4035, "step": 490 }, { "epoch": 1.1926168502530514, "grad_norm": 0.7354574620421201, "learning_rate": 5e-06, "loss": 0.4128, "step": 500 }, { "epoch": 1.2164334623399822, "grad_norm": 0.6533242864715311, "learning_rate": 5e-06, "loss": 0.409, "step": 510 }, { "epoch": 1.2402500744269127, "grad_norm": 0.5341268826400728, "learning_rate": 5e-06, "loss": 0.4069, "step": 520 }, { "epoch": 1.2640666865138435, "grad_norm": 0.595790957629028, "learning_rate": 5e-06, "loss": 0.403, "step": 530 }, { "epoch": 1.287883298600774, "grad_norm": 0.5757459346899625, "learning_rate": 5e-06, "loss": 0.4043, "step": 540 }, { "epoch": 1.3116999106877048, "grad_norm": 0.6014475432063849, "learning_rate": 5e-06, "loss": 0.4063, "step": 550 }, { "epoch": 1.3355165227746353, "grad_norm": 0.6816804823593294, "learning_rate": 5e-06, "loss": 0.4054, "step": 560 }, { "epoch": 1.3593331348615658, "grad_norm": 0.8369295577084369, "learning_rate": 5e-06, "loss": 0.4038, "step": 570 }, { "epoch": 1.3831497469484966, "grad_norm": 0.5774046033105074, "learning_rate": 5e-06, "loss": 0.4041, "step": 580 }, { "epoch": 1.4069663590354273, "grad_norm": 0.6188803841652969, "learning_rate": 5e-06, "loss": 0.4064, "step": 590 }, { "epoch": 1.4307829711223579, "grad_norm": 0.6067419957695976, "learning_rate": 5e-06, "loss": 0.3998, "step": 600 }, { "epoch": 1.4545995832092884, "grad_norm": 0.7114287172431103, "learning_rate": 5e-06, "loss": 0.4024, "step": 610 }, { "epoch": 1.4784161952962191, "grad_norm": 0.6126178425690488, "learning_rate": 5e-06, "loss": 0.4026, "step": 620 }, { "epoch": 1.50223280738315, "grad_norm": 0.5281312737413801, "learning_rate": 5e-06, "loss": 0.4027, "step": 630 }, { "epoch": 1.5260494194700804, "grad_norm": 0.7216431853686169, "learning_rate": 5e-06, "loss": 0.4054, "step": 640 }, { "epoch": 1.549866031557011, "grad_norm": 0.6037953128159437, "learning_rate": 5e-06, "loss": 0.402, "step": 650 }, { "epoch": 1.5736826436439415, "grad_norm": 0.7237089947239425, "learning_rate": 5e-06, "loss": 0.3977, "step": 660 }, { "epoch": 1.5974992557308723, "grad_norm": 0.6225585837795767, "learning_rate": 5e-06, "loss": 0.3986, "step": 670 }, { "epoch": 1.621315867817803, "grad_norm": 0.6880389181478974, "learning_rate": 5e-06, "loss": 0.4037, "step": 680 }, { "epoch": 1.6451324799047335, "grad_norm": 0.6033306584926863, "learning_rate": 5e-06, "loss": 0.4022, "step": 690 }, { "epoch": 1.668949091991664, "grad_norm": 0.635681818377172, "learning_rate": 5e-06, "loss": 0.4068, "step": 700 }, { "epoch": 1.6927657040785948, "grad_norm": 0.6212447265932675, "learning_rate": 5e-06, "loss": 0.4003, "step": 710 }, { "epoch": 1.7165823161655256, "grad_norm": 0.8760196584827, "learning_rate": 5e-06, "loss": 0.3978, "step": 720 }, { "epoch": 1.7403989282524561, "grad_norm": 0.6660720739374277, "learning_rate": 5e-06, "loss": 0.4004, "step": 730 }, { "epoch": 1.7642155403393867, "grad_norm": 0.6326799745725241, "learning_rate": 5e-06, "loss": 0.4007, "step": 740 }, { "epoch": 1.7880321524263174, "grad_norm": 0.6030163934725129, "learning_rate": 5e-06, "loss": 0.4011, "step": 750 }, { "epoch": 1.811848764513248, "grad_norm": 0.6024754641341044, "learning_rate": 5e-06, "loss": 0.3941, "step": 760 }, { "epoch": 1.8356653766001787, "grad_norm": 0.5552872150124285, "learning_rate": 5e-06, "loss": 0.4042, "step": 770 }, { "epoch": 1.8594819886871092, "grad_norm": 0.5054694344878702, "learning_rate": 5e-06, "loss": 0.4029, "step": 780 }, { "epoch": 1.8832986007740398, "grad_norm": 0.7189726869611571, "learning_rate": 5e-06, "loss": 0.3992, "step": 790 }, { "epoch": 1.9071152128609705, "grad_norm": 0.5495927800255059, "learning_rate": 5e-06, "loss": 0.3969, "step": 800 }, { "epoch": 1.9309318249479013, "grad_norm": 0.5757953006798125, "learning_rate": 5e-06, "loss": 0.3995, "step": 810 }, { "epoch": 1.9547484370348318, "grad_norm": 0.5635263667269701, "learning_rate": 5e-06, "loss": 0.402, "step": 820 }, { "epoch": 1.9785650491217623, "grad_norm": 0.6248661080671392, "learning_rate": 5e-06, "loss": 0.3953, "step": 830 }, { "epoch": 1.997618338791307, "eval_loss": 0.43693724274635315, "eval_runtime": 286.32, "eval_samples_per_second": 39.519, "eval_steps_per_second": 0.618, "step": 838 }, { "epoch": 2.004167907115213, "grad_norm": 0.8252136402016477, "learning_rate": 5e-06, "loss": 0.4321, "step": 840 }, { "epoch": 2.0279845192021435, "grad_norm": 0.8217614405447083, "learning_rate": 5e-06, "loss": 0.3538, "step": 850 }, { "epoch": 2.051801131289074, "grad_norm": 0.597126610910981, "learning_rate": 5e-06, "loss": 0.353, "step": 860 }, { "epoch": 2.0756177433760046, "grad_norm": 0.5891946978445731, "learning_rate": 5e-06, "loss": 0.3476, "step": 870 }, { "epoch": 2.0994343554629356, "grad_norm": 0.7349012751558448, "learning_rate": 5e-06, "loss": 0.3513, "step": 880 }, { "epoch": 2.123250967549866, "grad_norm": 0.6562744496253617, "learning_rate": 5e-06, "loss": 0.352, "step": 890 }, { "epoch": 2.1470675796367966, "grad_norm": 0.7348610922149896, "learning_rate": 5e-06, "loss": 0.3509, "step": 900 }, { "epoch": 2.170884191723727, "grad_norm": 0.5550207440255053, "learning_rate": 5e-06, "loss": 0.3517, "step": 910 }, { "epoch": 2.194700803810658, "grad_norm": 0.7725877741803382, "learning_rate": 5e-06, "loss": 0.3547, "step": 920 }, { "epoch": 2.2185174158975887, "grad_norm": 0.7254370201802991, "learning_rate": 5e-06, "loss": 0.3513, "step": 930 }, { "epoch": 2.242334027984519, "grad_norm": 0.7099191501450315, "learning_rate": 5e-06, "loss": 0.3534, "step": 940 }, { "epoch": 2.2661506400714497, "grad_norm": 0.6217727164056416, "learning_rate": 5e-06, "loss": 0.3525, "step": 950 }, { "epoch": 2.2899672521583803, "grad_norm": 0.7759463308703756, "learning_rate": 5e-06, "loss": 0.3561, "step": 960 }, { "epoch": 2.3137838642453112, "grad_norm": 0.8023649875600256, "learning_rate": 5e-06, "loss": 0.3574, "step": 970 }, { "epoch": 2.3376004763322418, "grad_norm": 0.6363742954254966, "learning_rate": 5e-06, "loss": 0.3568, "step": 980 }, { "epoch": 2.3614170884191723, "grad_norm": 0.6522324898510586, "learning_rate": 5e-06, "loss": 0.3545, "step": 990 }, { "epoch": 2.385233700506103, "grad_norm": 0.6816643137692279, "learning_rate": 5e-06, "loss": 0.3503, "step": 1000 }, { "epoch": 2.409050312593034, "grad_norm": 0.616355784268503, "learning_rate": 5e-06, "loss": 0.3539, "step": 1010 }, { "epoch": 2.4328669246799643, "grad_norm": 0.8616495256742727, "learning_rate": 5e-06, "loss": 0.3567, "step": 1020 }, { "epoch": 2.456683536766895, "grad_norm": 0.6281577543660147, "learning_rate": 5e-06, "loss": 0.355, "step": 1030 }, { "epoch": 2.4805001488538254, "grad_norm": 0.5605730765386269, "learning_rate": 5e-06, "loss": 0.3551, "step": 1040 }, { "epoch": 2.504316760940756, "grad_norm": 0.6217913711967288, "learning_rate": 5e-06, "loss": 0.3552, "step": 1050 }, { "epoch": 2.528133373027687, "grad_norm": 0.6599048002895368, "learning_rate": 5e-06, "loss": 0.3532, "step": 1060 }, { "epoch": 2.5519499851146175, "grad_norm": 0.7340133449366499, "learning_rate": 5e-06, "loss": 0.3546, "step": 1070 }, { "epoch": 2.575766597201548, "grad_norm": 0.7470486435804491, "learning_rate": 5e-06, "loss": 0.356, "step": 1080 }, { "epoch": 2.599583209288479, "grad_norm": 0.5596330861321896, "learning_rate": 5e-06, "loss": 0.3593, "step": 1090 }, { "epoch": 2.6233998213754095, "grad_norm": 0.655258490698248, "learning_rate": 5e-06, "loss": 0.3574, "step": 1100 }, { "epoch": 2.64721643346234, "grad_norm": 0.6634626122990018, "learning_rate": 5e-06, "loss": 0.3582, "step": 1110 }, { "epoch": 2.6710330455492706, "grad_norm": 0.6875550380429598, "learning_rate": 5e-06, "loss": 0.3567, "step": 1120 }, { "epoch": 2.694849657636201, "grad_norm": 0.6775433440528874, "learning_rate": 5e-06, "loss": 0.3574, "step": 1130 }, { "epoch": 2.7186662697231316, "grad_norm": 0.6304790171059349, "learning_rate": 5e-06, "loss": 0.3547, "step": 1140 }, { "epoch": 2.7424828818100626, "grad_norm": 0.6868875377983221, "learning_rate": 5e-06, "loss": 0.3547, "step": 1150 }, { "epoch": 2.766299493896993, "grad_norm": 0.8782114193263775, "learning_rate": 5e-06, "loss": 0.3586, "step": 1160 }, { "epoch": 2.7901161059839237, "grad_norm": 0.6003211481767972, "learning_rate": 5e-06, "loss": 0.3498, "step": 1170 }, { "epoch": 2.8139327180708547, "grad_norm": 0.6181739427734269, "learning_rate": 5e-06, "loss": 0.3611, "step": 1180 }, { "epoch": 2.837749330157785, "grad_norm": 0.5812515280780504, "learning_rate": 5e-06, "loss": 0.3593, "step": 1190 }, { "epoch": 2.8615659422447157, "grad_norm": 0.6647885239726261, "learning_rate": 5e-06, "loss": 0.357, "step": 1200 }, { "epoch": 2.8853825543316463, "grad_norm": 1.1264669697440912, "learning_rate": 5e-06, "loss": 0.356, "step": 1210 }, { "epoch": 2.909199166418577, "grad_norm": 0.7668811596236461, "learning_rate": 5e-06, "loss": 0.357, "step": 1220 }, { "epoch": 2.9330157785055073, "grad_norm": 0.5719772302769721, "learning_rate": 5e-06, "loss": 0.3531, "step": 1230 }, { "epoch": 2.9568323905924383, "grad_norm": 0.6110386348906062, "learning_rate": 5e-06, "loss": 0.3583, "step": 1240 }, { "epoch": 2.980649002679369, "grad_norm": 0.5843814268513303, "learning_rate": 5e-06, "loss": 0.3539, "step": 1250 }, { "epoch": 2.9973206311402203, "eval_loss": 0.4348444640636444, "eval_runtime": 294.9931, "eval_samples_per_second": 38.357, "eval_steps_per_second": 0.6, "step": 1257 }, { "epoch": 2.9973206311402203, "step": 1257, "total_flos": 2105312437862400.0, "train_loss": 0.41854361285268077, "train_runtime": 41355.0058, "train_samples_per_second": 15.594, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 1257, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2105312437862400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }