{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1134, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026455026455026454, "grad_norm": 7.5692021410659995, "learning_rate": 5e-06, "loss": 1.0155, "step": 10 }, { "epoch": 0.05291005291005291, "grad_norm": 2.6174349537450885, "learning_rate": 5e-06, "loss": 0.9187, "step": 20 }, { "epoch": 0.07936507936507936, "grad_norm": 2.793450551829053, "learning_rate": 5e-06, "loss": 0.8889, "step": 30 }, { "epoch": 0.10582010582010581, "grad_norm": 1.0752828617491306, "learning_rate": 5e-06, "loss": 0.8728, "step": 40 }, { "epoch": 0.13227513227513227, "grad_norm": 0.7890229586623537, "learning_rate": 5e-06, "loss": 0.8553, "step": 50 }, { "epoch": 0.15873015873015872, "grad_norm": 0.8515898940332954, "learning_rate": 5e-06, "loss": 0.8472, "step": 60 }, { "epoch": 0.18518518518518517, "grad_norm": 0.6739899043810001, "learning_rate": 5e-06, "loss": 0.8386, "step": 70 }, { "epoch": 0.21164021164021163, "grad_norm": 0.7472386351456437, "learning_rate": 5e-06, "loss": 0.8303, "step": 80 }, { "epoch": 0.23809523809523808, "grad_norm": 0.8201164599585603, "learning_rate": 5e-06, "loss": 0.8296, "step": 90 }, { "epoch": 0.26455026455026454, "grad_norm": 0.5974248203060147, "learning_rate": 5e-06, "loss": 0.8208, "step": 100 }, { "epoch": 0.291005291005291, "grad_norm": 0.7418126551089665, "learning_rate": 5e-06, "loss": 0.817, "step": 110 }, { "epoch": 0.31746031746031744, "grad_norm": 0.8971788447098652, "learning_rate": 5e-06, "loss": 0.816, "step": 120 }, { "epoch": 0.3439153439153439, "grad_norm": 0.6659038218094219, "learning_rate": 5e-06, "loss": 0.8095, "step": 130 }, { "epoch": 0.37037037037037035, "grad_norm": 0.8687534608310304, "learning_rate": 5e-06, "loss": 0.809, "step": 140 }, { "epoch": 0.3968253968253968, "grad_norm": 0.6997183066642144, "learning_rate": 5e-06, "loss": 0.8116, "step": 150 }, { "epoch": 0.42328042328042326, "grad_norm": 0.6784743741082218, "learning_rate": 5e-06, "loss": 0.8043, "step": 160 }, { "epoch": 0.4497354497354497, "grad_norm": 0.8279542209712195, "learning_rate": 5e-06, "loss": 0.8081, "step": 170 }, { "epoch": 0.47619047619047616, "grad_norm": 0.6448467413029932, "learning_rate": 5e-06, "loss": 0.806, "step": 180 }, { "epoch": 0.5026455026455027, "grad_norm": 0.5414700848521968, "learning_rate": 5e-06, "loss": 0.8021, "step": 190 }, { "epoch": 0.5291005291005291, "grad_norm": 0.6833581882793117, "learning_rate": 5e-06, "loss": 0.7978, "step": 200 }, { "epoch": 0.5555555555555556, "grad_norm": 0.9216374614283346, "learning_rate": 5e-06, "loss": 0.7997, "step": 210 }, { "epoch": 0.582010582010582, "grad_norm": 0.6017660770757399, "learning_rate": 5e-06, "loss": 0.7942, "step": 220 }, { "epoch": 0.6084656084656085, "grad_norm": 0.6412167679844537, "learning_rate": 5e-06, "loss": 0.791, "step": 230 }, { "epoch": 0.6349206349206349, "grad_norm": 0.6772848668775633, "learning_rate": 5e-06, "loss": 0.7948, "step": 240 }, { "epoch": 0.6613756613756614, "grad_norm": 0.600355639535383, "learning_rate": 5e-06, "loss": 0.7914, "step": 250 }, { "epoch": 0.6878306878306878, "grad_norm": 0.5606573561606166, "learning_rate": 5e-06, "loss": 0.7941, "step": 260 }, { "epoch": 0.7142857142857143, "grad_norm": 0.5274683257315145, "learning_rate": 5e-06, "loss": 0.7934, "step": 270 }, { "epoch": 0.7407407407407407, "grad_norm": 0.5268307259184533, "learning_rate": 5e-06, "loss": 0.7938, "step": 280 }, { "epoch": 0.7671957671957672, "grad_norm": 0.7728039906376213, "learning_rate": 5e-06, "loss": 0.7917, "step": 290 }, { "epoch": 0.7936507936507936, "grad_norm": 0.5113816020074538, "learning_rate": 5e-06, "loss": 0.7882, "step": 300 }, { "epoch": 0.8201058201058201, "grad_norm": 0.5190008909833674, "learning_rate": 5e-06, "loss": 0.7849, "step": 310 }, { "epoch": 0.8465608465608465, "grad_norm": 0.5836368027639713, "learning_rate": 5e-06, "loss": 0.7884, "step": 320 }, { "epoch": 0.873015873015873, "grad_norm": 0.7393083910981807, "learning_rate": 5e-06, "loss": 0.7832, "step": 330 }, { "epoch": 0.8994708994708994, "grad_norm": 0.6790621837196994, "learning_rate": 5e-06, "loss": 0.7889, "step": 340 }, { "epoch": 0.9259259259259259, "grad_norm": 0.7554140140474603, "learning_rate": 5e-06, "loss": 0.7915, "step": 350 }, { "epoch": 0.9523809523809523, "grad_norm": 0.5810605450609724, "learning_rate": 5e-06, "loss": 0.7863, "step": 360 }, { "epoch": 0.9788359788359788, "grad_norm": 0.6061224991845839, "learning_rate": 5e-06, "loss": 0.7836, "step": 370 }, { "epoch": 1.0, "eval_loss": 0.7853822112083435, "eval_runtime": 36.7907, "eval_samples_per_second": 276.537, "eval_steps_per_second": 1.087, "step": 378 }, { "epoch": 1.0052910052910053, "grad_norm": 0.8837267831248354, "learning_rate": 5e-06, "loss": 0.7782, "step": 380 }, { "epoch": 1.0317460317460316, "grad_norm": 0.8327957648034358, "learning_rate": 5e-06, "loss": 0.752, "step": 390 }, { "epoch": 1.0582010582010581, "grad_norm": 0.7661996686677544, "learning_rate": 5e-06, "loss": 0.7504, "step": 400 }, { "epoch": 1.0846560846560847, "grad_norm": 0.574247744750553, "learning_rate": 5e-06, "loss": 0.753, "step": 410 }, { "epoch": 1.1111111111111112, "grad_norm": 0.6270570009050412, "learning_rate": 5e-06, "loss": 0.7526, "step": 420 }, { "epoch": 1.1375661375661377, "grad_norm": 0.7564791753598593, "learning_rate": 5e-06, "loss": 0.7499, "step": 430 }, { "epoch": 1.164021164021164, "grad_norm": 0.6019767178166228, "learning_rate": 5e-06, "loss": 0.7534, "step": 440 }, { "epoch": 1.1904761904761905, "grad_norm": 0.7045276656881013, "learning_rate": 5e-06, "loss": 0.7516, "step": 450 }, { "epoch": 1.216931216931217, "grad_norm": 0.7151521174595956, "learning_rate": 5e-06, "loss": 0.7525, "step": 460 }, { "epoch": 1.2433862433862433, "grad_norm": 0.8589885709685776, "learning_rate": 5e-06, "loss": 0.7506, "step": 470 }, { "epoch": 1.2698412698412698, "grad_norm": 0.8448779205925924, "learning_rate": 5e-06, "loss": 0.7529, "step": 480 }, { "epoch": 1.2962962962962963, "grad_norm": 0.698173785850267, "learning_rate": 5e-06, "loss": 0.7523, "step": 490 }, { "epoch": 1.3227513227513228, "grad_norm": 0.5810999760014718, "learning_rate": 5e-06, "loss": 0.7493, "step": 500 }, { "epoch": 1.3492063492063493, "grad_norm": 0.6992760350263512, "learning_rate": 5e-06, "loss": 0.7468, "step": 510 }, { "epoch": 1.3756613756613756, "grad_norm": 0.6741207559975058, "learning_rate": 5e-06, "loss": 0.7498, "step": 520 }, { "epoch": 1.402116402116402, "grad_norm": 0.5507423746197616, "learning_rate": 5e-06, "loss": 0.7523, "step": 530 }, { "epoch": 1.4285714285714286, "grad_norm": 0.6210883025743585, "learning_rate": 5e-06, "loss": 0.7491, "step": 540 }, { "epoch": 1.455026455026455, "grad_norm": 0.5352886813564971, "learning_rate": 5e-06, "loss": 0.7501, "step": 550 }, { "epoch": 1.4814814814814814, "grad_norm": 0.6312193974549839, "learning_rate": 5e-06, "loss": 0.7523, "step": 560 }, { "epoch": 1.507936507936508, "grad_norm": 0.5240777963700096, "learning_rate": 5e-06, "loss": 0.7506, "step": 570 }, { "epoch": 1.5343915343915344, "grad_norm": 0.6713658440893504, "learning_rate": 5e-06, "loss": 0.7498, "step": 580 }, { "epoch": 1.560846560846561, "grad_norm": 0.5581636551389646, "learning_rate": 5e-06, "loss": 0.749, "step": 590 }, { "epoch": 1.5873015873015874, "grad_norm": 0.5510394311493787, "learning_rate": 5e-06, "loss": 0.7507, "step": 600 }, { "epoch": 1.6137566137566137, "grad_norm": 0.7490035349717284, "learning_rate": 5e-06, "loss": 0.751, "step": 610 }, { "epoch": 1.6402116402116402, "grad_norm": 0.7019577853473584, "learning_rate": 5e-06, "loss": 0.7452, "step": 620 }, { "epoch": 1.6666666666666665, "grad_norm": 0.5188517209828246, "learning_rate": 5e-06, "loss": 0.747, "step": 630 }, { "epoch": 1.693121693121693, "grad_norm": 0.5852910158185847, "learning_rate": 5e-06, "loss": 0.7478, "step": 640 }, { "epoch": 1.7195767195767195, "grad_norm": 0.5163509954560109, "learning_rate": 5e-06, "loss": 0.7493, "step": 650 }, { "epoch": 1.746031746031746, "grad_norm": 0.5259629378155002, "learning_rate": 5e-06, "loss": 0.7488, "step": 660 }, { "epoch": 1.7724867724867726, "grad_norm": 0.5847699777528805, "learning_rate": 5e-06, "loss": 0.7484, "step": 670 }, { "epoch": 1.798941798941799, "grad_norm": 0.6909699114073832, "learning_rate": 5e-06, "loss": 0.7425, "step": 680 }, { "epoch": 1.8253968253968254, "grad_norm": 0.5026266608814028, "learning_rate": 5e-06, "loss": 0.7481, "step": 690 }, { "epoch": 1.8518518518518519, "grad_norm": 0.4905863322036327, "learning_rate": 5e-06, "loss": 0.7427, "step": 700 }, { "epoch": 1.8783068783068781, "grad_norm": 0.4940567635795736, "learning_rate": 5e-06, "loss": 0.7468, "step": 710 }, { "epoch": 1.9047619047619047, "grad_norm": 0.5002276189718439, "learning_rate": 5e-06, "loss": 0.7411, "step": 720 }, { "epoch": 1.9312169312169312, "grad_norm": 0.5990930220626486, "learning_rate": 5e-06, "loss": 0.7445, "step": 730 }, { "epoch": 1.9576719576719577, "grad_norm": 0.575362618033918, "learning_rate": 5e-06, "loss": 0.746, "step": 740 }, { "epoch": 1.9841269841269842, "grad_norm": 0.5971817140113224, "learning_rate": 5e-06, "loss": 0.7439, "step": 750 }, { "epoch": 2.0, "eval_loss": 0.7724016904830933, "eval_runtime": 36.4294, "eval_samples_per_second": 279.28, "eval_steps_per_second": 1.098, "step": 756 }, { "epoch": 2.0105820105820107, "grad_norm": 0.7946153532333404, "learning_rate": 5e-06, "loss": 0.7315, "step": 760 }, { "epoch": 2.037037037037037, "grad_norm": 0.6230352829055962, "learning_rate": 5e-06, "loss": 0.7084, "step": 770 }, { "epoch": 2.0634920634920633, "grad_norm": 0.6511688564166924, "learning_rate": 5e-06, "loss": 0.7103, "step": 780 }, { "epoch": 2.0899470899470898, "grad_norm": 0.6162972908978676, "learning_rate": 5e-06, "loss": 0.7106, "step": 790 }, { "epoch": 2.1164021164021163, "grad_norm": 0.5483322240242068, "learning_rate": 5e-06, "loss": 0.7103, "step": 800 }, { "epoch": 2.142857142857143, "grad_norm": 0.5167695346041687, "learning_rate": 5e-06, "loss": 0.7118, "step": 810 }, { "epoch": 2.1693121693121693, "grad_norm": 0.5731147019555838, "learning_rate": 5e-06, "loss": 0.7117, "step": 820 }, { "epoch": 2.195767195767196, "grad_norm": 0.5992340825814363, "learning_rate": 5e-06, "loss": 0.7115, "step": 830 }, { "epoch": 2.2222222222222223, "grad_norm": 0.8204374260544979, "learning_rate": 5e-06, "loss": 0.7116, "step": 840 }, { "epoch": 2.248677248677249, "grad_norm": 0.6452436861380945, "learning_rate": 5e-06, "loss": 0.7132, "step": 850 }, { "epoch": 2.2751322751322753, "grad_norm": 0.7905676999430032, "learning_rate": 5e-06, "loss": 0.7156, "step": 860 }, { "epoch": 2.3015873015873014, "grad_norm": 0.6453047673638057, "learning_rate": 5e-06, "loss": 0.7134, "step": 870 }, { "epoch": 2.328042328042328, "grad_norm": 0.6161389768819914, "learning_rate": 5e-06, "loss": 0.7182, "step": 880 }, { "epoch": 2.3544973544973544, "grad_norm": 0.5925752010088033, "learning_rate": 5e-06, "loss": 0.7152, "step": 890 }, { "epoch": 2.380952380952381, "grad_norm": 0.5465043223339282, "learning_rate": 5e-06, "loss": 0.7164, "step": 900 }, { "epoch": 2.4074074074074074, "grad_norm": 0.5991686328257099, "learning_rate": 5e-06, "loss": 0.7169, "step": 910 }, { "epoch": 2.433862433862434, "grad_norm": 0.5529948466057861, "learning_rate": 5e-06, "loss": 0.7134, "step": 920 }, { "epoch": 2.4603174603174605, "grad_norm": 0.5700704291829093, "learning_rate": 5e-06, "loss": 0.714, "step": 930 }, { "epoch": 2.4867724867724865, "grad_norm": 0.7107367519714186, "learning_rate": 5e-06, "loss": 0.7119, "step": 940 }, { "epoch": 2.5132275132275135, "grad_norm": 0.5145994918143386, "learning_rate": 5e-06, "loss": 0.7164, "step": 950 }, { "epoch": 2.5396825396825395, "grad_norm": 0.8526143229072503, "learning_rate": 5e-06, "loss": 0.7176, "step": 960 }, { "epoch": 2.566137566137566, "grad_norm": 0.6460058195221838, "learning_rate": 5e-06, "loss": 0.718, "step": 970 }, { "epoch": 2.5925925925925926, "grad_norm": 0.5722175187803165, "learning_rate": 5e-06, "loss": 0.7158, "step": 980 }, { "epoch": 2.619047619047619, "grad_norm": 0.6199806409264405, "learning_rate": 5e-06, "loss": 0.7152, "step": 990 }, { "epoch": 2.6455026455026456, "grad_norm": 0.6295374166579548, "learning_rate": 5e-06, "loss": 0.7167, "step": 1000 }, { "epoch": 2.671957671957672, "grad_norm": 0.5562186015106506, "learning_rate": 5e-06, "loss": 0.7143, "step": 1010 }, { "epoch": 2.6984126984126986, "grad_norm": 0.5802340156160444, "learning_rate": 5e-06, "loss": 0.7201, "step": 1020 }, { "epoch": 2.7248677248677247, "grad_norm": 0.5674240362878424, "learning_rate": 5e-06, "loss": 0.7151, "step": 1030 }, { "epoch": 2.751322751322751, "grad_norm": 0.5579240804629998, "learning_rate": 5e-06, "loss": 0.7113, "step": 1040 }, { "epoch": 2.7777777777777777, "grad_norm": 0.5511568456212458, "learning_rate": 5e-06, "loss": 0.7153, "step": 1050 }, { "epoch": 2.804232804232804, "grad_norm": 0.53261750815606, "learning_rate": 5e-06, "loss": 0.7147, "step": 1060 }, { "epoch": 2.8306878306878307, "grad_norm": 0.601473528074606, "learning_rate": 5e-06, "loss": 0.7165, "step": 1070 }, { "epoch": 2.857142857142857, "grad_norm": 0.4922870741419976, "learning_rate": 5e-06, "loss": 0.7147, "step": 1080 }, { "epoch": 2.8835978835978837, "grad_norm": 0.5647388229398058, "learning_rate": 5e-06, "loss": 0.7146, "step": 1090 }, { "epoch": 2.91005291005291, "grad_norm": 0.6422541209526703, "learning_rate": 5e-06, "loss": 0.7175, "step": 1100 }, { "epoch": 2.9365079365079367, "grad_norm": 0.6977559447017284, "learning_rate": 5e-06, "loss": 0.7162, "step": 1110 }, { "epoch": 2.962962962962963, "grad_norm": 0.7941392765996913, "learning_rate": 5e-06, "loss": 0.714, "step": 1120 }, { "epoch": 2.9894179894179893, "grad_norm": 0.6766160989486909, "learning_rate": 5e-06, "loss": 0.7146, "step": 1130 }, { "epoch": 3.0, "eval_loss": 0.7703806757926941, "eval_runtime": 35.7626, "eval_samples_per_second": 284.487, "eval_steps_per_second": 1.118, "step": 1134 }, { "epoch": 3.0, "step": 1134, "total_flos": 1899492236328960.0, "train_loss": 0.7598387925923397, "train_runtime": 7118.4294, "train_samples_per_second": 81.465, "train_steps_per_second": 0.159 } ], "logging_steps": 10, "max_steps": 1134, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1899492236328960.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }