{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1008, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02976190476190476, "grad_norm": 9.853419747991937, "learning_rate": 3.2894736842105264e-07, "loss": 0.8831, "step": 10 }, { "epoch": 0.05952380952380952, "grad_norm": 4.045058363073172, "learning_rate": 6.578947368421053e-07, "loss": 0.7946, "step": 20 }, { "epoch": 0.08928571428571429, "grad_norm": 1.7645796284122057, "learning_rate": 9.86842105263158e-07, "loss": 0.7121, "step": 30 }, { "epoch": 0.11904761904761904, "grad_norm": 1.140830497110258, "learning_rate": 1.3157894736842106e-06, "loss": 0.66, "step": 40 }, { "epoch": 0.1488095238095238, "grad_norm": 1.1358598524344525, "learning_rate": 1.6447368421052635e-06, "loss": 0.6275, "step": 50 }, { "epoch": 0.17857142857142858, "grad_norm": 1.7270256067309684, "learning_rate": 1.973684210526316e-06, "loss": 0.6049, "step": 60 }, { "epoch": 0.20833333333333334, "grad_norm": 2.1582181783133034, "learning_rate": 2.3026315789473684e-06, "loss": 0.585, "step": 70 }, { "epoch": 0.23809523809523808, "grad_norm": 1.9485604694706637, "learning_rate": 2.631578947368421e-06, "loss": 0.5771, "step": 80 }, { "epoch": 0.26785714285714285, "grad_norm": 1.450573301317074, "learning_rate": 2.960526315789474e-06, "loss": 0.5624, "step": 90 }, { "epoch": 0.2976190476190476, "grad_norm": 2.318439909309654, "learning_rate": 3.289473684210527e-06, "loss": 0.5586, "step": 100 }, { "epoch": 0.3273809523809524, "grad_norm": 5.04002159560027, "learning_rate": 3.618421052631579e-06, "loss": 0.5465, "step": 110 }, { "epoch": 0.35714285714285715, "grad_norm": 4.226893330745477, "learning_rate": 3.947368421052632e-06, "loss": 0.5422, "step": 120 }, { "epoch": 0.3869047619047619, "grad_norm": 4.180911282442447, "learning_rate": 4.276315789473684e-06, "loss": 0.5394, "step": 130 }, { "epoch": 0.4166666666666667, "grad_norm": 4.077795241551045, "learning_rate": 4.605263157894737e-06, "loss": 0.5326, "step": 140 }, { "epoch": 0.44642857142857145, "grad_norm": 3.9687123000345506, "learning_rate": 4.9342105263157895e-06, "loss": 0.5322, "step": 150 }, { "epoch": 0.47619047619047616, "grad_norm": 3.014174464468374, "learning_rate": 4.999030264010747e-06, "loss": 0.5272, "step": 160 }, { "epoch": 0.5059523809523809, "grad_norm": 2.0856834626596275, "learning_rate": 4.9950921441328395e-06, "loss": 0.5261, "step": 170 }, { "epoch": 0.5357142857142857, "grad_norm": 2.7059052944218003, "learning_rate": 4.988130331649192e-06, "loss": 0.5217, "step": 180 }, { "epoch": 0.5654761904761905, "grad_norm": 1.6208448485657272, "learning_rate": 4.978154202736626e-06, "loss": 0.5168, "step": 190 }, { "epoch": 0.5952380952380952, "grad_norm": 1.9297119158158096, "learning_rate": 4.965177193256699e-06, "loss": 0.5122, "step": 200 }, { "epoch": 0.625, "grad_norm": 2.0287171057013684, "learning_rate": 4.9492167806602625e-06, "loss": 0.5133, "step": 210 }, { "epoch": 0.6547619047619048, "grad_norm": 2.2552084633650846, "learning_rate": 4.930294460448816e-06, "loss": 0.5087, "step": 220 }, { "epoch": 0.6845238095238095, "grad_norm": 2.2162529540923024, "learning_rate": 4.908435717224345e-06, "loss": 0.504, "step": 230 }, { "epoch": 0.7142857142857143, "grad_norm": 1.4266591985844514, "learning_rate": 4.88366999036662e-06, "loss": 0.4993, "step": 240 }, { "epoch": 0.7440476190476191, "grad_norm": 1.915228524418746, "learning_rate": 4.856030634384218e-06, "loss": 0.4979, "step": 250 }, { "epoch": 0.7738095238095238, "grad_norm": 2.1002785366463375, "learning_rate": 4.825554873992629e-06, "loss": 0.4999, "step": 260 }, { "epoch": 0.8035714285714286, "grad_norm": 1.9708512596204752, "learning_rate": 4.792283753979964e-06, "loss": 0.4917, "step": 270 }, { "epoch": 0.8333333333333334, "grad_norm": 1.7157029682302167, "learning_rate": 4.756262083927795e-06, "loss": 0.4918, "step": 280 }, { "epoch": 0.8630952380952381, "grad_norm": 2.143462386090336, "learning_rate": 4.7175383778615595e-06, "loss": 0.4878, "step": 290 }, { "epoch": 0.8928571428571429, "grad_norm": 1.6613911634138163, "learning_rate": 4.676164788911806e-06, "loss": 0.4866, "step": 300 }, { "epoch": 0.9226190476190477, "grad_norm": 2.0367771816212406, "learning_rate": 4.632197039074318e-06, "loss": 0.4809, "step": 310 }, { "epoch": 0.9523809523809523, "grad_norm": 1.575376900323717, "learning_rate": 4.585694344163653e-06, "loss": 0.4832, "step": 320 }, { "epoch": 0.9821428571428571, "grad_norm": 1.9768132398324354, "learning_rate": 4.536719334061227e-06, "loss": 0.4781, "step": 330 }, { "epoch": 1.0, "eval_loss": 0.059962574392557144, "eval_runtime": 228.8477, "eval_samples_per_second": 79.105, "eval_steps_per_second": 0.621, "step": 336 }, { "epoch": 1.0119047619047619, "grad_norm": 2.34643440784643, "learning_rate": 4.485337968365309e-06, "loss": 0.4622, "step": 340 }, { "epoch": 1.0416666666666667, "grad_norm": 1.6102961139341114, "learning_rate": 4.431619447556573e-06, "loss": 0.4232, "step": 350 }, { "epoch": 1.0714285714285714, "grad_norm": 1.9499238441377658, "learning_rate": 4.375636119798806e-06, "loss": 0.423, "step": 360 }, { "epoch": 1.1011904761904763, "grad_norm": 1.7808321014904356, "learning_rate": 4.317463383500321e-06, "loss": 0.4219, "step": 370 }, { "epoch": 1.130952380952381, "grad_norm": 1.940742296181746, "learning_rate": 4.257179585767301e-06, "loss": 0.4226, "step": 380 }, { "epoch": 1.1607142857142858, "grad_norm": 1.8656621875495936, "learning_rate": 4.194865916885821e-06, "loss": 0.425, "step": 390 }, { "epoch": 1.1904761904761905, "grad_norm": 1.4942061788258214, "learning_rate": 4.130606300974687e-06, "loss": 0.4198, "step": 400 }, { "epoch": 1.2202380952380953, "grad_norm": 1.951098620743733, "learning_rate": 4.064487282956336e-06, "loss": 0.4262, "step": 410 }, { "epoch": 1.25, "grad_norm": 1.9569780029027728, "learning_rate": 3.996597911998038e-06, "loss": 0.4227, "step": 420 }, { "epoch": 1.2797619047619047, "grad_norm": 1.8685853825631114, "learning_rate": 3.9270296215803815e-06, "loss": 0.4206, "step": 430 }, { "epoch": 1.3095238095238095, "grad_norm": 1.7224194040971124, "learning_rate": 3.855876106354553e-06, "loss": 0.4185, "step": 440 }, { "epoch": 1.3392857142857144, "grad_norm": 1.8443268208690127, "learning_rate": 3.7832331959542816e-06, "loss": 0.4187, "step": 450 }, { "epoch": 1.369047619047619, "grad_norm": 1.3756169280889354, "learning_rate": 3.7091987259323816e-06, "loss": 0.4216, "step": 460 }, { "epoch": 1.3988095238095237, "grad_norm": 2.2331014007027528, "learning_rate": 3.633872405995715e-06, "loss": 0.4202, "step": 470 }, { "epoch": 1.4285714285714286, "grad_norm": 3.7569416006059155, "learning_rate": 3.5573556857160563e-06, "loss": 0.4175, "step": 480 }, { "epoch": 1.4583333333333333, "grad_norm": 2.573858640338651, "learning_rate": 3.4797516178976858e-06, "loss": 0.4139, "step": 490 }, { "epoch": 1.4880952380952381, "grad_norm": 1.503996972347894, "learning_rate": 3.4011647197857655e-06, "loss": 0.4151, "step": 500 }, { "epoch": 1.5178571428571428, "grad_norm": 1.8257697563677955, "learning_rate": 3.3217008323023924e-06, "loss": 0.4141, "step": 510 }, { "epoch": 1.5476190476190477, "grad_norm": 1.646549094801715, "learning_rate": 3.241466977499929e-06, "loss": 0.4116, "step": 520 }, { "epoch": 1.5773809523809523, "grad_norm": 1.679698382063382, "learning_rate": 3.160571214423583e-06, "loss": 0.4123, "step": 530 }, { "epoch": 1.6071428571428572, "grad_norm": 2.491128845254276, "learning_rate": 3.0791224935773623e-06, "loss": 0.4108, "step": 540 }, { "epoch": 1.6369047619047619, "grad_norm": 2.1203988379746925, "learning_rate": 2.9972305101894167e-06, "loss": 0.4147, "step": 550 }, { "epoch": 1.6666666666666665, "grad_norm": 1.4047183307353543, "learning_rate": 2.915005556474384e-06, "loss": 0.4096, "step": 560 }, { "epoch": 1.6964285714285714, "grad_norm": 1.739087804086072, "learning_rate": 2.832558373091709e-06, "loss": 0.4114, "step": 570 }, { "epoch": 1.7261904761904763, "grad_norm": 1.4269268648653617, "learning_rate": 2.7500000000000004e-06, "loss": 0.4127, "step": 580 }, { "epoch": 1.755952380952381, "grad_norm": 1.3007410089880698, "learning_rate": 2.6674416269082913e-06, "loss": 0.4083, "step": 590 }, { "epoch": 1.7857142857142856, "grad_norm": 1.4678600812447118, "learning_rate": 2.584994443525617e-06, "loss": 0.4075, "step": 600 }, { "epoch": 1.8154761904761905, "grad_norm": 2.290734461567148, "learning_rate": 2.502769489810584e-06, "loss": 0.4094, "step": 610 }, { "epoch": 1.8452380952380953, "grad_norm": 1.5060589346846796, "learning_rate": 2.4208775064226384e-06, "loss": 0.4061, "step": 620 }, { "epoch": 1.875, "grad_norm": 1.299886717976971, "learning_rate": 2.339428785576417e-06, "loss": 0.4082, "step": 630 }, { "epoch": 1.9047619047619047, "grad_norm": 2.180528134017198, "learning_rate": 2.258533022500071e-06, "loss": 0.4047, "step": 640 }, { "epoch": 1.9345238095238095, "grad_norm": 1.5831471294790294, "learning_rate": 2.1782991676976083e-06, "loss": 0.406, "step": 650 }, { "epoch": 1.9642857142857144, "grad_norm": 1.4465628941001263, "learning_rate": 2.098835280214235e-06, "loss": 0.4059, "step": 660 }, { "epoch": 1.994047619047619, "grad_norm": 1.235988141224481, "learning_rate": 2.020248382102315e-06, "loss": 0.4036, "step": 670 }, { "epoch": 2.0, "eval_loss": 0.05653192102909088, "eval_runtime": 232.7101, "eval_samples_per_second": 77.792, "eval_steps_per_second": 0.61, "step": 672 }, { "epoch": 2.0238095238095237, "grad_norm": 1.5990318568634403, "learning_rate": 1.9426443142839448e-06, "loss": 0.3553, "step": 680 }, { "epoch": 2.0535714285714284, "grad_norm": 1.3112274286588734, "learning_rate": 1.8661275940042853e-06, "loss": 0.3432, "step": 690 }, { "epoch": 2.0833333333333335, "grad_norm": 1.2128906572024227, "learning_rate": 1.7908012740676195e-06, "loss": 0.3409, "step": 700 }, { "epoch": 2.113095238095238, "grad_norm": 1.1302482102468034, "learning_rate": 1.7167668040457187e-06, "loss": 0.3384, "step": 710 }, { "epoch": 2.142857142857143, "grad_norm": 1.2305052793614717, "learning_rate": 1.6441238936454482e-06, "loss": 0.3398, "step": 720 }, { "epoch": 2.1726190476190474, "grad_norm": 1.2857213828075238, "learning_rate": 1.572970378419619e-06, "loss": 0.3399, "step": 730 }, { "epoch": 2.2023809523809526, "grad_norm": 1.066433322606292, "learning_rate": 1.503402088001962e-06, "loss": 0.3388, "step": 740 }, { "epoch": 2.232142857142857, "grad_norm": 1.2630644907839987, "learning_rate": 1.4355127170436645e-06, "loss": 0.3418, "step": 750 }, { "epoch": 2.261904761904762, "grad_norm": 1.280082300687477, "learning_rate": 1.369393699025314e-06, "loss": 0.3425, "step": 760 }, { "epoch": 2.2916666666666665, "grad_norm": 1.3394156515192486, "learning_rate": 1.3051340831141795e-06, "loss": 0.3383, "step": 770 }, { "epoch": 2.3214285714285716, "grad_norm": 1.1409474866684266, "learning_rate": 1.2428204142327e-06, "loss": 0.3398, "step": 780 }, { "epoch": 2.3511904761904763, "grad_norm": 1.0278292145581185, "learning_rate": 1.1825366164996793e-06, "loss": 0.3368, "step": 790 }, { "epoch": 2.380952380952381, "grad_norm": 1.1114664418747453, "learning_rate": 1.1243638802011956e-06, "loss": 0.3395, "step": 800 }, { "epoch": 2.4107142857142856, "grad_norm": 1.0690574034691915, "learning_rate": 1.0683805524434283e-06, "loss": 0.3392, "step": 810 }, { "epoch": 2.4404761904761907, "grad_norm": 1.1211941269328196, "learning_rate": 1.014662031634692e-06, "loss": 0.3368, "step": 820 }, { "epoch": 2.4702380952380953, "grad_norm": 1.118275924178868, "learning_rate": 9.632806659387742e-07, "loss": 0.3392, "step": 830 }, { "epoch": 2.5, "grad_norm": 1.2319030631628767, "learning_rate": 9.143056558363464e-07, "loss": 0.3368, "step": 840 }, { "epoch": 2.5297619047619047, "grad_norm": 0.9990792698056495, "learning_rate": 8.678029609256824e-07, "loss": 0.3392, "step": 850 }, { "epoch": 2.5595238095238093, "grad_norm": 1.0987255069136912, "learning_rate": 8.238352110881944e-07, "loss": 0.3392, "step": 860 }, { "epoch": 2.5892857142857144, "grad_norm": 1.0429750984949957, "learning_rate": 7.824616221384424e-07, "loss": 0.3425, "step": 870 }, { "epoch": 2.619047619047619, "grad_norm": 1.0138139074720265, "learning_rate": 7.437379160722046e-07, "loss": 0.3378, "step": 880 }, { "epoch": 2.6488095238095237, "grad_norm": 1.0338539878119035, "learning_rate": 7.077162460200362e-07, "loss": 0.3389, "step": 890 }, { "epoch": 2.678571428571429, "grad_norm": 1.0616016727820259, "learning_rate": 6.744451260073717e-07, "loss": 0.3373, "step": 900 }, { "epoch": 2.7083333333333335, "grad_norm": 1.0423094245688327, "learning_rate": 6.439693656157822e-07, "loss": 0.3381, "step": 910 }, { "epoch": 2.738095238095238, "grad_norm": 1.039005579526591, "learning_rate": 6.163300096333806e-07, "loss": 0.3402, "step": 920 }, { "epoch": 2.767857142857143, "grad_norm": 1.0254993845509726, "learning_rate": 5.915642827756557e-07, "loss": 0.339, "step": 930 }, { "epoch": 2.7976190476190474, "grad_norm": 1.0846255072616904, "learning_rate": 5.697055395511835e-07, "loss": 0.3368, "step": 940 }, { "epoch": 2.8273809523809526, "grad_norm": 1.0181027005902297, "learning_rate": 5.50783219339738e-07, "loss": 0.3379, "step": 950 }, { "epoch": 2.857142857142857, "grad_norm": 0.9988570535916693, "learning_rate": 5.348228067433014e-07, "loss": 0.3372, "step": 960 }, { "epoch": 2.886904761904762, "grad_norm": 0.9760473236017329, "learning_rate": 5.21845797263374e-07, "loss": 0.3358, "step": 970 }, { "epoch": 2.9166666666666665, "grad_norm": 1.0604235456414122, "learning_rate": 5.118696683508087e-07, "loss": 0.334, "step": 980 }, { "epoch": 2.946428571428571, "grad_norm": 1.085549116610976, "learning_rate": 5.049078558671606e-07, "loss": 0.3383, "step": 990 }, { "epoch": 2.9761904761904763, "grad_norm": 1.0583971598110677, "learning_rate": 5.009697359892537e-07, "loss": 0.3348, "step": 1000 }, { "epoch": 3.0, "eval_loss": 0.05723509192466736, "eval_runtime": 226.8197, "eval_samples_per_second": 79.812, "eval_steps_per_second": 0.626, "step": 1008 }, { "epoch": 3.0, "step": 1008, "total_flos": 3376456327495680.0, "train_loss": 0.43546260041849955, "train_runtime": 33174.1128, "train_samples_per_second": 31.104, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 1008, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3376456327495680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }