{ "best_metric": 0.20909027755260468, "best_model_checkpoint": "/root/pretrain_utg4java_02/checkpoint-91689", "epoch": 40.0, "eval_steps": 500, "global_step": 94040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.21, "grad_norm": 1.1109575033187866, "learning_rate": 9.98e-05, "loss": 0.4973, "step": 500 }, { "epoch": 0.43, "grad_norm": 1.1452946662902832, "learning_rate": 0.0001998, "loss": 0.4793, "step": 1000 }, { "epoch": 0.64, "grad_norm": 0.9271925091743469, "learning_rate": 0.0001989273430782459, "loss": 0.4999, "step": 1500 }, { "epoch": 0.85, "grad_norm": 0.9055812358856201, "learning_rate": 0.0001978525365434222, "loss": 0.4934, "step": 2000 }, { "epoch": 1.0, "eval_loss": 0.4301895797252655, "eval_runtime": 56.7204, "eval_samples_per_second": 331.733, "eval_steps_per_second": 2.592, "step": 2351 }, { "epoch": 1.06, "grad_norm": 0.8988948464393616, "learning_rate": 0.00019677773000859847, "loss": 0.4818, "step": 2500 }, { "epoch": 1.28, "grad_norm": 0.9011508822441101, "learning_rate": 0.00019570292347377474, "loss": 0.4651, "step": 3000 }, { "epoch": 1.49, "grad_norm": 0.905346691608429, "learning_rate": 0.00019463026655202063, "loss": 0.467, "step": 3500 }, { "epoch": 1.7, "grad_norm": 0.7411425709724426, "learning_rate": 0.0001935597592433362, "loss": 0.4625, "step": 4000 }, { "epoch": 1.91, "grad_norm": 0.8985588550567627, "learning_rate": 0.00019248495270851249, "loss": 0.4572, "step": 4500 }, { "epoch": 2.0, "eval_loss": 0.4058227837085724, "eval_runtime": 56.7252, "eval_samples_per_second": 331.705, "eval_steps_per_second": 2.591, "step": 4702 }, { "epoch": 2.13, "grad_norm": 0.827357292175293, "learning_rate": 0.00019141014617368875, "loss": 0.4472, "step": 5000 }, { "epoch": 2.34, "grad_norm": 0.736473023891449, "learning_rate": 0.00019033533963886502, "loss": 0.4372, "step": 5500 }, { "epoch": 2.55, "grad_norm": 0.7558779120445251, "learning_rate": 0.00018926053310404129, "loss": 0.4362, "step": 6000 }, { "epoch": 2.76, "grad_norm": 0.7970576882362366, "learning_rate": 0.00018818572656921755, "loss": 0.4332, "step": 6500 }, { "epoch": 2.98, "grad_norm": 0.7692726850509644, "learning_rate": 0.00018711092003439382, "loss": 0.43, "step": 7000 }, { "epoch": 3.0, "eval_loss": 0.38832882046699524, "eval_runtime": 56.9476, "eval_samples_per_second": 330.409, "eval_steps_per_second": 2.581, "step": 7053 }, { "epoch": 3.19, "grad_norm": 0.7109490633010864, "learning_rate": 0.00018603611349957009, "loss": 0.4217, "step": 7500 }, { "epoch": 3.4, "grad_norm": 0.7125234603881836, "learning_rate": 0.00018496130696474635, "loss": 0.4141, "step": 8000 }, { "epoch": 3.62, "grad_norm": 0.6914758682250977, "learning_rate": 0.00018388650042992262, "loss": 0.4169, "step": 8500 }, { "epoch": 3.83, "grad_norm": 0.7215930223464966, "learning_rate": 0.00018281169389509889, "loss": 0.4132, "step": 9000 }, { "epoch": 4.0, "eval_loss": 0.3813731372356415, "eval_runtime": 56.8812, "eval_samples_per_second": 330.794, "eval_steps_per_second": 2.584, "step": 9404 }, { "epoch": 4.04, "grad_norm": 0.7283121943473816, "learning_rate": 0.00018173688736027515, "loss": 0.4131, "step": 9500 }, { "epoch": 4.25, "grad_norm": 0.6947267055511475, "learning_rate": 0.00018066208082545142, "loss": 0.4059, "step": 10000 }, { "epoch": 4.47, "grad_norm": 0.7544755339622498, "learning_rate": 0.0001795872742906277, "loss": 0.4023, "step": 10500 }, { "epoch": 4.68, "grad_norm": 0.6204919815063477, "learning_rate": 0.00017851246775580398, "loss": 0.4038, "step": 11000 }, { "epoch": 4.89, "grad_norm": 0.7036948800086975, "learning_rate": 0.00017743766122098022, "loss": 0.4017, "step": 11500 }, { "epoch": 5.0, "eval_loss": 0.3638169765472412, "eval_runtime": 56.8241, "eval_samples_per_second": 331.127, "eval_steps_per_second": 2.587, "step": 11755 }, { "epoch": 5.1, "grad_norm": 0.62198805809021, "learning_rate": 0.0001763628546861565, "loss": 0.3921, "step": 12000 }, { "epoch": 5.32, "grad_norm": 0.6095067858695984, "learning_rate": 0.00017528804815133278, "loss": 0.3901, "step": 12500 }, { "epoch": 5.53, "grad_norm": 1.0518782138824463, "learning_rate": 0.00017421324161650902, "loss": 0.3888, "step": 13000 }, { "epoch": 5.74, "grad_norm": 0.6801586151123047, "learning_rate": 0.00017313843508168531, "loss": 0.3859, "step": 13500 }, { "epoch": 5.95, "grad_norm": 0.6489918828010559, "learning_rate": 0.00017206362854686158, "loss": 0.3839, "step": 14000 }, { "epoch": 6.0, "eval_loss": 0.35733696818351746, "eval_runtime": 56.779, "eval_samples_per_second": 331.39, "eval_steps_per_second": 2.589, "step": 14106 }, { "epoch": 6.17, "grad_norm": 0.6174113154411316, "learning_rate": 0.00017098882201203782, "loss": 0.3782, "step": 14500 }, { "epoch": 6.38, "grad_norm": 0.641315221786499, "learning_rate": 0.00016991401547721411, "loss": 0.3792, "step": 15000 }, { "epoch": 6.59, "grad_norm": 0.6454831957817078, "learning_rate": 0.00016883920894239038, "loss": 0.3745, "step": 15500 }, { "epoch": 6.81, "grad_norm": 0.6489464640617371, "learning_rate": 0.00016776440240756665, "loss": 0.3729, "step": 16000 }, { "epoch": 7.0, "eval_loss": 0.3489345610141754, "eval_runtime": 57.0727, "eval_samples_per_second": 329.685, "eval_steps_per_second": 2.576, "step": 16457 }, { "epoch": 7.02, "grad_norm": 0.6929277181625366, "learning_rate": 0.0001666938950988822, "loss": 0.3718, "step": 16500 }, { "epoch": 7.23, "grad_norm": 0.5726625323295593, "learning_rate": 0.00016562123817712813, "loss": 0.3624, "step": 17000 }, { "epoch": 7.44, "grad_norm": 0.603529155254364, "learning_rate": 0.0001645464316423044, "loss": 0.3664, "step": 17500 }, { "epoch": 7.66, "grad_norm": 0.7436888813972473, "learning_rate": 0.00016347162510748066, "loss": 0.3614, "step": 18000 }, { "epoch": 7.87, "grad_norm": 0.6405676007270813, "learning_rate": 0.00016239681857265693, "loss": 0.3646, "step": 18500 }, { "epoch": 8.0, "eval_loss": 0.3384884297847748, "eval_runtime": 56.7342, "eval_samples_per_second": 331.652, "eval_steps_per_second": 2.591, "step": 18808 }, { "epoch": 8.08, "grad_norm": 0.660517156124115, "learning_rate": 0.0001613220120378332, "loss": 0.3623, "step": 19000 }, { "epoch": 8.29, "grad_norm": 0.6352570056915283, "learning_rate": 0.0001602493551160791, "loss": 0.3518, "step": 19500 }, { "epoch": 8.51, "grad_norm": 0.6094992756843567, "learning_rate": 0.0001591745485812554, "loss": 0.3569, "step": 20000 }, { "epoch": 8.72, "grad_norm": 0.6646167635917664, "learning_rate": 0.00015809974204643166, "loss": 0.3567, "step": 20500 }, { "epoch": 8.93, "grad_norm": 0.6240518093109131, "learning_rate": 0.00015702493551160792, "loss": 0.353, "step": 21000 }, { "epoch": 9.0, "eval_loss": 0.3339434862136841, "eval_runtime": 56.979, "eval_samples_per_second": 330.227, "eval_steps_per_second": 2.58, "step": 21159 }, { "epoch": 9.15, "grad_norm": 0.6990819573402405, "learning_rate": 0.0001559501289767842, "loss": 0.3491, "step": 21500 }, { "epoch": 9.36, "grad_norm": 0.6751831769943237, "learning_rate": 0.00015487532244196046, "loss": 0.346, "step": 22000 }, { "epoch": 9.57, "grad_norm": 0.615511417388916, "learning_rate": 0.00015380051590713672, "loss": 0.3488, "step": 22500 }, { "epoch": 9.78, "grad_norm": 0.5277218222618103, "learning_rate": 0.000152725709372313, "loss": 0.3466, "step": 23000 }, { "epoch": 10.0, "grad_norm": 0.5809108018875122, "learning_rate": 0.00015165090283748926, "loss": 0.3441, "step": 23500 }, { "epoch": 10.0, "eval_loss": 0.324989378452301, "eval_runtime": 56.9322, "eval_samples_per_second": 330.499, "eval_steps_per_second": 2.582, "step": 23510 }, { "epoch": 10.21, "grad_norm": 0.5285485982894897, "learning_rate": 0.00015058039552880484, "loss": 0.3407, "step": 24000 }, { "epoch": 10.42, "grad_norm": 0.6268051862716675, "learning_rate": 0.00014950558899398108, "loss": 0.3389, "step": 24500 }, { "epoch": 10.63, "grad_norm": 0.5879684090614319, "learning_rate": 0.00014843078245915735, "loss": 0.3387, "step": 25000 }, { "epoch": 10.85, "grad_norm": 0.5434576869010925, "learning_rate": 0.00014735597592433364, "loss": 0.3379, "step": 25500 }, { "epoch": 11.0, "eval_loss": 0.31874018907546997, "eval_runtime": 57.0459, "eval_samples_per_second": 329.84, "eval_steps_per_second": 2.577, "step": 25861 }, { "epoch": 11.06, "grad_norm": 0.6547732949256897, "learning_rate": 0.00014628116938950988, "loss": 0.3369, "step": 26000 }, { "epoch": 11.27, "grad_norm": 0.648601233959198, "learning_rate": 0.00014520636285468615, "loss": 0.3308, "step": 26500 }, { "epoch": 11.48, "grad_norm": 0.6311826109886169, "learning_rate": 0.00014413155631986244, "loss": 0.3298, "step": 27000 }, { "epoch": 11.7, "grad_norm": 0.5571895837783813, "learning_rate": 0.00014305674978503868, "loss": 0.3292, "step": 27500 }, { "epoch": 11.91, "grad_norm": 0.5373964309692383, "learning_rate": 0.00014198194325021498, "loss": 0.3284, "step": 28000 }, { "epoch": 12.0, "eval_loss": 0.3123379051685333, "eval_runtime": 56.7858, "eval_samples_per_second": 331.351, "eval_steps_per_second": 2.589, "step": 28212 }, { "epoch": 12.12, "grad_norm": 0.5307437777519226, "learning_rate": 0.00014090928632846088, "loss": 0.3247, "step": 28500 }, { "epoch": 12.34, "grad_norm": 0.6422705054283142, "learning_rate": 0.00013983447979363714, "loss": 0.3223, "step": 29000 }, { "epoch": 12.55, "grad_norm": 0.6680347323417664, "learning_rate": 0.00013875967325881344, "loss": 0.3223, "step": 29500 }, { "epoch": 12.76, "grad_norm": 0.5851155519485474, "learning_rate": 0.00013768701633705933, "loss": 0.3245, "step": 30000 }, { "epoch": 12.97, "grad_norm": 0.544624924659729, "learning_rate": 0.0001366122098022356, "loss": 0.3252, "step": 30500 }, { "epoch": 13.0, "eval_loss": 0.3076089322566986, "eval_runtime": 57.0752, "eval_samples_per_second": 329.67, "eval_steps_per_second": 2.576, "step": 30563 }, { "epoch": 13.19, "grad_norm": 0.5343388319015503, "learning_rate": 0.00013553740326741187, "loss": 0.3185, "step": 31000 }, { "epoch": 13.4, "grad_norm": 0.6074999570846558, "learning_rate": 0.00013446259673258814, "loss": 0.3191, "step": 31500 }, { "epoch": 13.61, "grad_norm": 0.6202041506767273, "learning_rate": 0.00013338779019776443, "loss": 0.3186, "step": 32000 }, { "epoch": 13.82, "grad_norm": 0.6058717966079712, "learning_rate": 0.00013231298366294067, "loss": 0.3134, "step": 32500 }, { "epoch": 14.0, "eval_loss": 0.3018127679824829, "eval_runtime": 57.0557, "eval_samples_per_second": 329.783, "eval_steps_per_second": 2.576, "step": 32914 }, { "epoch": 14.04, "grad_norm": 0.6439224481582642, "learning_rate": 0.00013123817712811694, "loss": 0.3161, "step": 33000 }, { "epoch": 14.25, "grad_norm": 0.5888046622276306, "learning_rate": 0.00013016337059329323, "loss": 0.3094, "step": 33500 }, { "epoch": 14.46, "grad_norm": 0.6644863486289978, "learning_rate": 0.00012908856405846947, "loss": 0.3092, "step": 34000 }, { "epoch": 14.67, "grad_norm": 0.5649863481521606, "learning_rate": 0.00012801375752364576, "loss": 0.3117, "step": 34500 }, { "epoch": 14.89, "grad_norm": 0.4947267174720764, "learning_rate": 0.00012693895098882203, "loss": 0.3072, "step": 35000 }, { "epoch": 15.0, "eval_loss": 0.29680636525154114, "eval_runtime": 57.044, "eval_samples_per_second": 329.851, "eval_steps_per_second": 2.577, "step": 35265 }, { "epoch": 15.1, "grad_norm": 0.6015808582305908, "learning_rate": 0.00012586414445399827, "loss": 0.3067, "step": 35500 }, { "epoch": 15.31, "grad_norm": 0.6003320217132568, "learning_rate": 0.00012478933791917456, "loss": 0.3027, "step": 36000 }, { "epoch": 15.53, "grad_norm": 0.5643883347511292, "learning_rate": 0.00012371453138435083, "loss": 0.3012, "step": 36500 }, { "epoch": 15.74, "grad_norm": 0.5768193602561951, "learning_rate": 0.00012264187446259673, "loss": 0.303, "step": 37000 }, { "epoch": 15.95, "grad_norm": 0.5599430799484253, "learning_rate": 0.00012156921754084265, "loss": 0.3034, "step": 37500 }, { "epoch": 16.0, "eval_loss": 0.2938060164451599, "eval_runtime": 56.7422, "eval_samples_per_second": 331.605, "eval_steps_per_second": 2.591, "step": 37616 }, { "epoch": 16.16, "grad_norm": 0.6380453705787659, "learning_rate": 0.00012049441100601892, "loss": 0.2987, "step": 38000 }, { "epoch": 16.38, "grad_norm": 0.5312320590019226, "learning_rate": 0.0001194196044711952, "loss": 0.2975, "step": 38500 }, { "epoch": 16.59, "grad_norm": 0.7027547955513, "learning_rate": 0.00011834479793637145, "loss": 0.2934, "step": 39000 }, { "epoch": 16.8, "grad_norm": 0.5144539475440979, "learning_rate": 0.00011726999140154773, "loss": 0.2959, "step": 39500 }, { "epoch": 17.0, "eval_loss": 0.28754693269729614, "eval_runtime": 56.9555, "eval_samples_per_second": 330.363, "eval_steps_per_second": 2.581, "step": 39967 }, { "epoch": 17.01, "grad_norm": 0.5759513974189758, "learning_rate": 0.00011619733447979365, "loss": 0.2997, "step": 40000 }, { "epoch": 17.23, "grad_norm": 0.593640148639679, "learning_rate": 0.00011512467755803955, "loss": 0.294, "step": 40500 }, { "epoch": 17.44, "grad_norm": 0.6821351647377014, "learning_rate": 0.00011404987102321583, "loss": 0.2904, "step": 41000 }, { "epoch": 17.65, "grad_norm": 0.5297681093215942, "learning_rate": 0.0001129750644883921, "loss": 0.2896, "step": 41500 }, { "epoch": 17.86, "grad_norm": 0.5864290595054626, "learning_rate": 0.00011190025795356836, "loss": 0.2884, "step": 42000 }, { "epoch": 18.0, "eval_loss": 0.28015270829200745, "eval_runtime": 56.8879, "eval_samples_per_second": 330.756, "eval_steps_per_second": 2.584, "step": 42318 }, { "epoch": 18.08, "grad_norm": 0.5764068961143494, "learning_rate": 0.00011082545141874463, "loss": 0.2874, "step": 42500 }, { "epoch": 18.29, "grad_norm": 0.6073163747787476, "learning_rate": 0.0001097506448839209, "loss": 0.2828, "step": 43000 }, { "epoch": 18.5, "grad_norm": 0.5691092610359192, "learning_rate": 0.00010867583834909716, "loss": 0.2848, "step": 43500 }, { "epoch": 18.72, "grad_norm": 0.5399264097213745, "learning_rate": 0.00010760103181427344, "loss": 0.2861, "step": 44000 }, { "epoch": 18.93, "grad_norm": 0.6221365928649902, "learning_rate": 0.0001065262252794497, "loss": 0.2839, "step": 44500 }, { "epoch": 19.0, "eval_loss": 0.27618157863616943, "eval_runtime": 56.9107, "eval_samples_per_second": 330.623, "eval_steps_per_second": 2.583, "step": 44669 }, { "epoch": 19.14, "grad_norm": 0.5080223679542542, "learning_rate": 0.00010545141874462596, "loss": 0.2791, "step": 45000 }, { "epoch": 19.35, "grad_norm": 0.567764163017273, "learning_rate": 0.00010437661220980224, "loss": 0.2792, "step": 45500 }, { "epoch": 19.57, "grad_norm": 0.5838685035705566, "learning_rate": 0.00010330180567497851, "loss": 0.2768, "step": 46000 }, { "epoch": 19.78, "grad_norm": 0.5778998136520386, "learning_rate": 0.00010222699914015476, "loss": 0.2776, "step": 46500 }, { "epoch": 19.99, "grad_norm": 0.615337073802948, "learning_rate": 0.0001011543422184007, "loss": 0.2785, "step": 47000 }, { "epoch": 20.0, "eval_loss": 0.2724262773990631, "eval_runtime": 56.852, "eval_samples_per_second": 330.965, "eval_steps_per_second": 2.586, "step": 47020 }, { "epoch": 20.2, "grad_norm": 0.6153652667999268, "learning_rate": 0.00010007953568357695, "loss": 0.2712, "step": 47500 }, { "epoch": 20.42, "grad_norm": 0.6126906275749207, "learning_rate": 9.900472914875323e-05, "loss": 0.2718, "step": 48000 }, { "epoch": 20.63, "grad_norm": 0.5799471139907837, "learning_rate": 9.792992261392949e-05, "loss": 0.2701, "step": 48500 }, { "epoch": 20.84, "grad_norm": 0.5679476261138916, "learning_rate": 9.685511607910577e-05, "loss": 0.2727, "step": 49000 }, { "epoch": 21.0, "eval_loss": 0.26610955595970154, "eval_runtime": 56.7263, "eval_samples_per_second": 331.698, "eval_steps_per_second": 2.591, "step": 49371 }, { "epoch": 21.05, "grad_norm": 0.5923852324485779, "learning_rate": 9.578245915735168e-05, "loss": 0.2694, "step": 49500 }, { "epoch": 21.27, "grad_norm": 0.6184679269790649, "learning_rate": 9.470765262252796e-05, "loss": 0.2649, "step": 50000 }, { "epoch": 21.48, "grad_norm": 0.5939807295799255, "learning_rate": 9.363284608770421e-05, "loss": 0.2697, "step": 50500 }, { "epoch": 21.69, "grad_norm": 0.5925255417823792, "learning_rate": 9.255803955288048e-05, "loss": 0.2652, "step": 51000 }, { "epoch": 21.91, "grad_norm": 0.5666438341140747, "learning_rate": 9.148323301805676e-05, "loss": 0.266, "step": 51500 }, { "epoch": 22.0, "eval_loss": 0.266626238822937, "eval_runtime": 56.6361, "eval_samples_per_second": 332.226, "eval_steps_per_second": 2.596, "step": 51722 }, { "epoch": 22.12, "grad_norm": 0.5490565299987793, "learning_rate": 9.040842648323303e-05, "loss": 0.2613, "step": 52000 }, { "epoch": 22.33, "grad_norm": 0.6534095406532288, "learning_rate": 8.933361994840929e-05, "loss": 0.2592, "step": 52500 }, { "epoch": 22.54, "grad_norm": 0.568953812122345, "learning_rate": 8.82609630266552e-05, "loss": 0.2604, "step": 53000 }, { "epoch": 22.76, "grad_norm": 0.5968701243400574, "learning_rate": 8.718615649183147e-05, "loss": 0.2602, "step": 53500 }, { "epoch": 22.97, "grad_norm": 0.5995833277702332, "learning_rate": 8.61134995700774e-05, "loss": 0.2614, "step": 54000 }, { "epoch": 23.0, "eval_loss": 0.26461803913116455, "eval_runtime": 56.8736, "eval_samples_per_second": 330.839, "eval_steps_per_second": 2.585, "step": 54073 }, { "epoch": 23.18, "grad_norm": 0.577324390411377, "learning_rate": 8.503869303525366e-05, "loss": 0.2502, "step": 54500 }, { "epoch": 23.39, "grad_norm": 0.631363034248352, "learning_rate": 8.396603611349956e-05, "loss": 0.256, "step": 55000 }, { "epoch": 23.61, "grad_norm": 0.5862709879875183, "learning_rate": 8.289122957867584e-05, "loss": 0.2559, "step": 55500 }, { "epoch": 23.82, "grad_norm": 0.5620314478874207, "learning_rate": 8.181857265692176e-05, "loss": 0.2549, "step": 56000 }, { "epoch": 24.0, "eval_loss": 0.2593855559825897, "eval_runtime": 57.337, "eval_samples_per_second": 328.165, "eval_steps_per_second": 2.564, "step": 56424 }, { "epoch": 24.03, "grad_norm": 0.5514592528343201, "learning_rate": 8.074376612209804e-05, "loss": 0.2542, "step": 56500 }, { "epoch": 24.25, "grad_norm": 0.6351886987686157, "learning_rate": 7.966895958727429e-05, "loss": 0.2483, "step": 57000 }, { "epoch": 24.46, "grad_norm": 0.6075631380081177, "learning_rate": 7.859415305245056e-05, "loss": 0.2503, "step": 57500 }, { "epoch": 24.67, "grad_norm": 0.5592435002326965, "learning_rate": 7.751934651762684e-05, "loss": 0.249, "step": 58000 }, { "epoch": 24.88, "grad_norm": 0.5757043957710266, "learning_rate": 7.64445399828031e-05, "loss": 0.2497, "step": 58500 }, { "epoch": 25.0, "eval_loss": 0.2535327076911926, "eval_runtime": 56.789, "eval_samples_per_second": 331.332, "eval_steps_per_second": 2.589, "step": 58775 }, { "epoch": 25.1, "grad_norm": 0.5995878577232361, "learning_rate": 7.536973344797937e-05, "loss": 0.245, "step": 59000 }, { "epoch": 25.31, "grad_norm": 0.6077148914337158, "learning_rate": 7.429492691315564e-05, "loss": 0.2461, "step": 59500 }, { "epoch": 25.52, "grad_norm": 0.5592058300971985, "learning_rate": 7.32201203783319e-05, "loss": 0.2433, "step": 60000 }, { "epoch": 25.73, "grad_norm": 0.6243628859519958, "learning_rate": 7.214531384350817e-05, "loss": 0.2444, "step": 60500 }, { "epoch": 25.95, "grad_norm": 0.6848371624946594, "learning_rate": 7.107050730868444e-05, "loss": 0.243, "step": 61000 }, { "epoch": 26.0, "eval_loss": 0.250180721282959, "eval_runtime": 56.9862, "eval_samples_per_second": 330.185, "eval_steps_per_second": 2.58, "step": 61126 }, { "epoch": 26.16, "grad_norm": 0.5177180171012878, "learning_rate": 6.99957007738607e-05, "loss": 0.2382, "step": 61500 }, { "epoch": 26.37, "grad_norm": 0.6144821047782898, "learning_rate": 6.892304385210663e-05, "loss": 0.2349, "step": 62000 }, { "epoch": 26.58, "grad_norm": 0.5862034559249878, "learning_rate": 6.78482373172829e-05, "loss": 0.2385, "step": 62500 }, { "epoch": 26.8, "grad_norm": 0.5799028277397156, "learning_rate": 6.677343078245916e-05, "loss": 0.2383, "step": 63000 }, { "epoch": 27.0, "eval_loss": 0.24638937413692474, "eval_runtime": 56.9276, "eval_samples_per_second": 330.525, "eval_steps_per_second": 2.582, "step": 63477 }, { "epoch": 27.01, "grad_norm": 0.6466034054756165, "learning_rate": 6.569862424763543e-05, "loss": 0.2375, "step": 63500 }, { "epoch": 27.22, "grad_norm": 0.6161571741104126, "learning_rate": 6.46238177128117e-05, "loss": 0.2337, "step": 64000 }, { "epoch": 27.44, "grad_norm": 0.6162907481193542, "learning_rate": 6.354901117798796e-05, "loss": 0.2328, "step": 64500 }, { "epoch": 27.65, "grad_norm": 0.6643475890159607, "learning_rate": 6.247420464316423e-05, "loss": 0.2316, "step": 65000 }, { "epoch": 27.86, "grad_norm": 0.5605382323265076, "learning_rate": 6.13993981083405e-05, "loss": 0.2325, "step": 65500 }, { "epoch": 28.0, "eval_loss": 0.24465161561965942, "eval_runtime": 57.0032, "eval_samples_per_second": 330.087, "eval_steps_per_second": 2.579, "step": 65828 }, { "epoch": 28.07, "grad_norm": 0.6108579635620117, "learning_rate": 6.032459157351676e-05, "loss": 0.231, "step": 66000 }, { "epoch": 28.29, "grad_norm": 0.5990898013114929, "learning_rate": 5.924978503869304e-05, "loss": 0.2269, "step": 66500 }, { "epoch": 28.5, "grad_norm": 0.5171389579772949, "learning_rate": 5.817712811693895e-05, "loss": 0.2281, "step": 67000 }, { "epoch": 28.71, "grad_norm": 0.6478968262672424, "learning_rate": 5.710232158211523e-05, "loss": 0.23, "step": 67500 }, { "epoch": 28.92, "grad_norm": 0.5454473495483398, "learning_rate": 5.602751504729149e-05, "loss": 0.224, "step": 68000 }, { "epoch": 29.0, "eval_loss": 0.23835687339305878, "eval_runtime": 56.7685, "eval_samples_per_second": 331.452, "eval_steps_per_second": 2.589, "step": 68179 }, { "epoch": 29.14, "grad_norm": 0.5909966826438904, "learning_rate": 5.4952708512467756e-05, "loss": 0.2237, "step": 68500 }, { "epoch": 29.35, "grad_norm": 0.7489617466926575, "learning_rate": 5.387790197764403e-05, "loss": 0.2218, "step": 69000 }, { "epoch": 29.56, "grad_norm": 0.5880895853042603, "learning_rate": 5.2803095442820296e-05, "loss": 0.2224, "step": 69500 }, { "epoch": 29.77, "grad_norm": 0.6126420497894287, "learning_rate": 5.1728288907996556e-05, "loss": 0.2235, "step": 70000 }, { "epoch": 29.99, "grad_norm": 0.7013407349586487, "learning_rate": 5.0653482373172836e-05, "loss": 0.2189, "step": 70500 }, { "epoch": 30.0, "eval_loss": 0.23514553904533386, "eval_runtime": 56.6615, "eval_samples_per_second": 332.077, "eval_steps_per_second": 2.594, "step": 70530 }, { "epoch": 30.2, "grad_norm": 0.6700057983398438, "learning_rate": 4.9578675838349096e-05, "loss": 0.2175, "step": 71000 }, { "epoch": 30.41, "grad_norm": 0.5728330016136169, "learning_rate": 4.8506018916595015e-05, "loss": 0.2169, "step": 71500 }, { "epoch": 30.63, "grad_norm": 0.5460176467895508, "learning_rate": 4.743121238177128e-05, "loss": 0.2159, "step": 72000 }, { "epoch": 30.84, "grad_norm": 0.5714312791824341, "learning_rate": 4.63585554600172e-05, "loss": 0.2131, "step": 72500 }, { "epoch": 31.0, "eval_loss": 0.23221717774868011, "eval_runtime": 56.8018, "eval_samples_per_second": 331.257, "eval_steps_per_second": 2.588, "step": 72881 }, { "epoch": 31.05, "grad_norm": 0.5165086984634399, "learning_rate": 4.528374892519347e-05, "loss": 0.2139, "step": 73000 }, { "epoch": 31.26, "grad_norm": 0.6647119522094727, "learning_rate": 4.4208942390369734e-05, "loss": 0.2098, "step": 73500 }, { "epoch": 31.48, "grad_norm": 0.6132521629333496, "learning_rate": 4.3134135855546e-05, "loss": 0.213, "step": 74000 }, { "epoch": 31.69, "grad_norm": 0.5501160025596619, "learning_rate": 4.2059329320722275e-05, "loss": 0.2111, "step": 74500 }, { "epoch": 31.9, "grad_norm": 0.6153804659843445, "learning_rate": 4.098667239896819e-05, "loss": 0.2095, "step": 75000 }, { "epoch": 32.0, "eval_loss": 0.22537025809288025, "eval_runtime": 56.6384, "eval_samples_per_second": 332.213, "eval_steps_per_second": 2.595, "step": 75232 }, { "epoch": 32.11, "grad_norm": 0.5587669014930725, "learning_rate": 3.9911865864144454e-05, "loss": 0.2102, "step": 75500 }, { "epoch": 32.33, "grad_norm": 0.6313532590866089, "learning_rate": 3.883705932932073e-05, "loss": 0.2066, "step": 76000 }, { "epoch": 32.54, "grad_norm": 0.6371473670005798, "learning_rate": 3.7762252794496994e-05, "loss": 0.2067, "step": 76500 }, { "epoch": 32.75, "grad_norm": 0.6398050785064697, "learning_rate": 3.668744625967326e-05, "loss": 0.2049, "step": 77000 }, { "epoch": 32.96, "grad_norm": 0.6136685013771057, "learning_rate": 3.561263972484953e-05, "loss": 0.2072, "step": 77500 }, { "epoch": 33.0, "eval_loss": 0.22324109077453613, "eval_runtime": 56.9404, "eval_samples_per_second": 330.451, "eval_steps_per_second": 2.582, "step": 77583 }, { "epoch": 33.18, "grad_norm": 0.652153730392456, "learning_rate": 3.45378331900258e-05, "loss": 0.2024, "step": 78000 }, { "epoch": 33.39, "grad_norm": 0.5828467011451721, "learning_rate": 3.346302665520207e-05, "loss": 0.2008, "step": 78500 }, { "epoch": 33.6, "grad_norm": 0.5484930872917175, "learning_rate": 3.239036973344798e-05, "loss": 0.2032, "step": 79000 }, { "epoch": 33.82, "grad_norm": 0.5094757676124573, "learning_rate": 3.1315563198624247e-05, "loss": 0.2012, "step": 79500 }, { "epoch": 34.0, "eval_loss": 0.22044067084789276, "eval_runtime": 56.7734, "eval_samples_per_second": 331.423, "eval_steps_per_second": 2.589, "step": 79934 }, { "epoch": 34.03, "grad_norm": 0.6236255764961243, "learning_rate": 3.0240756663800517e-05, "loss": 0.1984, "step": 80000 }, { "epoch": 34.24, "grad_norm": 0.6019966006278992, "learning_rate": 2.9165950128976787e-05, "loss": 0.1958, "step": 80500 }, { "epoch": 34.45, "grad_norm": 0.6366199851036072, "learning_rate": 2.8093293207222702e-05, "loss": 0.1988, "step": 81000 }, { "epoch": 34.67, "grad_norm": 0.6756080389022827, "learning_rate": 2.701848667239897e-05, "loss": 0.1975, "step": 81500 }, { "epoch": 34.88, "grad_norm": 0.587624728679657, "learning_rate": 2.594368013757524e-05, "loss": 0.1938, "step": 82000 }, { "epoch": 35.0, "eval_loss": 0.21603567898273468, "eval_runtime": 56.8324, "eval_samples_per_second": 331.079, "eval_steps_per_second": 2.587, "step": 82285 }, { "epoch": 35.09, "grad_norm": 0.5856329798698425, "learning_rate": 2.4868873602751506e-05, "loss": 0.193, "step": 82500 }, { "epoch": 35.3, "grad_norm": 0.6045832633972168, "learning_rate": 2.3794067067927776e-05, "loss": 0.1936, "step": 83000 }, { "epoch": 35.52, "grad_norm": 0.5652228593826294, "learning_rate": 2.2719260533104043e-05, "loss": 0.194, "step": 83500 }, { "epoch": 35.73, "grad_norm": 0.6424867510795593, "learning_rate": 2.1644453998280313e-05, "loss": 0.1896, "step": 84000 }, { "epoch": 35.94, "grad_norm": 0.5543941855430603, "learning_rate": 2.0571797076526225e-05, "loss": 0.1921, "step": 84500 }, { "epoch": 36.0, "eval_loss": 0.21675606071949005, "eval_runtime": 56.9718, "eval_samples_per_second": 330.269, "eval_steps_per_second": 2.58, "step": 84636 }, { "epoch": 36.15, "grad_norm": 0.6019295454025269, "learning_rate": 1.9496990541702495e-05, "loss": 0.1884, "step": 85000 }, { "epoch": 36.37, "grad_norm": 0.645598292350769, "learning_rate": 1.8422184006878762e-05, "loss": 0.1888, "step": 85500 }, { "epoch": 36.58, "grad_norm": 0.6292818188667297, "learning_rate": 1.7347377472055032e-05, "loss": 0.1886, "step": 86000 }, { "epoch": 36.79, "grad_norm": 0.5169576406478882, "learning_rate": 1.62725709372313e-05, "loss": 0.1898, "step": 86500 }, { "epoch": 37.0, "eval_loss": 0.21486429870128632, "eval_runtime": 56.8955, "eval_samples_per_second": 330.711, "eval_steps_per_second": 2.584, "step": 86987 }, { "epoch": 37.01, "grad_norm": 0.680117130279541, "learning_rate": 1.5197764402407566e-05, "loss": 0.187, "step": 87000 }, { "epoch": 37.22, "grad_norm": 0.5357288718223572, "learning_rate": 1.4122957867583836e-05, "loss": 0.1841, "step": 87500 }, { "epoch": 37.43, "grad_norm": 0.5753670334815979, "learning_rate": 1.3048151332760103e-05, "loss": 0.1829, "step": 88000 }, { "epoch": 37.64, "grad_norm": 0.654707133769989, "learning_rate": 1.1975494411006018e-05, "loss": 0.1857, "step": 88500 }, { "epoch": 37.86, "grad_norm": 0.5702338218688965, "learning_rate": 1.0900687876182287e-05, "loss": 0.1829, "step": 89000 }, { "epoch": 38.0, "eval_loss": 0.21089451014995575, "eval_runtime": 56.6335, "eval_samples_per_second": 332.241, "eval_steps_per_second": 2.596, "step": 89338 }, { "epoch": 38.07, "grad_norm": 0.6671079993247986, "learning_rate": 9.825881341358555e-06, "loss": 0.1846, "step": 89500 }, { "epoch": 38.28, "grad_norm": 0.5165778398513794, "learning_rate": 8.751074806534824e-06, "loss": 0.1809, "step": 90000 }, { "epoch": 38.49, "grad_norm": 0.6374748945236206, "learning_rate": 7.676268271711092e-06, "loss": 0.1793, "step": 90500 }, { "epoch": 38.71, "grad_norm": 0.5679296851158142, "learning_rate": 6.60146173688736e-06, "loss": 0.1804, "step": 91000 }, { "epoch": 38.92, "grad_norm": 0.5970684885978699, "learning_rate": 5.526655202063629e-06, "loss": 0.1804, "step": 91500 }, { "epoch": 39.0, "eval_loss": 0.20909027755260468, "eval_runtime": 56.346, "eval_samples_per_second": 333.937, "eval_steps_per_second": 2.609, "step": 91689 }, { "epoch": 39.13, "grad_norm": 0.5444430112838745, "learning_rate": 4.4539982803095445e-06, "loss": 0.1804, "step": 92000 }, { "epoch": 39.34, "grad_norm": 0.6086540818214417, "learning_rate": 3.379191745485813e-06, "loss": 0.1786, "step": 92500 }, { "epoch": 39.56, "grad_norm": 0.5558965802192688, "learning_rate": 2.304385210662081e-06, "loss": 0.1758, "step": 93000 }, { "epoch": 39.77, "grad_norm": 0.5531004667282104, "learning_rate": 1.2295786758383493e-06, "loss": 0.1788, "step": 93500 }, { "epoch": 39.98, "grad_norm": 0.5611669421195984, "learning_rate": 1.5477214101461737e-07, "loss": 0.1773, "step": 94000 }, { "epoch": 40.0, "eval_loss": 0.20976465940475464, "eval_runtime": 56.6405, "eval_samples_per_second": 332.2, "eval_steps_per_second": 2.595, "step": 94040 }, { "epoch": 40.0, "step": 94040, "total_flos": 3.6650496018087936e+18, "train_loss": 0.2880888063569923, "train_runtime": 54838.0441, "train_samples_per_second": 109.795, "train_steps_per_second": 1.715 } ], "logging_steps": 500, "max_steps": 94040, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, "total_flos": 3.6650496018087936e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }