{ "best_metric": 6.4970197677612305, "best_model_checkpoint": "./results/models/checkpoint-50230", "epoch": 10.0, "eval_steps": 500, "global_step": 50230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09954210631096955, "grad_norm": 0.330078125, "learning_rate": 0.003992036631495123, "loss": 6.6471, "step": 500 }, { "epoch": 0.1990842126219391, "grad_norm": 1.3984375, "learning_rate": 0.0039840732629902445, "loss": 6.5906, "step": 1000 }, { "epoch": 0.29862631893290864, "grad_norm": 0.365234375, "learning_rate": 0.003976109894485367, "loss": 6.5877, "step": 1500 }, { "epoch": 0.3981684252438782, "grad_norm": 0.31640625, "learning_rate": 0.00396814652598049, "loss": 6.5756, "step": 2000 }, { "epoch": 0.4977105315548477, "grad_norm": 0.5546875, "learning_rate": 0.003960183157475612, "loss": 6.5466, "step": 2500 }, { "epoch": 0.5972526378658173, "grad_norm": 0.89453125, "learning_rate": 0.003952219788970734, "loss": 6.5571, "step": 3000 }, { "epoch": 0.6967947441767868, "grad_norm": 1.4296875, "learning_rate": 0.003944256420465857, "loss": 6.596, "step": 3500 }, { "epoch": 0.7963368504877564, "grad_norm": 17.375, "learning_rate": 0.003936293051960979, "loss": 6.6171, "step": 4000 }, { "epoch": 0.8958789567987259, "grad_norm": 0.83203125, "learning_rate": 0.003928329683456102, "loss": 6.6177, "step": 4500 }, { "epoch": 0.9954210631096954, "grad_norm": 2.65625, "learning_rate": 0.003920366314951225, "loss": 6.6059, "step": 5000 }, { "epoch": 1.0, "eval_loss": 6.5949907302856445, "eval_runtime": 105.2723, "eval_samples_per_second": 18.998, "eval_steps_per_second": 1.187, "step": 5023 }, { "epoch": 1.094963169420665, "grad_norm": 2.109375, "learning_rate": 0.003912402946446347, "loss": 6.5889, "step": 5500 }, { "epoch": 1.1945052757316346, "grad_norm": 2.15625, "learning_rate": 0.0039044395779414694, "loss": 6.5864, "step": 6000 }, { "epoch": 1.294047382042604, "grad_norm": 4.4375, "learning_rate": 0.003896476209436592, "loss": 6.5705, "step": 6500 }, { "epoch": 1.3935894883535735, "grad_norm": 37.5, "learning_rate": 0.0038885128409317142, "loss": 6.5518, "step": 7000 }, { "epoch": 1.4931315946645431, "grad_norm": 3.046875, "learning_rate": 0.0038805494724268364, "loss": 6.5569, "step": 7500 }, { "epoch": 1.5926737009755128, "grad_norm": 1.296875, "learning_rate": 0.003872586103921959, "loss": 6.5753, "step": 8000 }, { "epoch": 1.6922158072864821, "grad_norm": 2.109375, "learning_rate": 0.0038646227354170812, "loss": 6.5595, "step": 8500 }, { "epoch": 1.7917579135974517, "grad_norm": 2.671875, "learning_rate": 0.003856659366912204, "loss": 6.5684, "step": 9000 }, { "epoch": 1.8913000199084213, "grad_norm": 2.078125, "learning_rate": 0.0038486959984073265, "loss": 6.5734, "step": 9500 }, { "epoch": 1.9908421262193907, "grad_norm": 3.53125, "learning_rate": 0.003840732629902449, "loss": 6.5618, "step": 10000 }, { "epoch": 2.0, "eval_loss": 6.562187671661377, "eval_runtime": 94.6787, "eval_samples_per_second": 21.124, "eval_steps_per_second": 1.32, "step": 10046 }, { "epoch": 2.0903842325303605, "grad_norm": 1.953125, "learning_rate": 0.003832769261397571, "loss": 6.5687, "step": 10500 }, { "epoch": 2.18992633884133, "grad_norm": 2.421875, "learning_rate": 0.0038248058928926935, "loss": 6.5667, "step": 11000 }, { "epoch": 2.2894684451522993, "grad_norm": 1.5234375, "learning_rate": 0.003816842524387816, "loss": 6.5603, "step": 11500 }, { "epoch": 2.389010551463269, "grad_norm": 1.671875, "learning_rate": 0.0038088791558829388, "loss": 6.5622, "step": 12000 }, { "epoch": 2.4885526577742385, "grad_norm": 0.69921875, "learning_rate": 0.003800915787378061, "loss": 6.5542, "step": 12500 }, { "epoch": 2.588094764085208, "grad_norm": 3.203125, "learning_rate": 0.0037929524188731836, "loss": 6.563, "step": 13000 }, { "epoch": 2.6876368703961777, "grad_norm": 1.0859375, "learning_rate": 0.003784989050368306, "loss": 6.5676, "step": 13500 }, { "epoch": 2.787178976707147, "grad_norm": 2.453125, "learning_rate": 0.003777025681863428, "loss": 6.5519, "step": 14000 }, { "epoch": 2.8867210830181165, "grad_norm": 4.9375, "learning_rate": 0.0037690623133585506, "loss": 6.569, "step": 14500 }, { "epoch": 2.9862631893290863, "grad_norm": 1.453125, "learning_rate": 0.003761098944853673, "loss": 6.5753, "step": 15000 }, { "epoch": 3.0, "eval_loss": 6.565845966339111, "eval_runtime": 97.7357, "eval_samples_per_second": 20.463, "eval_steps_per_second": 1.279, "step": 15069 }, { "epoch": 3.0858052956400557, "grad_norm": 4.46875, "learning_rate": 0.003753135576348796, "loss": 6.5751, "step": 15500 }, { "epoch": 3.1853474019510255, "grad_norm": 2.703125, "learning_rate": 0.003745172207843918, "loss": 6.5708, "step": 16000 }, { "epoch": 3.284889508261995, "grad_norm": 2.09375, "learning_rate": 0.0037372088393390402, "loss": 6.5625, "step": 16500 }, { "epoch": 3.3844316145729643, "grad_norm": 1.6484375, "learning_rate": 0.003729245470834163, "loss": 6.5586, "step": 17000 }, { "epoch": 3.483973720883934, "grad_norm": 0.9765625, "learning_rate": 0.0037212821023292855, "loss": 6.5566, "step": 17500 }, { "epoch": 3.5835158271949035, "grad_norm": 4.3125, "learning_rate": 0.0037133187338244077, "loss": 6.5453, "step": 18000 }, { "epoch": 3.683057933505873, "grad_norm": 2.71875, "learning_rate": 0.0037053553653195303, "loss": 6.5485, "step": 18500 }, { "epoch": 3.7826000398168427, "grad_norm": 3.09375, "learning_rate": 0.003697391996814653, "loss": 6.54, "step": 19000 }, { "epoch": 3.882142146127812, "grad_norm": 2.46875, "learning_rate": 0.0036894286283097755, "loss": 6.5386, "step": 19500 }, { "epoch": 3.9816842524387814, "grad_norm": 1.6328125, "learning_rate": 0.0036814652598048973, "loss": 6.5367, "step": 20000 }, { "epoch": 4.0, "eval_loss": 6.537178039550781, "eval_runtime": 103.7321, "eval_samples_per_second": 19.28, "eval_steps_per_second": 1.205, "step": 20092 }, { "epoch": 4.081226358749751, "grad_norm": 1.7421875, "learning_rate": 0.00367350189130002, "loss": 6.5276, "step": 20500 }, { "epoch": 4.180768465060721, "grad_norm": 2.765625, "learning_rate": 0.0036655385227951425, "loss": 6.5268, "step": 21000 }, { "epoch": 4.2803105713716905, "grad_norm": 0.94921875, "learning_rate": 0.0036575751542902647, "loss": 6.5288, "step": 21500 }, { "epoch": 4.37985267768266, "grad_norm": 3.046875, "learning_rate": 0.0036496117857853874, "loss": 6.5404, "step": 22000 }, { "epoch": 4.479394783993629, "grad_norm": 3.140625, "learning_rate": 0.0036416484172805096, "loss": 6.5289, "step": 22500 }, { "epoch": 4.578936890304599, "grad_norm": 2.296875, "learning_rate": 0.003633685048775632, "loss": 6.5337, "step": 23000 }, { "epoch": 4.678478996615569, "grad_norm": 2.140625, "learning_rate": 0.0036257216802707544, "loss": 6.529, "step": 23500 }, { "epoch": 4.778021102926538, "grad_norm": 1.7578125, "learning_rate": 0.003617758311765877, "loss": 6.5412, "step": 24000 }, { "epoch": 4.877563209237508, "grad_norm": 3.0625, "learning_rate": 0.0036097949432609996, "loss": 6.5493, "step": 24500 }, { "epoch": 4.977105315548477, "grad_norm": 2.125, "learning_rate": 0.0036018315747561223, "loss": 6.54, "step": 25000 }, { "epoch": 5.0, "eval_loss": 6.5385332107543945, "eval_runtime": 104.5195, "eval_samples_per_second": 19.135, "eval_steps_per_second": 1.196, "step": 25115 }, { "epoch": 5.076647421859446, "grad_norm": 1.0625, "learning_rate": 0.003593868206251244, "loss": 6.5453, "step": 25500 }, { "epoch": 5.176189528170416, "grad_norm": 2.65625, "learning_rate": 0.0035859048377463666, "loss": 6.543, "step": 26000 }, { "epoch": 5.275731634481385, "grad_norm": 4.0625, "learning_rate": 0.0035779414692414893, "loss": 6.5414, "step": 26500 }, { "epoch": 5.375273740792355, "grad_norm": 0.88671875, "learning_rate": 0.0035699781007366115, "loss": 6.536, "step": 27000 }, { "epoch": 5.474815847103325, "grad_norm": 3.109375, "learning_rate": 0.003562014732231734, "loss": 6.5312, "step": 27500 }, { "epoch": 5.574357953414294, "grad_norm": 3.46875, "learning_rate": 0.0035540513637268567, "loss": 6.5319, "step": 28000 }, { "epoch": 5.673900059725264, "grad_norm": 10.5, "learning_rate": 0.003546087995221979, "loss": 6.5259, "step": 28500 }, { "epoch": 5.773442166036233, "grad_norm": 4.0, "learning_rate": 0.003538124626717101, "loss": 6.5254, "step": 29000 }, { "epoch": 5.872984272347203, "grad_norm": 8.25, "learning_rate": 0.0035301612582122237, "loss": 6.5284, "step": 29500 }, { "epoch": 5.972526378658173, "grad_norm": 2.96875, "learning_rate": 0.0035221978897073463, "loss": 6.5215, "step": 30000 }, { "epoch": 6.0, "eval_loss": 6.523362159729004, "eval_runtime": 95.426, "eval_samples_per_second": 20.959, "eval_steps_per_second": 1.31, "step": 30138 }, { "epoch": 6.072068484969142, "grad_norm": 7.6875, "learning_rate": 0.003514234521202469, "loss": 6.5284, "step": 30500 }, { "epoch": 6.171610591280111, "grad_norm": 5.4375, "learning_rate": 0.003506271152697591, "loss": 6.5254, "step": 31000 }, { "epoch": 6.271152697591081, "grad_norm": 2.953125, "learning_rate": 0.0034983077841927133, "loss": 6.5273, "step": 31500 }, { "epoch": 6.370694803902051, "grad_norm": 3.46875, "learning_rate": 0.003490344415687836, "loss": 6.5383, "step": 32000 }, { "epoch": 6.47023691021302, "grad_norm": 4.78125, "learning_rate": 0.0034823810471829586, "loss": 6.5333, "step": 32500 }, { "epoch": 6.56977901652399, "grad_norm": 2.53125, "learning_rate": 0.003474417678678081, "loss": 6.5317, "step": 33000 }, { "epoch": 6.669321122834959, "grad_norm": 1.6484375, "learning_rate": 0.0034664543101732034, "loss": 6.5267, "step": 33500 }, { "epoch": 6.7688632291459285, "grad_norm": 3.84375, "learning_rate": 0.003458490941668326, "loss": 6.5231, "step": 34000 }, { "epoch": 6.868405335456898, "grad_norm": 2.296875, "learning_rate": 0.003450527573163448, "loss": 6.5232, "step": 34500 }, { "epoch": 6.967947441767868, "grad_norm": 2.421875, "learning_rate": 0.0034425642046585704, "loss": 6.5246, "step": 35000 }, { "epoch": 7.0, "eval_loss": 6.521819114685059, "eval_runtime": 108.6846, "eval_samples_per_second": 18.402, "eval_steps_per_second": 1.15, "step": 35161 }, { "epoch": 7.067489548078838, "grad_norm": 1.4296875, "learning_rate": 0.003434600836153693, "loss": 6.5235, "step": 35500 }, { "epoch": 7.167031654389807, "grad_norm": 2.203125, "learning_rate": 0.0034266374676488157, "loss": 6.5215, "step": 36000 }, { "epoch": 7.266573760700776, "grad_norm": 1.1015625, "learning_rate": 0.003418674099143938, "loss": 6.5195, "step": 36500 }, { "epoch": 7.366115867011746, "grad_norm": 2.703125, "learning_rate": 0.0034107107306390605, "loss": 6.5242, "step": 37000 }, { "epoch": 7.465657973322715, "grad_norm": 2.796875, "learning_rate": 0.0034027473621341827, "loss": 6.5123, "step": 37500 }, { "epoch": 7.565200079633685, "grad_norm": 3.359375, "learning_rate": 0.0033947839936293053, "loss": 6.5117, "step": 38000 }, { "epoch": 7.664742185944655, "grad_norm": 3.046875, "learning_rate": 0.0033868206251244275, "loss": 6.517, "step": 38500 }, { "epoch": 7.764284292255624, "grad_norm": 1.7421875, "learning_rate": 0.00337885725661955, "loss": 6.5092, "step": 39000 }, { "epoch": 7.8638263985665935, "grad_norm": 1.7265625, "learning_rate": 0.0033708938881146728, "loss": 6.5193, "step": 39500 }, { "epoch": 7.963368504877563, "grad_norm": 2.21875, "learning_rate": 0.003362930519609795, "loss": 6.5172, "step": 40000 }, { "epoch": 8.0, "eval_loss": 6.515054702758789, "eval_runtime": 111.1357, "eval_samples_per_second": 17.996, "eval_steps_per_second": 1.125, "step": 40184 }, { "epoch": 8.062910611188533, "grad_norm": 1.46875, "learning_rate": 0.003354967151104917, "loss": 6.5142, "step": 40500 }, { "epoch": 8.162452717499502, "grad_norm": 2.4375, "learning_rate": 0.0033470037826000398, "loss": 6.5132, "step": 41000 }, { "epoch": 8.261994823810472, "grad_norm": 1.984375, "learning_rate": 0.0033390404140951624, "loss": 6.4996, "step": 41500 }, { "epoch": 8.361536930121442, "grad_norm": 2.4375, "learning_rate": 0.0033310770455902846, "loss": 6.5111, "step": 42000 }, { "epoch": 8.46107903643241, "grad_norm": 1.4453125, "learning_rate": 0.003323113677085407, "loss": 6.5004, "step": 42500 }, { "epoch": 8.560621142743381, "grad_norm": 4.0625, "learning_rate": 0.00331515030858053, "loss": 6.51, "step": 43000 }, { "epoch": 8.66016324905435, "grad_norm": 5.8125, "learning_rate": 0.003307186940075652, "loss": 6.5153, "step": 43500 }, { "epoch": 8.75970535536532, "grad_norm": 4.09375, "learning_rate": 0.003299223571570774, "loss": 6.5184, "step": 44000 }, { "epoch": 8.85924746167629, "grad_norm": 1.0625, "learning_rate": 0.003291260203065897, "loss": 6.515, "step": 44500 }, { "epoch": 8.958789567987258, "grad_norm": 1.1953125, "learning_rate": 0.0032832968345610195, "loss": 6.5175, "step": 45000 }, { "epoch": 9.0, "eval_loss": 6.506955623626709, "eval_runtime": 110.8553, "eval_samples_per_second": 18.042, "eval_steps_per_second": 1.128, "step": 45207 }, { "epoch": 9.058331674298229, "grad_norm": 12.875, "learning_rate": 0.003275333466056142, "loss": 6.509, "step": 45500 }, { "epoch": 9.157873780609197, "grad_norm": 3.875, "learning_rate": 0.0032673700975512643, "loss": 6.5169, "step": 46000 }, { "epoch": 9.257415886920167, "grad_norm": 3.234375, "learning_rate": 0.0032594067290463865, "loss": 6.5157, "step": 46500 }, { "epoch": 9.356957993231136, "grad_norm": 2.75, "learning_rate": 0.003251443360541509, "loss": 6.5165, "step": 47000 }, { "epoch": 9.456500099542106, "grad_norm": 1.9609375, "learning_rate": 0.0032434799920366313, "loss": 6.5163, "step": 47500 }, { "epoch": 9.556042205853077, "grad_norm": 7.375, "learning_rate": 0.003235516623531754, "loss": 6.5083, "step": 48000 }, { "epoch": 9.655584312164045, "grad_norm": 1.9921875, "learning_rate": 0.0032275532550268765, "loss": 6.5071, "step": 48500 }, { "epoch": 9.755126418475015, "grad_norm": 1.5078125, "learning_rate": 0.003219589886521999, "loss": 6.5, "step": 49000 }, { "epoch": 9.854668524785984, "grad_norm": 4.40625, "learning_rate": 0.003211626518017121, "loss": 6.5108, "step": 49500 }, { "epoch": 9.954210631096954, "grad_norm": 5.09375, "learning_rate": 0.0032036631495122436, "loss": 6.4953, "step": 50000 }, { "epoch": 10.0, "eval_loss": 6.4970197677612305, "eval_runtime": 113.697, "eval_samples_per_second": 17.591, "eval_steps_per_second": 1.099, "step": 50230 } ], "logging_steps": 500, "max_steps": 251150, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.647492890752254e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }