{ "best_metric": 4.579585075378418, "best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained/de_clm/childes_42/checkpoint-32000", "epoch": 92.24318658280923, "eval_steps": 2000, "global_step": 88000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.0964360587002098, "eval_loss": 7.0908098220825195, "eval_runtime": 0.9834, "eval_samples_per_second": 1398.23, "eval_steps_per_second": 87.453, "step": 2000 }, { "epoch": 4.1928721174004195, "grad_norm": 1.4699604511260986, "learning_rate": 1e-05, "loss": 6.9765, "step": 4000 }, { "epoch": 4.1928721174004195, "eval_loss": 5.874638557434082, "eval_runtime": 0.973, "eval_samples_per_second": 1413.214, "eval_steps_per_second": 88.39, "step": 4000 }, { "epoch": 6.289308176100629, "eval_loss": 5.544075965881348, "eval_runtime": 0.9898, "eval_samples_per_second": 1389.228, "eval_steps_per_second": 86.89, "step": 6000 }, { "epoch": 8.385744234800839, "grad_norm": 2.451873779296875, "learning_rate": 1.9997500000000003e-05, "loss": 5.2182, "step": 8000 }, { "epoch": 8.385744234800839, "eval_loss": 5.278811454772949, "eval_runtime": 0.9809, "eval_samples_per_second": 1401.823, "eval_steps_per_second": 87.678, "step": 8000 }, { "epoch": 10.482180293501049, "eval_loss": 5.099233150482178, "eval_runtime": 0.9694, "eval_samples_per_second": 1418.396, "eval_steps_per_second": 88.714, "step": 10000 }, { "epoch": 12.578616352201259, "grad_norm": 2.490121841430664, "learning_rate": 2.9995e-05, "loss": 4.7379, "step": 12000 }, { "epoch": 12.578616352201259, "eval_loss": 4.970981597900391, "eval_runtime": 0.9729, "eval_samples_per_second": 1413.251, "eval_steps_per_second": 88.392, "step": 12000 }, { "epoch": 14.675052410901468, "eval_loss": 4.875853061676025, "eval_runtime": 0.9902, "eval_samples_per_second": 1388.55, "eval_steps_per_second": 86.848, "step": 14000 }, { "epoch": 16.771488469601678, "grad_norm": 2.485079765319824, "learning_rate": 3.99925e-05, "loss": 4.4249, "step": 16000 }, { "epoch": 16.771488469601678, "eval_loss": 4.8005266189575195, "eval_runtime": 0.9709, "eval_samples_per_second": 1416.144, "eval_steps_per_second": 88.573, "step": 16000 }, { "epoch": 18.867924528301888, "eval_loss": 4.743557453155518, "eval_runtime": 0.9764, "eval_samples_per_second": 1408.167, "eval_steps_per_second": 88.074, "step": 18000 }, { "epoch": 20.964360587002098, "grad_norm": 2.311079978942871, "learning_rate": 4.99875e-05, "loss": 4.1842, "step": 20000 }, { "epoch": 20.964360587002098, "eval_loss": 4.692162990570068, "eval_runtime": 0.9728, "eval_samples_per_second": 1413.396, "eval_steps_per_second": 88.401, "step": 20000 }, { "epoch": 23.060796645702307, "eval_loss": 4.648115634918213, "eval_runtime": 0.9711, "eval_samples_per_second": 1415.941, "eval_steps_per_second": 88.561, "step": 22000 }, { "epoch": 25.157232704402517, "grad_norm": 2.3314199447631836, "learning_rate": 5.998250000000001e-05, "loss": 3.9843, "step": 24000 }, { "epoch": 25.157232704402517, "eval_loss": 4.615537166595459, "eval_runtime": 0.9745, "eval_samples_per_second": 1410.994, "eval_steps_per_second": 88.251, "step": 24000 }, { "epoch": 27.253668763102727, "eval_loss": 4.59817361831665, "eval_runtime": 0.9725, "eval_samples_per_second": 1413.828, "eval_steps_per_second": 88.429, "step": 26000 }, { "epoch": 29.350104821802937, "grad_norm": 2.2022252082824707, "learning_rate": 6.99775e-05, "loss": 3.8181, "step": 28000 }, { "epoch": 29.350104821802937, "eval_loss": 4.584521293640137, "eval_runtime": 0.9704, "eval_samples_per_second": 1416.917, "eval_steps_per_second": 88.622, "step": 28000 }, { "epoch": 31.446540880503143, "eval_loss": 4.581137657165527, "eval_runtime": 0.9781, "eval_samples_per_second": 1405.736, "eval_steps_per_second": 87.922, "step": 30000 }, { "epoch": 33.542976939203356, "grad_norm": 2.109912872314453, "learning_rate": 7.997250000000001e-05, "loss": 3.6751, "step": 32000 }, { "epoch": 33.542976939203356, "eval_loss": 4.579585075378418, "eval_runtime": 0.9725, "eval_samples_per_second": 1413.852, "eval_steps_per_second": 88.43, "step": 32000 }, { "epoch": 35.63941299790356, "eval_loss": 4.582756042480469, "eval_runtime": 0.9724, "eval_samples_per_second": 1413.996, "eval_steps_per_second": 88.439, "step": 34000 }, { "epoch": 37.735849056603776, "grad_norm": 2.1550486087799072, "learning_rate": 8.996750000000001e-05, "loss": 3.5484, "step": 36000 }, { "epoch": 37.735849056603776, "eval_loss": 4.586859226226807, "eval_runtime": 0.979, "eval_samples_per_second": 1404.481, "eval_steps_per_second": 87.844, "step": 36000 }, { "epoch": 39.83228511530398, "eval_loss": 4.597586154937744, "eval_runtime": 0.9841, "eval_samples_per_second": 1397.167, "eval_steps_per_second": 87.386, "step": 38000 }, { "epoch": 41.928721174004195, "grad_norm": 2.2574758529663086, "learning_rate": 9.99625e-05, "loss": 3.4328, "step": 40000 }, { "epoch": 41.928721174004195, "eval_loss": 4.6090407371521, "eval_runtime": 0.9945, "eval_samples_per_second": 1382.571, "eval_steps_per_second": 86.474, "step": 40000 }, { "epoch": 44.0251572327044, "eval_loss": 4.629756450653076, "eval_runtime": 0.9745, "eval_samples_per_second": 1410.989, "eval_steps_per_second": 88.251, "step": 42000 }, { "epoch": 46.121593291404615, "grad_norm": 2.2658331394195557, "learning_rate": 9.336166666666667e-05, "loss": 3.31, "step": 44000 }, { "epoch": 46.121593291404615, "eval_loss": 4.659794330596924, "eval_runtime": 0.9812, "eval_samples_per_second": 1401.309, "eval_steps_per_second": 87.645, "step": 44000 }, { "epoch": 48.21802935010482, "eval_loss": 4.698253631591797, "eval_runtime": 0.981, "eval_samples_per_second": 1401.638, "eval_steps_per_second": 87.666, "step": 46000 }, { "epoch": 50.314465408805034, "grad_norm": 2.4032843112945557, "learning_rate": 8.669833333333334e-05, "loss": 3.1908, "step": 48000 }, { "epoch": 50.314465408805034, "eval_loss": 4.72628927230835, "eval_runtime": 0.9912, "eval_samples_per_second": 1387.274, "eval_steps_per_second": 86.768, "step": 48000 }, { "epoch": 52.41090146750524, "eval_loss": 4.762426376342773, "eval_runtime": 0.9771, "eval_samples_per_second": 1407.288, "eval_steps_per_second": 88.019, "step": 50000 }, { "epoch": 54.507337526205454, "grad_norm": 2.641611337661743, "learning_rate": 8.0035e-05, "loss": 3.0864, "step": 52000 }, { "epoch": 54.507337526205454, "eval_loss": 4.79134464263916, "eval_runtime": 0.9732, "eval_samples_per_second": 1412.893, "eval_steps_per_second": 88.37, "step": 52000 }, { "epoch": 56.60377358490566, "eval_loss": 4.826347827911377, "eval_runtime": 0.9719, "eval_samples_per_second": 1414.733, "eval_steps_per_second": 88.485, "step": 54000 }, { "epoch": 58.700209643605874, "grad_norm": 2.6470491886138916, "learning_rate": 7.337166666666667e-05, "loss": 2.993, "step": 56000 }, { "epoch": 58.700209643605874, "eval_loss": 4.853786945343018, "eval_runtime": 0.975, "eval_samples_per_second": 1410.209, "eval_steps_per_second": 88.202, "step": 56000 }, { "epoch": 60.79664570230608, "eval_loss": 4.8770365715026855, "eval_runtime": 0.9725, "eval_samples_per_second": 1413.926, "eval_steps_per_second": 88.435, "step": 58000 }, { "epoch": 62.893081761006286, "grad_norm": 2.761204957962036, "learning_rate": 6.670833333333333e-05, "loss": 2.9108, "step": 60000 }, { "epoch": 62.893081761006286, "eval_loss": 4.909708499908447, "eval_runtime": 0.9753, "eval_samples_per_second": 1409.824, "eval_steps_per_second": 88.178, "step": 60000 }, { "epoch": 64.98951781970649, "eval_loss": 4.94859504699707, "eval_runtime": 0.9735, "eval_samples_per_second": 1412.386, "eval_steps_per_second": 88.338, "step": 62000 }, { "epoch": 67.08595387840671, "grad_norm": 3.005192279815674, "learning_rate": 6.0045000000000005e-05, "loss": 2.8352, "step": 64000 }, { "epoch": 67.08595387840671, "eval_loss": 4.992880821228027, "eval_runtime": 0.9755, "eval_samples_per_second": 1409.563, "eval_steps_per_second": 88.162, "step": 64000 }, { "epoch": 69.18238993710692, "eval_loss": 5.033893585205078, "eval_runtime": 0.9733, "eval_samples_per_second": 1412.778, "eval_steps_per_second": 88.363, "step": 66000 }, { "epoch": 71.27882599580713, "grad_norm": 2.9778764247894287, "learning_rate": 5.338166666666668e-05, "loss": 2.7677, "step": 68000 }, { "epoch": 71.27882599580713, "eval_loss": 5.051624774932861, "eval_runtime": 0.9743, "eval_samples_per_second": 1411.21, "eval_steps_per_second": 88.265, "step": 68000 }, { "epoch": 73.37526205450733, "eval_loss": 5.0868730545043945, "eval_runtime": 0.9734, "eval_samples_per_second": 1412.585, "eval_steps_per_second": 88.351, "step": 70000 }, { "epoch": 75.47169811320755, "grad_norm": 3.199427366256714, "learning_rate": 4.6718333333333336e-05, "loss": 2.708, "step": 72000 }, { "epoch": 75.47169811320755, "eval_loss": 5.107797622680664, "eval_runtime": 0.9734, "eval_samples_per_second": 1412.585, "eval_steps_per_second": 88.351, "step": 72000 }, { "epoch": 77.56813417190776, "eval_loss": 5.131660461425781, "eval_runtime": 0.9755, "eval_samples_per_second": 1409.548, "eval_steps_per_second": 88.161, "step": 74000 }, { "epoch": 79.66457023060796, "grad_norm": 3.2438466548919678, "learning_rate": 4.0055e-05, "loss": 2.6552, "step": 76000 }, { "epoch": 79.66457023060796, "eval_loss": 5.159754276275635, "eval_runtime": 0.9741, "eval_samples_per_second": 1411.574, "eval_steps_per_second": 88.288, "step": 76000 }, { "epoch": 81.76100628930817, "eval_loss": 5.17739200592041, "eval_runtime": 0.9699, "eval_samples_per_second": 1417.607, "eval_steps_per_second": 88.665, "step": 78000 }, { "epoch": 83.85744234800839, "grad_norm": 3.3774938583374023, "learning_rate": 3.339166666666667e-05, "loss": 2.6082, "step": 80000 }, { "epoch": 83.85744234800839, "eval_loss": 5.192777156829834, "eval_runtime": 0.9709, "eval_samples_per_second": 1416.242, "eval_steps_per_second": 88.579, "step": 80000 }, { "epoch": 85.9538784067086, "eval_loss": 5.227346897125244, "eval_runtime": 0.9893, "eval_samples_per_second": 1389.87, "eval_steps_per_second": 86.93, "step": 82000 }, { "epoch": 88.0503144654088, "grad_norm": 3.3834476470947266, "learning_rate": 2.6728333333333333e-05, "loss": 2.5633, "step": 84000 }, { "epoch": 88.0503144654088, "eval_loss": 5.249727725982666, "eval_runtime": 0.9723, "eval_samples_per_second": 1414.208, "eval_steps_per_second": 88.452, "step": 84000 }, { "epoch": 90.14675052410901, "eval_loss": 5.264400959014893, "eval_runtime": 0.9715, "eval_samples_per_second": 1415.317, "eval_steps_per_second": 88.522, "step": 86000 }, { "epoch": 92.24318658280923, "grad_norm": 3.5035271644592285, "learning_rate": 2.0065000000000002e-05, "loss": 2.5227, "step": 88000 }, { "epoch": 92.24318658280923, "eval_loss": 5.28403902053833, "eval_runtime": 0.977, "eval_samples_per_second": 1407.338, "eval_steps_per_second": 88.023, "step": 88000 } ], "logging_steps": 4000, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 105, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2753828744495104e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }