{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1225, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04081632653061224, "grad_norm": 9.589848518371582, "learning_rate": 9.918367346938776e-06, "loss": 0.2612, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 10, "total_memory_available (GB)": 94.62 }, { "epoch": 0.08163265306122448, "grad_norm": 6.701302528381348, "learning_rate": 9.836734693877552e-06, "loss": 0.154, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 20, "total_memory_available (GB)": 94.62 }, { "epoch": 0.12244897959183673, "grad_norm": 5.337311267852783, "learning_rate": 9.755102040816327e-06, "loss": 0.1235, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 30, "total_memory_available (GB)": 94.62 }, { "epoch": 0.16326530612244897, "grad_norm": 4.5042338371276855, "learning_rate": 9.673469387755103e-06, "loss": 0.1096, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 40, "total_memory_available (GB)": 94.62 }, { "epoch": 0.20408163265306123, "grad_norm": 4.461822032928467, "learning_rate": 9.591836734693878e-06, "loss": 0.1196, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 50, "total_memory_available (GB)": 94.62 }, { "epoch": 0.24489795918367346, "grad_norm": 2.2825701236724854, "learning_rate": 9.510204081632653e-06, "loss": 0.0805, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 60, "total_memory_available (GB)": 94.62 }, { "epoch": 0.2857142857142857, "grad_norm": 3.725268602371216, "learning_rate": 9.42857142857143e-06, "loss": 0.1026, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 70, "total_memory_available (GB)": 94.62 }, { "epoch": 0.32653061224489793, "grad_norm": 1.707739233970642, "learning_rate": 9.346938775510204e-06, "loss": 0.1111, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 80, "total_memory_available (GB)": 94.62 }, { "epoch": 0.3673469387755102, "grad_norm": 4.5863938331604, "learning_rate": 9.26530612244898e-06, "loss": 0.0856, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 90, "total_memory_available (GB)": 94.62 }, { "epoch": 0.40816326530612246, "grad_norm": 11.972647666931152, "learning_rate": 9.183673469387756e-06, "loss": 0.0759, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 100, "total_memory_available (GB)": 94.62 }, { "epoch": 0.4489795918367347, "grad_norm": 4.550654888153076, "learning_rate": 9.102040816326532e-06, "loss": 0.0717, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 110, "total_memory_available (GB)": 94.62 }, { "epoch": 0.4897959183673469, "grad_norm": 4.418276786804199, "learning_rate": 9.020408163265307e-06, "loss": 0.0717, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 120, "total_memory_available (GB)": 94.62 }, { "epoch": 0.5306122448979592, "grad_norm": 1.651443600654602, "learning_rate": 8.938775510204082e-06, "loss": 0.0581, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 130, "total_memory_available (GB)": 94.62 }, { "epoch": 0.5714285714285714, "grad_norm": 1.5251814126968384, "learning_rate": 8.857142857142858e-06, "loss": 0.0481, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 140, "total_memory_available (GB)": 94.62 }, { "epoch": 0.6122448979591837, "grad_norm": 1.7455183267593384, "learning_rate": 8.775510204081633e-06, "loss": 0.0625, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 150, "total_memory_available (GB)": 94.62 }, { "epoch": 0.6530612244897959, "grad_norm": 1.7588891983032227, "learning_rate": 8.69387755102041e-06, "loss": 0.0711, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 160, "total_memory_available (GB)": 94.62 }, { "epoch": 0.6938775510204082, "grad_norm": 2.7675328254699707, "learning_rate": 8.612244897959184e-06, "loss": 0.0747, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 170, "total_memory_available (GB)": 94.62 }, { "epoch": 0.7346938775510204, "grad_norm": 1.781469464302063, "learning_rate": 8.530612244897961e-06, "loss": 0.061, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 180, "total_memory_available (GB)": 94.62 }, { "epoch": 0.7755102040816326, "grad_norm": 2.3728435039520264, "learning_rate": 8.448979591836736e-06, "loss": 0.0588, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 190, "total_memory_available (GB)": 94.62 }, { "epoch": 0.8163265306122449, "grad_norm": 0.8711996674537659, "learning_rate": 8.36734693877551e-06, "loss": 0.062, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 200, "total_memory_available (GB)": 94.62 }, { "epoch": 0.8571428571428571, "grad_norm": 1.1986733675003052, "learning_rate": 8.285714285714287e-06, "loss": 0.0627, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 210, "total_memory_available (GB)": 94.62 }, { "epoch": 0.8979591836734694, "grad_norm": 2.8968520164489746, "learning_rate": 8.204081632653062e-06, "loss": 0.0604, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 220, "total_memory_available (GB)": 94.62 }, { "epoch": 0.9387755102040817, "grad_norm": 0.8414793610572815, "learning_rate": 8.122448979591837e-06, "loss": 0.0559, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 230, "total_memory_available (GB)": 94.62 }, { "epoch": 0.9795918367346939, "grad_norm": 0.7434167861938477, "learning_rate": 8.040816326530613e-06, "loss": 0.0498, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 240, "total_memory_available (GB)": 94.62 }, { "epoch": 1.0204081632653061, "grad_norm": 0.8703041076660156, "learning_rate": 7.959183673469388e-06, "loss": 0.0618, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 250, "total_memory_available (GB)": 94.62 }, { "epoch": 1.0612244897959184, "grad_norm": 1.0856379270553589, "learning_rate": 7.877551020408164e-06, "loss": 0.056, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 260, "total_memory_available (GB)": 94.62 }, { "epoch": 1.1020408163265305, "grad_norm": 0.8847401142120361, "learning_rate": 7.79591836734694e-06, "loss": 0.0625, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 270, "total_memory_available (GB)": 94.62 }, { "epoch": 1.1428571428571428, "grad_norm": 1.5929882526397705, "learning_rate": 7.714285714285716e-06, "loss": 0.0571, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 280, "total_memory_available (GB)": 94.62 }, { "epoch": 1.183673469387755, "grad_norm": 0.8007532954216003, "learning_rate": 7.63265306122449e-06, "loss": 0.0511, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 290, "total_memory_available (GB)": 94.62 }, { "epoch": 1.2244897959183674, "grad_norm": 1.2002859115600586, "learning_rate": 7.551020408163265e-06, "loss": 0.065, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 300, "total_memory_available (GB)": 94.62 }, { "epoch": 1.2653061224489797, "grad_norm": 12.871713638305664, "learning_rate": 7.469387755102041e-06, "loss": 0.0664, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 310, "total_memory_available (GB)": 94.62 }, { "epoch": 1.306122448979592, "grad_norm": 2.46173357963562, "learning_rate": 7.387755102040817e-06, "loss": 0.0495, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 320, "total_memory_available (GB)": 94.62 }, { "epoch": 1.346938775510204, "grad_norm": 0.860598087310791, "learning_rate": 7.306122448979592e-06, "loss": 0.0603, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 330, "total_memory_available (GB)": 94.62 }, { "epoch": 1.3877551020408163, "grad_norm": 2.5583598613739014, "learning_rate": 7.224489795918368e-06, "loss": 0.0547, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 340, "total_memory_available (GB)": 94.62 }, { "epoch": 1.4285714285714286, "grad_norm": 0.37155964970588684, "learning_rate": 7.1428571428571436e-06, "loss": 0.048, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 350, "total_memory_available (GB)": 94.62 }, { "epoch": 1.469387755102041, "grad_norm": 1.808316707611084, "learning_rate": 7.061224489795919e-06, "loss": 0.0462, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 360, "total_memory_available (GB)": 94.62 }, { "epoch": 1.510204081632653, "grad_norm": 1.0183931589126587, "learning_rate": 6.979591836734695e-06, "loss": 0.0594, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 370, "total_memory_available (GB)": 94.62 }, { "epoch": 1.5510204081632653, "grad_norm": 0.5249583721160889, "learning_rate": 6.8979591836734705e-06, "loss": 0.0479, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 380, "total_memory_available (GB)": 94.62 }, { "epoch": 1.5918367346938775, "grad_norm": 1.1005572080612183, "learning_rate": 6.816326530612245e-06, "loss": 0.0649, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 390, "total_memory_available (GB)": 94.62 }, { "epoch": 1.6326530612244898, "grad_norm": 0.6047573089599609, "learning_rate": 6.734693877551021e-06, "loss": 0.0607, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 400, "total_memory_available (GB)": 94.62 }, { "epoch": 1.6734693877551021, "grad_norm": 0.7261654734611511, "learning_rate": 6.653061224489797e-06, "loss": 0.0606, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 410, "total_memory_available (GB)": 94.62 }, { "epoch": 1.7142857142857144, "grad_norm": 0.848527193069458, "learning_rate": 6.571428571428572e-06, "loss": 0.0532, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 420, "total_memory_available (GB)": 94.62 }, { "epoch": 1.7551020408163265, "grad_norm": 0.23483288288116455, "learning_rate": 6.489795918367348e-06, "loss": 0.068, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 430, "total_memory_available (GB)": 94.62 }, { "epoch": 1.7959183673469388, "grad_norm": 2.0767459869384766, "learning_rate": 6.408163265306124e-06, "loss": 0.0617, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 440, "total_memory_available (GB)": 94.62 }, { "epoch": 1.836734693877551, "grad_norm": 0.5654011368751526, "learning_rate": 6.326530612244899e-06, "loss": 0.044, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 450, "total_memory_available (GB)": 94.62 }, { "epoch": 1.8775510204081631, "grad_norm": 0.7382919788360596, "learning_rate": 6.244897959183675e-06, "loss": 0.0537, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 460, "total_memory_available (GB)": 94.62 }, { "epoch": 1.9183673469387754, "grad_norm": 1.3547204732894897, "learning_rate": 6.163265306122449e-06, "loss": 0.0432, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 470, "total_memory_available (GB)": 94.62 }, { "epoch": 1.9591836734693877, "grad_norm": 0.19681082665920258, "learning_rate": 6.0816326530612245e-06, "loss": 0.0498, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 480, "total_memory_available (GB)": 94.62 }, { "epoch": 2.0, "grad_norm": 1.109737515449524, "learning_rate": 6e-06, "loss": 0.0639, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 490, "total_memory_available (GB)": 94.62 }, { "epoch": 2.0408163265306123, "grad_norm": 0.5894625782966614, "learning_rate": 5.918367346938776e-06, "loss": 0.0593, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 500, "total_memory_available (GB)": 94.62 }, { "epoch": 2.0816326530612246, "grad_norm": 0.7122555375099182, "learning_rate": 5.8367346938775515e-06, "loss": 0.0498, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 510, "total_memory_available (GB)": 94.62 }, { "epoch": 2.122448979591837, "grad_norm": 0.8958902955055237, "learning_rate": 5.755102040816327e-06, "loss": 0.0457, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 520, "total_memory_available (GB)": 94.62 }, { "epoch": 2.163265306122449, "grad_norm": 11.620415687561035, "learning_rate": 5.673469387755103e-06, "loss": 0.0626, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 530, "total_memory_available (GB)": 94.62 }, { "epoch": 2.204081632653061, "grad_norm": 0.3538230061531067, "learning_rate": 5.591836734693878e-06, "loss": 0.0584, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 540, "total_memory_available (GB)": 94.62 }, { "epoch": 2.2448979591836733, "grad_norm": 1.5313146114349365, "learning_rate": 5.510204081632653e-06, "loss": 0.0627, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 550, "total_memory_available (GB)": 94.62 }, { "epoch": 2.2857142857142856, "grad_norm": 1.3519809246063232, "learning_rate": 5.428571428571429e-06, "loss": 0.0572, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 560, "total_memory_available (GB)": 94.62 }, { "epoch": 2.326530612244898, "grad_norm": 1.0263270139694214, "learning_rate": 5.3469387755102045e-06, "loss": 0.0585, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 570, "total_memory_available (GB)": 94.62 }, { "epoch": 2.36734693877551, "grad_norm": 0.8926671147346497, "learning_rate": 5.26530612244898e-06, "loss": 0.0673, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 580, "total_memory_available (GB)": 94.62 }, { "epoch": 2.4081632653061225, "grad_norm": 0.3185974955558777, "learning_rate": 5.183673469387756e-06, "loss": 0.0537, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 590, "total_memory_available (GB)": 94.62 }, { "epoch": 2.4489795918367347, "grad_norm": 0.944624662399292, "learning_rate": 5.1020408163265315e-06, "loss": 0.0442, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 600, "total_memory_available (GB)": 94.62 }, { "epoch": 2.489795918367347, "grad_norm": 0.32796111702919006, "learning_rate": 5.020408163265307e-06, "loss": 0.0413, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 610, "total_memory_available (GB)": 94.62 }, { "epoch": 2.5306122448979593, "grad_norm": 0.7929801940917969, "learning_rate": 4.938775510204082e-06, "loss": 0.0428, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 620, "total_memory_available (GB)": 94.62 }, { "epoch": 2.571428571428571, "grad_norm": 0.910254955291748, "learning_rate": 4.857142857142858e-06, "loss": 0.0813, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 630, "total_memory_available (GB)": 94.62 }, { "epoch": 2.612244897959184, "grad_norm": 1.101942539215088, "learning_rate": 4.775510204081633e-06, "loss": 0.0495, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 640, "total_memory_available (GB)": 94.62 }, { "epoch": 2.6530612244897958, "grad_norm": 0.7182526588439941, "learning_rate": 4.693877551020409e-06, "loss": 0.0471, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 650, "total_memory_available (GB)": 94.62 }, { "epoch": 2.693877551020408, "grad_norm": 0.8068158626556396, "learning_rate": 4.612244897959184e-06, "loss": 0.0469, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 660, "total_memory_available (GB)": 94.62 }, { "epoch": 2.7346938775510203, "grad_norm": 1.2375913858413696, "learning_rate": 4.530612244897959e-06, "loss": 0.0857, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 670, "total_memory_available (GB)": 94.62 }, { "epoch": 2.7755102040816326, "grad_norm": 1.1524357795715332, "learning_rate": 4.448979591836735e-06, "loss": 0.0488, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 680, "total_memory_available (GB)": 94.62 }, { "epoch": 2.816326530612245, "grad_norm": 0.3913586437702179, "learning_rate": 4.367346938775511e-06, "loss": 0.0451, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 690, "total_memory_available (GB)": 94.62 }, { "epoch": 2.857142857142857, "grad_norm": 0.47935113310813904, "learning_rate": 4.2857142857142855e-06, "loss": 0.0433, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 700, "total_memory_available (GB)": 94.62 }, { "epoch": 2.8979591836734695, "grad_norm": 0.8084143996238708, "learning_rate": 4.204081632653061e-06, "loss": 0.0548, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 710, "total_memory_available (GB)": 94.62 }, { "epoch": 2.938775510204082, "grad_norm": 1.7315497398376465, "learning_rate": 4.122448979591837e-06, "loss": 0.0587, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 720, "total_memory_available (GB)": 94.62 }, { "epoch": 2.979591836734694, "grad_norm": 0.20743349194526672, "learning_rate": 4.040816326530612e-06, "loss": 0.0342, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 730, "total_memory_available (GB)": 94.62 }, { "epoch": 3.020408163265306, "grad_norm": 0.8024761080741882, "learning_rate": 3.959183673469388e-06, "loss": 0.053, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 740, "total_memory_available (GB)": 94.62 }, { "epoch": 3.061224489795918, "grad_norm": 0.45326006412506104, "learning_rate": 3.877551020408164e-06, "loss": 0.0619, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 750, "total_memory_available (GB)": 94.62 }, { "epoch": 3.1020408163265305, "grad_norm": 0.6953087449073792, "learning_rate": 3.795918367346939e-06, "loss": 0.0527, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 760, "total_memory_available (GB)": 94.62 }, { "epoch": 3.142857142857143, "grad_norm": 1.2290390729904175, "learning_rate": 3.7142857142857146e-06, "loss": 0.0689, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 770, "total_memory_available (GB)": 94.62 }, { "epoch": 3.183673469387755, "grad_norm": 0.6281890869140625, "learning_rate": 3.6326530612244903e-06, "loss": 0.0647, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 780, "total_memory_available (GB)": 94.62 }, { "epoch": 3.2244897959183674, "grad_norm": 0.3096281588077545, "learning_rate": 3.5510204081632655e-06, "loss": 0.0522, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 790, "total_memory_available (GB)": 94.62 }, { "epoch": 3.2653061224489797, "grad_norm": 0.9390127062797546, "learning_rate": 3.469387755102041e-06, "loss": 0.0432, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 800, "total_memory_available (GB)": 94.62 }, { "epoch": 3.306122448979592, "grad_norm": 0.87565016746521, "learning_rate": 3.3877551020408168e-06, "loss": 0.0555, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 810, "total_memory_available (GB)": 94.62 }, { "epoch": 3.3469387755102042, "grad_norm": 1.0797837972640991, "learning_rate": 3.3061224489795924e-06, "loss": 0.0455, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 820, "total_memory_available (GB)": 94.62 }, { "epoch": 3.387755102040816, "grad_norm": 0.3658354878425598, "learning_rate": 3.2244897959183672e-06, "loss": 0.0487, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 830, "total_memory_available (GB)": 94.62 }, { "epoch": 3.4285714285714284, "grad_norm": 0.4766336977481842, "learning_rate": 3.142857142857143e-06, "loss": 0.053, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 840, "total_memory_available (GB)": 94.62 }, { "epoch": 3.4693877551020407, "grad_norm": 0.49318933486938477, "learning_rate": 3.0612244897959185e-06, "loss": 0.0812, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 850, "total_memory_available (GB)": 94.62 }, { "epoch": 3.510204081632653, "grad_norm": 1.3475311994552612, "learning_rate": 2.979591836734694e-06, "loss": 0.0451, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 860, "total_memory_available (GB)": 94.62 }, { "epoch": 3.5510204081632653, "grad_norm": 0.36763882637023926, "learning_rate": 2.8979591836734694e-06, "loss": 0.0646, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 870, "total_memory_available (GB)": 94.62 }, { "epoch": 3.5918367346938775, "grad_norm": 3.085198402404785, "learning_rate": 2.816326530612245e-06, "loss": 0.0439, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 880, "total_memory_available (GB)": 94.62 }, { "epoch": 3.63265306122449, "grad_norm": 0.17229312658309937, "learning_rate": 2.7346938775510207e-06, "loss": 0.0288, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 890, "total_memory_available (GB)": 94.62 }, { "epoch": 3.673469387755102, "grad_norm": 1.0760900974273682, "learning_rate": 2.6530612244897964e-06, "loss": 0.0514, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 900, "total_memory_available (GB)": 94.62 }, { "epoch": 3.7142857142857144, "grad_norm": 0.45855164527893066, "learning_rate": 2.571428571428571e-06, "loss": 0.0602, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 910, "total_memory_available (GB)": 94.62 }, { "epoch": 3.7551020408163263, "grad_norm": 0.15575875341892242, "learning_rate": 2.489795918367347e-06, "loss": 0.0543, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 920, "total_memory_available (GB)": 94.62 }, { "epoch": 3.795918367346939, "grad_norm": 0.779755175113678, "learning_rate": 2.4081632653061225e-06, "loss": 0.0497, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 930, "total_memory_available (GB)": 94.62 }, { "epoch": 3.836734693877551, "grad_norm": 0.7307060956954956, "learning_rate": 2.326530612244898e-06, "loss": 0.0486, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 940, "total_memory_available (GB)": 94.62 }, { "epoch": 3.877551020408163, "grad_norm": 1.062565803527832, "learning_rate": 2.244897959183674e-06, "loss": 0.0594, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 950, "total_memory_available (GB)": 94.62 }, { "epoch": 3.9183673469387754, "grad_norm": 0.3031039535999298, "learning_rate": 2.1632653061224495e-06, "loss": 0.0497, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 960, "total_memory_available (GB)": 94.62 }, { "epoch": 3.9591836734693877, "grad_norm": 2.310593843460083, "learning_rate": 2.0816326530612247e-06, "loss": 0.0746, "max_memory_allocated (GB)": 57.18, "memory_allocated (GB)": 50.57, "step": 970, "total_memory_available (GB)": 94.62 }, { "epoch": 4.0, "grad_norm": 0.6998704075813293, "learning_rate": 2.0000000000000003e-06, "loss": 0.0703, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 980, "total_memory_available (GB)": 94.62 }, { "epoch": 4.040816326530612, "grad_norm": 0.7492395639419556, "learning_rate": 1.9183673469387756e-06, "loss": 0.0486, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 990, "total_memory_available (GB)": 94.62 }, { "epoch": 4.081632653061225, "grad_norm": 0.7633445858955383, "learning_rate": 1.8367346938775512e-06, "loss": 0.0625, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1000, "total_memory_available (GB)": 94.62 }, { "epoch": 4.122448979591836, "grad_norm": 0.6911561489105225, "learning_rate": 1.7551020408163267e-06, "loss": 0.0632, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1010, "total_memory_available (GB)": 94.62 }, { "epoch": 4.163265306122449, "grad_norm": 0.33521902561187744, "learning_rate": 1.6734693877551023e-06, "loss": 0.0406, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1020, "total_memory_available (GB)": 94.62 }, { "epoch": 4.204081632653061, "grad_norm": 0.7509037852287292, "learning_rate": 1.5918367346938775e-06, "loss": 0.0531, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1030, "total_memory_available (GB)": 94.62 }, { "epoch": 4.244897959183674, "grad_norm": 0.5234070420265198, "learning_rate": 1.5102040816326532e-06, "loss": 0.0396, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1040, "total_memory_available (GB)": 94.62 }, { "epoch": 4.285714285714286, "grad_norm": 0.7997304797172546, "learning_rate": 1.4285714285714286e-06, "loss": 0.05, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1050, "total_memory_available (GB)": 94.62 }, { "epoch": 4.326530612244898, "grad_norm": 0.2255077213048935, "learning_rate": 1.3469387755102043e-06, "loss": 0.0457, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1060, "total_memory_available (GB)": 94.62 }, { "epoch": 4.36734693877551, "grad_norm": 0.5182124376296997, "learning_rate": 1.2653061224489795e-06, "loss": 0.0485, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1070, "total_memory_available (GB)": 94.62 }, { "epoch": 4.408163265306122, "grad_norm": 0.35046374797821045, "learning_rate": 1.1836734693877552e-06, "loss": 0.0519, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1080, "total_memory_available (GB)": 94.62 }, { "epoch": 4.448979591836735, "grad_norm": 0.3923434615135193, "learning_rate": 1.1020408163265308e-06, "loss": 0.0507, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1090, "total_memory_available (GB)": 94.62 }, { "epoch": 4.489795918367347, "grad_norm": 0.23866137862205505, "learning_rate": 1.0204081632653063e-06, "loss": 0.0362, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1100, "total_memory_available (GB)": 94.62 }, { "epoch": 4.530612244897959, "grad_norm": 0.15117916464805603, "learning_rate": 9.387755102040817e-07, "loss": 0.0464, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1110, "total_memory_available (GB)": 94.62 }, { "epoch": 4.571428571428571, "grad_norm": 0.5993088483810425, "learning_rate": 8.571428571428572e-07, "loss": 0.0404, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1120, "total_memory_available (GB)": 94.62 }, { "epoch": 4.612244897959184, "grad_norm": 0.30265432596206665, "learning_rate": 7.755102040816327e-07, "loss": 0.0545, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1130, "total_memory_available (GB)": 94.62 }, { "epoch": 4.653061224489796, "grad_norm": 0.6385183334350586, "learning_rate": 6.938775510204082e-07, "loss": 0.0731, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1140, "total_memory_available (GB)": 94.62 }, { "epoch": 4.6938775510204085, "grad_norm": 1.128566026687622, "learning_rate": 6.122448979591837e-07, "loss": 0.0516, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1150, "total_memory_available (GB)": 94.62 }, { "epoch": 4.73469387755102, "grad_norm": 1.1660116910934448, "learning_rate": 5.306122448979592e-07, "loss": 0.0611, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1160, "total_memory_available (GB)": 94.62 }, { "epoch": 4.775510204081632, "grad_norm": 0.5327439904212952, "learning_rate": 4.489795918367347e-07, "loss": 0.0549, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1170, "total_memory_available (GB)": 94.62 }, { "epoch": 4.816326530612245, "grad_norm": 0.8764423131942749, "learning_rate": 3.6734693877551025e-07, "loss": 0.0441, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1180, "total_memory_available (GB)": 94.62 }, { "epoch": 4.857142857142857, "grad_norm": 0.47835007309913635, "learning_rate": 2.8571428571428575e-07, "loss": 0.0541, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1190, "total_memory_available (GB)": 94.62 }, { "epoch": 4.8979591836734695, "grad_norm": 1.048047661781311, "learning_rate": 2.0408163265306121e-07, "loss": 0.0731, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1200, "total_memory_available (GB)": 94.62 }, { "epoch": 4.938775510204081, "grad_norm": 0.3101171851158142, "learning_rate": 1.2244897959183673e-07, "loss": 0.0648, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1210, "total_memory_available (GB)": 94.62 }, { "epoch": 4.979591836734694, "grad_norm": 0.76802659034729, "learning_rate": 4.0816326530612253e-08, "loss": 0.0418, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1220, "total_memory_available (GB)": 94.62 }, { "epoch": 5.0, "max_memory_allocated (GB)": 60.52, "memory_allocated (GB)": 50.57, "step": 1225, "total_flos": 3.0598946525952e+16, "total_memory_available (GB)": 94.62, "train_loss": 0.06085505417415074, "train_runtime": 1020.8061, "train_samples_per_second": 55.51, "train_steps_per_second": 1.389 } ], "logging_steps": 10, "max_steps": 1225, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0598946525952e+16, "train_batch_size": 40, "trial_name": null, "trial_params": null }