{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": 2.61431884765625, "accuracy": 0.4765625, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 42.59496307373047, "learning_rate": 1.5000000000000002e-07, "loss": 0.8092, "step": 1 }, { "Batch Mean": 2.574005126953125, "accuracy": 0.578125, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 46.709930419921875, "learning_rate": 3.0000000000000004e-07, "loss": 0.7799, "step": 2 }, { "Batch Mean": 2.560516357421875, "accuracy": 0.484375, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 44.51314163208008, "learning_rate": 4.5e-07, "loss": 0.8043, "step": 3 }, { "Batch Mean": 2.6197509765625, "accuracy": 0.5, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 42.86121368408203, "learning_rate": 6.000000000000001e-07, "loss": 0.8014, "step": 4 }, { "Batch Mean": 2.565338134765625, "accuracy": 0.53125, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 44.409908294677734, "learning_rate": 7.5e-07, "loss": 0.7983, "step": 5 }, { "Batch Mean": 2.522857666015625, "accuracy": 0.484375, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 43.77534484863281, "learning_rate": 9e-07, "loss": 0.7975, "step": 6 }, { "Batch Mean": 2.3875732421875, "accuracy": 0.4921875, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 41.981407165527344, "learning_rate": 1.05e-06, "loss": 0.7951, "step": 7 }, { "Batch Mean": 2.317047119140625, "accuracy": 0.515625, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 40.53751754760742, "learning_rate": 1.2000000000000002e-06, "loss": 0.791, "step": 8 }, { "Batch Mean": 1.757080078125, "accuracy": 0.5078125, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 31.105939865112305, "learning_rate": 1.35e-06, "loss": 0.7571, "step": 9 }, { "Batch Mean": 1.6220855712890625, "accuracy": 0.578125, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 33.61161804199219, "learning_rate": 1.5e-06, "loss": 0.7234, "step": 10 }, { "Batch Mean": 1.400146484375, "accuracy": 0.53125, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 34.554622650146484, "learning_rate": 1.65e-06, "loss": 0.725, "step": 11 }, { "Batch Mean": -0.43697381019592285, "accuracy": 0.5234375, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 12.222326278686523, "learning_rate": 1.8e-06, "loss": 0.6986, "step": 12 }, { "Batch Mean": -0.8854951858520508, "accuracy": 0.5234375, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 16.398786544799805, "learning_rate": 1.95e-06, "loss": 0.7081, "step": 13 }, { "Batch Mean": -0.9508838653564453, "accuracy": 0.640625, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 19.590002059936523, "learning_rate": 2.1e-06, "loss": 0.6784, "step": 14 }, { "Batch Mean": -1.0560526847839355, "accuracy": 0.4921875, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 22.669742584228516, "learning_rate": 2.25e-06, "loss": 0.7235, "step": 15 }, { "Batch Mean": -1.1405725479125977, "accuracy": 0.625, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 22.189626693725586, "learning_rate": 2.4000000000000003e-06, "loss": 0.6714, "step": 16 }, { "Batch Mean": -1.1522831916809082, "accuracy": 0.640625, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 19.9060115814209, "learning_rate": 2.55e-06, "loss": 0.6757, "step": 17 }, { "Batch Mean": -1.1666946411132812, "accuracy": 0.53125, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 23.661216735839844, "learning_rate": 2.7e-06, "loss": 0.7002, "step": 18 }, { "Batch Mean": -0.999359130859375, "accuracy": 0.6640625, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 15.433859825134277, "learning_rate": 2.85e-06, "loss": 0.6471, "step": 19 }, { "Batch Mean": -0.6625549793243408, "accuracy": 0.6171875, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 13.135390281677246, "learning_rate": 3e-06, "loss": 0.662, "step": 20 }, { "Batch Mean": -0.13487493991851807, "accuracy": 0.6171875, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 7.336737632751465, "learning_rate": 2.992105263157895e-06, "loss": 0.629, "step": 21 }, { "Batch Mean": 0.2938040494918823, "accuracy": 0.6640625, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 9.586155891418457, "learning_rate": 2.9842105263157896e-06, "loss": 0.6225, "step": 22 }, { "Batch Mean": 0.6780391931533813, "accuracy": 0.7421875, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 14.825817108154297, "learning_rate": 2.9763157894736843e-06, "loss": 0.607, "step": 23 }, { "Batch Mean": 0.9421095848083496, "accuracy": 0.609375, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 16.7529354095459, "learning_rate": 2.968421052631579e-06, "loss": 0.6542, "step": 24 }, { "Batch Mean": 1.0722179412841797, "accuracy": 0.71875, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 18.940032958984375, "learning_rate": 2.960526315789474e-06, "loss": 0.5882, "step": 25 }, { "Batch Mean": 0.8396664261817932, "accuracy": 0.6484375, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 14.680763244628906, "learning_rate": 2.9526315789473685e-06, "loss": 0.6463, "step": 26 }, { "Batch Mean": 0.2986793518066406, "accuracy": 0.671875, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 7.3362345695495605, "learning_rate": 2.9447368421052633e-06, "loss": 0.5923, "step": 27 }, { "Batch Mean": -0.29196763038635254, "accuracy": 0.734375, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 8.053507804870605, "learning_rate": 2.936842105263158e-06, "loss": 0.5261, "step": 28 }, { "Batch Mean": -0.8161113262176514, "accuracy": 0.7109375, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 14.036270141601562, "learning_rate": 2.9289473684210528e-06, "loss": 0.5482, "step": 29 }, { "Batch Mean": -0.7824737429618835, "accuracy": 0.75, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 13.984892845153809, "learning_rate": 2.9210526315789475e-06, "loss": 0.5939, "step": 30 }, { "Batch Mean": -0.4927825927734375, "accuracy": 0.734375, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 10.751565933227539, "learning_rate": 2.9131578947368423e-06, "loss": 0.5901, "step": 31 }, { "Batch Mean": 0.15497040748596191, "accuracy": 0.6796875, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 8.30102252960205, "learning_rate": 2.905263157894737e-06, "loss": 0.5731, "step": 32 }, { "Batch Mean": 0.16421844065189362, "accuracy": 0.7578125, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 9.087366104125977, "learning_rate": 2.8973684210526318e-06, "loss": 0.4958, "step": 33 }, { "Batch Mean": 0.3596491813659668, "accuracy": 0.859375, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 9.984371185302734, "learning_rate": 2.8894736842105265e-06, "loss": 0.4321, "step": 34 }, { "Batch Mean": 0.2128266543149948, "accuracy": 0.734375, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 7.400077819824219, "learning_rate": 2.8815789473684213e-06, "loss": 0.5178, "step": 35 }, { "Batch Mean": 0.10449030995368958, "accuracy": 0.7578125, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 6.669457912445068, "learning_rate": 2.873684210526316e-06, "loss": 0.4914, "step": 36 }, { "Batch Mean": -0.24382781982421875, "accuracy": 0.6953125, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 10.831747055053711, "learning_rate": 2.8657894736842103e-06, "loss": 0.5305, "step": 37 }, { "Batch Mean": -0.29154396057128906, "accuracy": 0.7109375, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 11.060967445373535, "learning_rate": 2.857894736842105e-06, "loss": 0.5589, "step": 38 }, { "Batch Mean": -0.07830595970153809, "accuracy": 0.765625, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 8.426444053649902, "learning_rate": 2.85e-06, "loss": 0.4663, "step": 39 }, { "Batch Mean": -0.09856069087982178, "accuracy": 0.7890625, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 11.046717643737793, "learning_rate": 2.8421052631578946e-06, "loss": 0.455, "step": 40 }, { "Batch Mean": 0.29904642701148987, "accuracy": 0.8125, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 11.232060432434082, "learning_rate": 2.8342105263157897e-06, "loss": 0.437, "step": 41 }, { "Batch Mean": 0.43527090549468994, "accuracy": 0.78125, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 12.555906295776367, "learning_rate": 2.8263157894736845e-06, "loss": 0.5316, "step": 42 }, { "Batch Mean": 0.02639901638031006, "accuracy": 0.8046875, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 9.500948905944824, "learning_rate": 2.8184210526315792e-06, "loss": 0.4874, "step": 43 }, { "Batch Mean": -0.3524761199951172, "accuracy": 0.7421875, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 10.417457580566406, "learning_rate": 2.810526315789474e-06, "loss": 0.5011, "step": 44 }, { "Batch Mean": -0.32391130924224854, "accuracy": 0.8125, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 9.867128372192383, "learning_rate": 2.8026315789473687e-06, "loss": 0.4908, "step": 45 }, { "Batch Mean": -0.42060422897338867, "accuracy": 0.8125, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 10.613035202026367, "learning_rate": 2.7947368421052635e-06, "loss": 0.4642, "step": 46 }, { "Batch Mean": 0.04145359992980957, "accuracy": 0.7890625, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 12.13752555847168, "learning_rate": 2.7868421052631578e-06, "loss": 0.4943, "step": 47 }, { "Batch Mean": 0.24553179740905762, "accuracy": 0.8046875, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 11.048128128051758, "learning_rate": 2.7789473684210525e-06, "loss": 0.3946, "step": 48 }, { "Batch Mean": -0.05052506923675537, "accuracy": 0.796875, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 9.053565979003906, "learning_rate": 2.7710526315789473e-06, "loss": 0.4474, "step": 49 }, { "Batch Mean": -0.43336963653564453, "accuracy": 0.7265625, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 15.04668140411377, "learning_rate": 2.763157894736842e-06, "loss": 0.5496, "step": 50 }, { "Batch Mean": -0.29952335357666016, "accuracy": 0.765625, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 11.302435874938965, "learning_rate": 2.7552631578947368e-06, "loss": 0.4337, "step": 51 }, { "Batch Mean": -0.01873302459716797, "accuracy": 0.828125, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 10.582301139831543, "learning_rate": 2.7473684210526315e-06, "loss": 0.4073, "step": 52 }, { "Batch Mean": 0.4117751717567444, "accuracy": 0.8046875, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 9.776220321655273, "learning_rate": 2.7394736842105263e-06, "loss": 0.4235, "step": 53 }, { "Batch Mean": 0.5990171432495117, "accuracy": 0.78125, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 12.647154808044434, "learning_rate": 2.7315789473684214e-06, "loss": 0.4858, "step": 54 }, { "Batch Mean": 0.2538492679595947, "accuracy": 0.7265625, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 11.564103126525879, "learning_rate": 2.723684210526316e-06, "loss": 0.5233, "step": 55 }, { "Batch Mean": 0.17059040069580078, "accuracy": 0.7578125, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 10.589515686035156, "learning_rate": 2.715789473684211e-06, "loss": 0.4762, "step": 56 }, { "Batch Mean": -0.3642357587814331, "accuracy": 0.78125, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 10.727502822875977, "learning_rate": 2.7078947368421052e-06, "loss": 0.5289, "step": 57 }, { "Batch Mean": -0.5980481505393982, "accuracy": 0.8203125, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 8.915611267089844, "learning_rate": 2.7e-06, "loss": 0.4038, "step": 58 }, { "Batch Mean": -0.46013855934143066, "accuracy": 0.859375, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 9.178686141967773, "learning_rate": 2.6921052631578947e-06, "loss": 0.3708, "step": 59 }, { "Batch Mean": -0.1478586494922638, "accuracy": 0.84375, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 6.339688301086426, "learning_rate": 2.6842105263157895e-06, "loss": 0.4036, "step": 60 }, { "Batch Mean": 0.003389716148376465, "accuracy": 0.7578125, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 6.763300895690918, "learning_rate": 2.6763157894736842e-06, "loss": 0.5163, "step": 61 }, { "Batch Mean": 0.4018087387084961, "accuracy": 0.7734375, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 8.631525039672852, "learning_rate": 2.668421052631579e-06, "loss": 0.4219, "step": 62 }, { "Batch Mean": 0.342004656791687, "accuracy": 0.734375, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 9.190362930297852, "learning_rate": 2.6605263157894737e-06, "loss": 0.4708, "step": 63 }, { "Batch Mean": 0.5178697109222412, "accuracy": 0.8125, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 10.207452774047852, "learning_rate": 2.6526315789473685e-06, "loss": 0.4458, "step": 64 }, { "Batch Mean": 0.03012150526046753, "accuracy": 0.8125, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 8.050507545471191, "learning_rate": 2.644736842105263e-06, "loss": 0.4344, "step": 65 }, { "Batch Mean": -0.3691895604133606, "accuracy": 0.78125, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 11.163555145263672, "learning_rate": 2.636842105263158e-06, "loss": 0.4276, "step": 66 }, { "Batch Mean": -0.2624788284301758, "accuracy": 0.8125, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 9.523059844970703, "learning_rate": 2.6289473684210527e-06, "loss": 0.4001, "step": 67 }, { "Batch Mean": -0.0055138468742370605, "accuracy": 0.8046875, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 9.094910621643066, "learning_rate": 2.6210526315789474e-06, "loss": 0.4024, "step": 68 }, { "Batch Mean": 0.2572704553604126, "accuracy": 0.7734375, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 10.00645923614502, "learning_rate": 2.613157894736842e-06, "loss": 0.4802, "step": 69 }, { "Batch Mean": 0.5112218856811523, "accuracy": 0.8046875, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 11.776649475097656, "learning_rate": 2.605263157894737e-06, "loss": 0.4194, "step": 70 }, { "Batch Mean": 0.29661768674850464, "accuracy": 0.8125, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 9.701162338256836, "learning_rate": 2.5973684210526317e-06, "loss": 0.4356, "step": 71 }, { "Batch Mean": 0.049837589263916016, "accuracy": 0.859375, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 8.692646980285645, "learning_rate": 2.5894736842105264e-06, "loss": 0.2881, "step": 72 }, { "Batch Mean": -0.3786022663116455, "accuracy": 0.8125, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 10.834145545959473, "learning_rate": 2.581578947368421e-06, "loss": 0.4654, "step": 73 }, { "Batch Mean": -0.20977401733398438, "accuracy": 0.8515625, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 7.816598892211914, "learning_rate": 2.573684210526316e-06, "loss": 0.4101, "step": 74 }, { "Batch Mean": -0.6599991321563721, "accuracy": 0.8203125, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 13.089007377624512, "learning_rate": 2.5657894736842107e-06, "loss": 0.427, "step": 75 }, { "Batch Mean": -0.37617337703704834, "accuracy": 0.7734375, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 12.075210571289062, "learning_rate": 2.5578947368421054e-06, "loss": 0.4797, "step": 76 }, { "Batch Mean": 0.1281442642211914, "accuracy": 0.8046875, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 9.70462703704834, "learning_rate": 2.55e-06, "loss": 0.4425, "step": 77 }, { "Batch Mean": 0.6071650981903076, "accuracy": 0.828125, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 14.395353317260742, "learning_rate": 2.542105263157895e-06, "loss": 0.4016, "step": 78 }, { "Batch Mean": 0.34730714559555054, "accuracy": 0.8203125, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 9.122536659240723, "learning_rate": 2.5342105263157892e-06, "loss": 0.3994, "step": 79 }, { "Batch Mean": 0.36483922600746155, "accuracy": 0.828125, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 10.289307594299316, "learning_rate": 2.526315789473684e-06, "loss": 0.368, "step": 80 }, { "Batch Mean": -0.20039799809455872, "accuracy": 0.8125, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 7.813342094421387, "learning_rate": 2.5184210526315787e-06, "loss": 0.3679, "step": 81 }, { "Batch Mean": -0.1629079282283783, "accuracy": 0.8203125, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 7.504952430725098, "learning_rate": 2.510526315789474e-06, "loss": 0.3823, "step": 82 }, { "Batch Mean": -0.07863587141036987, "accuracy": 0.8046875, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 7.721461296081543, "learning_rate": 2.5026315789473686e-06, "loss": 0.3967, "step": 83 }, { "Batch Mean": 0.17194491624832153, "accuracy": 0.8125, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 8.062063217163086, "learning_rate": 2.4947368421052634e-06, "loss": 0.4195, "step": 84 }, { "Batch Mean": 0.11748838424682617, "accuracy": 0.8359375, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 8.320355415344238, "learning_rate": 2.486842105263158e-06, "loss": 0.3864, "step": 85 }, { "Batch Mean": -0.40801382064819336, "accuracy": 0.7734375, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 14.8066987991333, "learning_rate": 2.478947368421053e-06, "loss": 0.4639, "step": 86 }, { "Batch Mean": -0.35811758041381836, "accuracy": 0.7734375, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 14.074177742004395, "learning_rate": 2.4710526315789476e-06, "loss": 0.4999, "step": 87 }, { "Batch Mean": 0.278584361076355, "accuracy": 0.7734375, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 10.59004020690918, "learning_rate": 2.4631578947368424e-06, "loss": 0.404, "step": 88 }, { "Batch Mean": -0.21054387092590332, "accuracy": 0.84375, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 8.488351821899414, "learning_rate": 2.4552631578947367e-06, "loss": 0.3517, "step": 89 }, { "Batch Mean": -0.22967231273651123, "accuracy": 0.84375, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 10.347502708435059, "learning_rate": 2.4473684210526314e-06, "loss": 0.3382, "step": 90 }, { "Batch Mean": 0.1892259418964386, "accuracy": 0.7890625, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 10.092534065246582, "learning_rate": 2.439473684210526e-06, "loss": 0.4586, "step": 91 }, { "Batch Mean": 0.516247570514679, "accuracy": 0.8828125, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 13.944212913513184, "learning_rate": 2.431578947368421e-06, "loss": 0.3067, "step": 92 }, { "Batch Mean": 0.056681275367736816, "accuracy": 0.7890625, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 8.53646183013916, "learning_rate": 2.4236842105263157e-06, "loss": 0.3946, "step": 93 }, { "Batch Mean": -0.47129684686660767, "accuracy": 0.8359375, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 11.441537857055664, "learning_rate": 2.4157894736842104e-06, "loss": 0.3891, "step": 94 }, { "Batch Mean": -0.3262195587158203, "accuracy": 0.84375, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 10.751629829406738, "learning_rate": 2.4078947368421056e-06, "loss": 0.3578, "step": 95 }, { "Batch Mean": -0.6593484878540039, "accuracy": 0.828125, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 15.914350509643555, "learning_rate": 2.4000000000000003e-06, "loss": 0.4204, "step": 96 }, { "Batch Mean": -0.32147085666656494, "accuracy": 0.8359375, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 11.814691543579102, "learning_rate": 2.392105263157895e-06, "loss": 0.3892, "step": 97 }, { "Batch Mean": 0.49291136860847473, "accuracy": 0.875, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 11.86365795135498, "learning_rate": 2.38421052631579e-06, "loss": 0.3264, "step": 98 }, { "Batch Mean": 0.49174070358276367, "accuracy": 0.8515625, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 10.874544143676758, "learning_rate": 2.376315789473684e-06, "loss": 0.3847, "step": 99 }, { "Batch Mean": 0.6919900178909302, "accuracy": 0.796875, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 15.105185508728027, "learning_rate": 2.368421052631579e-06, "loss": 0.4066, "step": 100 }, { "Batch Mean": 0.061200618743896484, "accuracy": 0.7734375, "epoch": 0.25, "step": 100 }, { "epoch": 0.2525, "grad_norm": 8.795661926269531, "learning_rate": 2.3605263157894736e-06, "loss": 0.4193, "step": 101 }, { "Batch Mean": -0.4929161071777344, "accuracy": 0.7890625, "epoch": 0.2525, "step": 101 }, { "epoch": 0.255, "grad_norm": 12.120612144470215, "learning_rate": 2.3526315789473684e-06, "loss": 0.4833, "step": 102 }, { "Batch Mean": -0.48431068658828735, "accuracy": 0.859375, "epoch": 0.255, "step": 102 }, { "epoch": 0.2575, "grad_norm": 10.263632774353027, "learning_rate": 2.344736842105263e-06, "loss": 0.3397, "step": 103 }, { "Batch Mean": -0.2162787914276123, "accuracy": 0.8203125, "epoch": 0.2575, "step": 103 }, { "epoch": 0.26, "grad_norm": 7.323156356811523, "learning_rate": 2.336842105263158e-06, "loss": 0.3688, "step": 104 }, { "Batch Mean": 0.2203059196472168, "accuracy": 0.8515625, "epoch": 0.26, "step": 104 }, { "epoch": 0.2625, "grad_norm": 7.128310203552246, "learning_rate": 2.3289473684210526e-06, "loss": 0.3454, "step": 105 }, { "Batch Mean": 0.19034957885742188, "accuracy": 0.8828125, "epoch": 0.2625, "step": 105 }, { "epoch": 0.265, "grad_norm": 7.388917446136475, "learning_rate": 2.3210526315789473e-06, "loss": 0.3317, "step": 106 }, { "Batch Mean": -0.0413057804107666, "accuracy": 0.796875, "epoch": 0.265, "step": 106 }, { "epoch": 0.2675, "grad_norm": 9.412615776062012, "learning_rate": 2.313157894736842e-06, "loss": 0.4393, "step": 107 }, { "Batch Mean": 0.473476767539978, "accuracy": 0.875, "epoch": 0.2675, "step": 107 }, { "epoch": 0.27, "grad_norm": 11.040026664733887, "learning_rate": 2.305263157894737e-06, "loss": 0.35, "step": 108 }, { "Batch Mean": 0.37884294986724854, "accuracy": 0.875, "epoch": 0.27, "step": 108 }, { "epoch": 0.2725, "grad_norm": 12.01086711883545, "learning_rate": 2.2973684210526316e-06, "loss": 0.3001, "step": 109 }, { "Batch Mean": 0.4409904479980469, "accuracy": 0.8828125, "epoch": 0.2725, "step": 109 }, { "epoch": 0.275, "grad_norm": 12.842291831970215, "learning_rate": 2.2894736842105263e-06, "loss": 0.2767, "step": 110 }, { "Batch Mean": -0.12960529327392578, "accuracy": 0.7890625, "epoch": 0.275, "step": 110 }, { "epoch": 0.2775, "grad_norm": 11.811309814453125, "learning_rate": 2.281578947368421e-06, "loss": 0.4361, "step": 111 }, { "Batch Mean": -0.3953084945678711, "accuracy": 0.7578125, "epoch": 0.2775, "step": 111 }, { "epoch": 0.28, "grad_norm": 14.014053344726562, "learning_rate": 2.273684210526316e-06, "loss": 0.5185, "step": 112 }, { "Batch Mean": -0.16724836826324463, "accuracy": 0.859375, "epoch": 0.28, "step": 112 }, { "epoch": 0.2825, "grad_norm": 7.608469486236572, "learning_rate": 2.2657894736842106e-06, "loss": 0.3033, "step": 113 }, { "Batch Mean": -0.36876964569091797, "accuracy": 0.8203125, "epoch": 0.2825, "step": 113 }, { "epoch": 0.285, "grad_norm": 11.263389587402344, "learning_rate": 2.2578947368421053e-06, "loss": 0.4069, "step": 114 }, { "Batch Mean": 0.22658658027648926, "accuracy": 0.7734375, "epoch": 0.285, "step": 114 }, { "epoch": 0.2875, "grad_norm": 11.426595687866211, "learning_rate": 2.25e-06, "loss": 0.4412, "step": 115 }, { "Batch Mean": 0.5346584320068359, "accuracy": 0.8359375, "epoch": 0.2875, "step": 115 }, { "epoch": 0.29, "grad_norm": 11.347992897033691, "learning_rate": 2.242105263157895e-06, "loss": 0.3996, "step": 116 }, { "Batch Mean": -0.40567731857299805, "accuracy": 0.8671875, "epoch": 0.29, "step": 116 }, { "epoch": 0.2925, "grad_norm": 9.157360076904297, "learning_rate": 2.2342105263157895e-06, "loss": 0.3318, "step": 117 }, { "Batch Mean": -0.1567370891571045, "accuracy": 0.8046875, "epoch": 0.2925, "step": 117 }, { "epoch": 0.295, "grad_norm": 12.001374244689941, "learning_rate": 2.2263157894736843e-06, "loss": 0.4401, "step": 118 }, { "Batch Mean": -0.357755184173584, "accuracy": 0.8671875, "epoch": 0.295, "step": 118 }, { "epoch": 0.2975, "grad_norm": 8.508570671081543, "learning_rate": 2.218421052631579e-06, "loss": 0.3305, "step": 119 }, { "Batch Mean": -0.35381609201431274, "accuracy": 0.8046875, "epoch": 0.2975, "step": 119 }, { "epoch": 0.3, "grad_norm": 8.094682693481445, "learning_rate": 2.2105263157894738e-06, "loss": 0.3984, "step": 120 }, { "Batch Mean": 0.22805237770080566, "accuracy": 0.8515625, "epoch": 0.3, "step": 120 }, { "epoch": 0.3025, "grad_norm": 9.788799285888672, "learning_rate": 2.2026315789473685e-06, "loss": 0.3478, "step": 121 }, { "Batch Mean": 0.5038647651672363, "accuracy": 0.859375, "epoch": 0.3025, "step": 121 }, { "epoch": 0.305, "grad_norm": 9.84559440612793, "learning_rate": 2.1947368421052633e-06, "loss": 0.3561, "step": 122 }, { "Batch Mean": 0.7104835510253906, "accuracy": 0.828125, "epoch": 0.305, "step": 122 }, { "epoch": 0.3075, "grad_norm": 11.43686580657959, "learning_rate": 2.186842105263158e-06, "loss": 0.3686, "step": 123 }, { "Batch Mean": -0.007034778594970703, "accuracy": 0.796875, "epoch": 0.3075, "step": 123 }, { "epoch": 0.31, "grad_norm": 8.770352363586426, "learning_rate": 2.1789473684210528e-06, "loss": 0.3772, "step": 124 }, { "Batch Mean": 0.036529541015625, "accuracy": 0.7890625, "epoch": 0.31, "step": 124 }, { "epoch": 0.3125, "grad_norm": 13.17721939086914, "learning_rate": 2.1710526315789475e-06, "loss": 0.5148, "step": 125 }, { "Batch Mean": -0.30262577533721924, "accuracy": 0.875, "epoch": 0.3125, "step": 125 }, { "epoch": 0.315, "grad_norm": 8.691415786743164, "learning_rate": 2.1631578947368423e-06, "loss": 0.3671, "step": 126 }, { "Batch Mean": -0.12425446510314941, "accuracy": 0.8515625, "epoch": 0.315, "step": 126 }, { "epoch": 0.3175, "grad_norm": 8.049582481384277, "learning_rate": 2.155263157894737e-06, "loss": 0.3522, "step": 127 }, { "Batch Mean": -0.4603520631790161, "accuracy": 0.765625, "epoch": 0.3175, "step": 127 }, { "epoch": 0.32, "grad_norm": 10.966066360473633, "learning_rate": 2.1473684210526317e-06, "loss": 0.4656, "step": 128 }, { "Batch Mean": 0.0716085433959961, "accuracy": 0.796875, "epoch": 0.32, "step": 128 }, { "epoch": 0.3225, "grad_norm": 7.68458890914917, "learning_rate": 2.1394736842105265e-06, "loss": 0.3586, "step": 129 }, { "Batch Mean": 0.22112131118774414, "accuracy": 0.7890625, "epoch": 0.3225, "step": 129 }, { "epoch": 0.325, "grad_norm": 8.140663146972656, "learning_rate": 2.1315789473684212e-06, "loss": 0.4103, "step": 130 }, { "Batch Mean": 0.1299583911895752, "accuracy": 0.8125, "epoch": 0.325, "step": 130 }, { "epoch": 0.3275, "grad_norm": 7.390633583068848, "learning_rate": 2.123684210526316e-06, "loss": 0.3914, "step": 131 }, { "Batch Mean": 0.608335018157959, "accuracy": 0.921875, "epoch": 0.3275, "step": 131 }, { "epoch": 0.33, "grad_norm": 13.193960189819336, "learning_rate": 2.1157894736842103e-06, "loss": 0.2749, "step": 132 }, { "Batch Mean": 0.15748775005340576, "accuracy": 0.7578125, "epoch": 0.33, "step": 132 }, { "epoch": 0.3325, "grad_norm": 8.155619621276855, "learning_rate": 2.107894736842105e-06, "loss": 0.4138, "step": 133 }, { "Batch Mean": -0.010571569204330444, "accuracy": 0.796875, "epoch": 0.3325, "step": 133 }, { "epoch": 0.335, "grad_norm": 7.174466609954834, "learning_rate": 2.1e-06, "loss": 0.3603, "step": 134 }, { "Batch Mean": -0.17839515209197998, "accuracy": 0.8125, "epoch": 0.335, "step": 134 }, { "epoch": 0.3375, "grad_norm": 9.18850326538086, "learning_rate": 2.0921052631578945e-06, "loss": 0.3665, "step": 135 }, { "Batch Mean": -0.0516209602355957, "accuracy": 0.7578125, "epoch": 0.3375, "step": 135 }, { "epoch": 0.34, "grad_norm": 8.749617576599121, "learning_rate": 2.0842105263157897e-06, "loss": 0.4511, "step": 136 }, { "Batch Mean": -0.33418262004852295, "accuracy": 0.796875, "epoch": 0.34, "step": 136 }, { "epoch": 0.3425, "grad_norm": 10.268221855163574, "learning_rate": 2.0763157894736845e-06, "loss": 0.4395, "step": 137 }, { "Batch Mean": -0.208038330078125, "accuracy": 0.8515625, "epoch": 0.3425, "step": 137 }, { "epoch": 0.345, "grad_norm": 6.962666034698486, "learning_rate": 2.068421052631579e-06, "loss": 0.3682, "step": 138 }, { "Batch Mean": 0.1473088264465332, "accuracy": 0.8046875, "epoch": 0.345, "step": 138 }, { "epoch": 0.3475, "grad_norm": 8.331689834594727, "learning_rate": 2.060526315789474e-06, "loss": 0.3881, "step": 139 }, { "Batch Mean": 0.13666605949401855, "accuracy": 0.7734375, "epoch": 0.3475, "step": 139 }, { "epoch": 0.35, "grad_norm": 8.181924819946289, "learning_rate": 2.0526315789473687e-06, "loss": 0.4522, "step": 140 }, { "Batch Mean": -0.06362247467041016, "accuracy": 0.828125, "epoch": 0.35, "step": 140 }, { "epoch": 0.3525, "grad_norm": 7.695260047912598, "learning_rate": 2.0447368421052634e-06, "loss": 0.3219, "step": 141 }, { "Batch Mean": -0.1371777057647705, "accuracy": 0.8203125, "epoch": 0.3525, "step": 141 }, { "epoch": 0.355, "grad_norm": 7.281587600708008, "learning_rate": 2.0368421052631578e-06, "loss": 0.3907, "step": 142 }, { "Batch Mean": 0.5550127029418945, "accuracy": 0.84375, "epoch": 0.355, "step": 142 }, { "epoch": 0.3575, "grad_norm": 10.004101753234863, "learning_rate": 2.0289473684210525e-06, "loss": 0.3703, "step": 143 }, { "Batch Mean": 0.5781314373016357, "accuracy": 0.8828125, "epoch": 0.3575, "step": 143 }, { "epoch": 0.36, "grad_norm": 10.783740997314453, "learning_rate": 2.0210526315789473e-06, "loss": 0.3023, "step": 144 }, { "Batch Mean": -0.06548458337783813, "accuracy": 0.796875, "epoch": 0.36, "step": 144 }, { "epoch": 0.3625, "grad_norm": 8.395689010620117, "learning_rate": 2.013157894736842e-06, "loss": 0.3807, "step": 145 }, { "Batch Mean": -0.28943490982055664, "accuracy": 0.8125, "epoch": 0.3625, "step": 145 }, { "epoch": 0.365, "grad_norm": 8.962904930114746, "learning_rate": 2.0052631578947367e-06, "loss": 0.4011, "step": 146 }, { "Batch Mean": -0.23036813735961914, "accuracy": 0.8359375, "epoch": 0.365, "step": 146 }, { "epoch": 0.3675, "grad_norm": 8.533259391784668, "learning_rate": 1.9973684210526315e-06, "loss": 0.4078, "step": 147 }, { "Batch Mean": 0.04428744316101074, "accuracy": 0.8203125, "epoch": 0.3675, "step": 147 }, { "epoch": 0.37, "grad_norm": 9.401562690734863, "learning_rate": 1.9894736842105262e-06, "loss": 0.4861, "step": 148 }, { "Batch Mean": -0.631324052810669, "accuracy": 0.84375, "epoch": 0.37, "step": 148 }, { "epoch": 0.3725, "grad_norm": 11.169927597045898, "learning_rate": 1.9815789473684214e-06, "loss": 0.353, "step": 149 }, { "Batch Mean": -0.36252474784851074, "accuracy": 0.875, "epoch": 0.3725, "step": 149 }, { "epoch": 0.375, "grad_norm": 8.18213939666748, "learning_rate": 1.973684210526316e-06, "loss": 0.3192, "step": 150 }, { "Batch Mean": 0.1312960386276245, "accuracy": 0.8515625, "epoch": 0.375, "step": 150 }, { "epoch": 0.3775, "grad_norm": 7.884857654571533, "learning_rate": 1.965789473684211e-06, "loss": 0.3448, "step": 151 }, { "Batch Mean": -0.07881736755371094, "accuracy": 0.75, "epoch": 0.3775, "step": 151 }, { "epoch": 0.38, "grad_norm": 11.124417304992676, "learning_rate": 1.9578947368421052e-06, "loss": 0.5138, "step": 152 }, { "Batch Mean": 0.44060420989990234, "accuracy": 0.84375, "epoch": 0.38, "step": 152 }, { "epoch": 0.3825, "grad_norm": 8.98283576965332, "learning_rate": 1.95e-06, "loss": 0.3417, "step": 153 }, { "Batch Mean": 0.09381070733070374, "accuracy": 0.84375, "epoch": 0.3825, "step": 153 }, { "epoch": 0.385, "grad_norm": 6.34003210067749, "learning_rate": 1.9421052631578947e-06, "loss": 0.3136, "step": 154 }, { "Batch Mean": -0.005020737648010254, "accuracy": 0.8359375, "epoch": 0.385, "step": 154 }, { "epoch": 0.3875, "grad_norm": 6.930245399475098, "learning_rate": 1.9342105263157895e-06, "loss": 0.3927, "step": 155 }, { "Batch Mean": 0.38732171058654785, "accuracy": 0.859375, "epoch": 0.3875, "step": 155 }, { "epoch": 0.39, "grad_norm": 7.609610080718994, "learning_rate": 1.926315789473684e-06, "loss": 0.3484, "step": 156 }, { "Batch Mean": 0.1457352638244629, "accuracy": 0.859375, "epoch": 0.39, "step": 156 }, { "epoch": 0.3925, "grad_norm": 6.132632732391357, "learning_rate": 1.918421052631579e-06, "loss": 0.3313, "step": 157 }, { "Batch Mean": -0.12105239927768707, "accuracy": 0.859375, "epoch": 0.3925, "step": 157 }, { "epoch": 0.395, "grad_norm": 6.948753356933594, "learning_rate": 1.9105263157894737e-06, "loss": 0.3103, "step": 158 }, { "Batch Mean": -0.2733367681503296, "accuracy": 0.8125, "epoch": 0.395, "step": 158 }, { "epoch": 0.3975, "grad_norm": 8.32777214050293, "learning_rate": 1.9026315789473684e-06, "loss": 0.3927, "step": 159 }, { "Batch Mean": -0.2845277786254883, "accuracy": 0.8046875, "epoch": 0.3975, "step": 159 }, { "epoch": 0.4, "grad_norm": 8.584951400756836, "learning_rate": 1.8947368421052632e-06, "loss": 0.3756, "step": 160 }, { "Batch Mean": 0.45987796783447266, "accuracy": 0.8828125, "epoch": 0.4, "step": 160 }, { "epoch": 0.4025, "grad_norm": 11.069204330444336, "learning_rate": 1.8868421052631577e-06, "loss": 0.3221, "step": 161 }, { "Batch Mean": 0.05443763732910156, "accuracy": 0.8046875, "epoch": 0.4025, "step": 161 }, { "epoch": 0.405, "grad_norm": 8.764575958251953, "learning_rate": 1.8789473684210525e-06, "loss": 0.3762, "step": 162 }, { "Batch Mean": 0.0884392112493515, "accuracy": 0.8203125, "epoch": 0.405, "step": 162 }, { "epoch": 0.4075, "grad_norm": 8.510726928710938, "learning_rate": 1.8710526315789476e-06, "loss": 0.3937, "step": 163 }, { "Batch Mean": 0.06999784708023071, "accuracy": 0.859375, "epoch": 0.4075, "step": 163 }, { "epoch": 0.41, "grad_norm": 7.868905067443848, "learning_rate": 1.8631578947368424e-06, "loss": 0.2751, "step": 164 }, { "Batch Mean": 0.36394214630126953, "accuracy": 0.828125, "epoch": 0.41, "step": 164 }, { "epoch": 0.4125, "grad_norm": 12.59380054473877, "learning_rate": 1.855263157894737e-06, "loss": 0.399, "step": 165 }, { "Batch Mean": -0.03789353370666504, "accuracy": 0.8203125, "epoch": 0.4125, "step": 165 }, { "epoch": 0.415, "grad_norm": 8.086974143981934, "learning_rate": 1.8473684210526317e-06, "loss": 0.3637, "step": 166 }, { "Batch Mean": 0.06824040412902832, "accuracy": 0.8515625, "epoch": 0.415, "step": 166 }, { "epoch": 0.4175, "grad_norm": 9.723653793334961, "learning_rate": 1.8394736842105264e-06, "loss": 0.394, "step": 167 }, { "Batch Mean": -0.5270748138427734, "accuracy": 0.8203125, "epoch": 0.4175, "step": 167 }, { "epoch": 0.42, "grad_norm": 13.736504554748535, "learning_rate": 1.8315789473684211e-06, "loss": 0.429, "step": 168 }, { "Batch Mean": -0.12757277488708496, "accuracy": 0.8359375, "epoch": 0.42, "step": 168 }, { "epoch": 0.4225, "grad_norm": 8.577059745788574, "learning_rate": 1.8236842105263159e-06, "loss": 0.3599, "step": 169 }, { "Batch Mean": -0.024989277124404907, "accuracy": 0.8203125, "epoch": 0.4225, "step": 169 }, { "epoch": 0.425, "grad_norm": 8.25197982788086, "learning_rate": 1.8157894736842106e-06, "loss": 0.4185, "step": 170 }, { "Batch Mean": 0.03246608376502991, "accuracy": 0.875, "epoch": 0.425, "step": 170 }, { "epoch": 0.4275, "grad_norm": 5.9294819831848145, "learning_rate": 1.8078947368421052e-06, "loss": 0.2621, "step": 171 }, { "Batch Mean": 0.26308655738830566, "accuracy": 0.828125, "epoch": 0.4275, "step": 171 }, { "epoch": 0.43, "grad_norm": 7.570163249969482, "learning_rate": 1.8e-06, "loss": 0.3638, "step": 172 }, { "Batch Mean": -0.25519466400146484, "accuracy": 0.828125, "epoch": 0.43, "step": 172 }, { "epoch": 0.4325, "grad_norm": 7.611836910247803, "learning_rate": 1.7921052631578947e-06, "loss": 0.3867, "step": 173 }, { "Batch Mean": -0.01856723427772522, "accuracy": 0.8515625, "epoch": 0.4325, "step": 173 }, { "epoch": 0.435, "grad_norm": 7.200859546661377, "learning_rate": 1.7842105263157894e-06, "loss": 0.3596, "step": 174 }, { "Batch Mean": -0.06785082817077637, "accuracy": 0.8125, "epoch": 0.435, "step": 174 }, { "epoch": 0.4375, "grad_norm": 8.669882774353027, "learning_rate": 1.7763157894736842e-06, "loss": 0.4487, "step": 175 }, { "Batch Mean": -0.3832893371582031, "accuracy": 0.828125, "epoch": 0.4375, "step": 175 }, { "epoch": 0.44, "grad_norm": 8.320089340209961, "learning_rate": 1.768421052631579e-06, "loss": 0.4209, "step": 176 }, { "Batch Mean": -0.08297932147979736, "accuracy": 0.8203125, "epoch": 0.44, "step": 176 }, { "epoch": 0.4425, "grad_norm": 7.819711208343506, "learning_rate": 1.7605263157894739e-06, "loss": 0.3551, "step": 177 }, { "Batch Mean": 0.37918123602867126, "accuracy": 0.8125, "epoch": 0.4425, "step": 177 }, { "epoch": 0.445, "grad_norm": 8.0424165725708, "learning_rate": 1.7526315789473686e-06, "loss": 0.3594, "step": 178 }, { "Batch Mean": 0.05135989189147949, "accuracy": 0.890625, "epoch": 0.445, "step": 178 }, { "epoch": 0.4475, "grad_norm": 6.7139811515808105, "learning_rate": 1.7447368421052633e-06, "loss": 0.3117, "step": 179 }, { "Batch Mean": 0.06295812129974365, "accuracy": 0.828125, "epoch": 0.4475, "step": 179 }, { "epoch": 0.45, "grad_norm": 7.755770683288574, "learning_rate": 1.736842105263158e-06, "loss": 0.396, "step": 180 }, { "Batch Mean": -0.19158035516738892, "accuracy": 0.8671875, "epoch": 0.45, "step": 180 }, { "epoch": 0.4525, "grad_norm": 6.9720778465271, "learning_rate": 1.7289473684210526e-06, "loss": 0.3034, "step": 181 }, { "Batch Mean": 0.045849740505218506, "accuracy": 0.859375, "epoch": 0.4525, "step": 181 }, { "epoch": 0.455, "grad_norm": 6.858092784881592, "learning_rate": 1.7210526315789474e-06, "loss": 0.3629, "step": 182 }, { "Batch Mean": 0.46304988861083984, "accuracy": 0.8125, "epoch": 0.455, "step": 182 }, { "epoch": 0.4575, "grad_norm": 8.993224143981934, "learning_rate": 1.7131578947368421e-06, "loss": 0.3538, "step": 183 }, { "Batch Mean": 0.057231903076171875, "accuracy": 0.7890625, "epoch": 0.4575, "step": 183 }, { "epoch": 0.46, "grad_norm": 11.233847618103027, "learning_rate": 1.7052631578947369e-06, "loss": 0.4784, "step": 184 }, { "Batch Mean": -0.06488056480884552, "accuracy": 0.859375, "epoch": 0.46, "step": 184 }, { "epoch": 0.4625, "grad_norm": 8.108458518981934, "learning_rate": 1.6973684210526316e-06, "loss": 0.2988, "step": 185 }, { "Batch Mean": -0.4033060073852539, "accuracy": 0.78125, "epoch": 0.4625, "step": 185 }, { "epoch": 0.465, "grad_norm": 9.097691535949707, "learning_rate": 1.6894736842105264e-06, "loss": 0.4141, "step": 186 }, { "Batch Mean": -0.41214847564697266, "accuracy": 0.8828125, "epoch": 0.465, "step": 186 }, { "epoch": 0.4675, "grad_norm": 8.145352363586426, "learning_rate": 1.6815789473684209e-06, "loss": 0.3051, "step": 187 }, { "Batch Mean": 0.14345920085906982, "accuracy": 0.8671875, "epoch": 0.4675, "step": 187 }, { "epoch": 0.47, "grad_norm": 7.163969039916992, "learning_rate": 1.6736842105263156e-06, "loss": 0.3405, "step": 188 }, { "Batch Mean": 0.25019216537475586, "accuracy": 0.8359375, "epoch": 0.47, "step": 188 }, { "epoch": 0.4725, "grad_norm": 8.14911937713623, "learning_rate": 1.6657894736842104e-06, "loss": 0.3334, "step": 189 }, { "Batch Mean": 0.030582189559936523, "accuracy": 0.8203125, "epoch": 0.4725, "step": 189 }, { "epoch": 0.475, "grad_norm": 7.993282318115234, "learning_rate": 1.6578947368421056e-06, "loss": 0.3885, "step": 190 }, { "Batch Mean": 0.14673519134521484, "accuracy": 0.8125, "epoch": 0.475, "step": 190 }, { "epoch": 0.4775, "grad_norm": 9.056065559387207, "learning_rate": 1.65e-06, "loss": 0.4084, "step": 191 }, { "Batch Mean": 0.3741884231567383, "accuracy": 0.875, "epoch": 0.4775, "step": 191 }, { "epoch": 0.48, "grad_norm": 8.857176780700684, "learning_rate": 1.6421052631578948e-06, "loss": 0.3182, "step": 192 }, { "Batch Mean": 0.41150104999542236, "accuracy": 0.859375, "epoch": 0.48, "step": 192 }, { "epoch": 0.4825, "grad_norm": 8.556700706481934, "learning_rate": 1.6342105263157896e-06, "loss": 0.3261, "step": 193 }, { "Batch Mean": -0.4138193130493164, "accuracy": 0.8359375, "epoch": 0.4825, "step": 193 }, { "epoch": 0.485, "grad_norm": 7.911004543304443, "learning_rate": 1.6263157894736843e-06, "loss": 0.3395, "step": 194 }, { "Batch Mean": -0.7141661643981934, "accuracy": 0.765625, "epoch": 0.485, "step": 194 }, { "epoch": 0.4875, "grad_norm": 10.288289070129395, "learning_rate": 1.618421052631579e-06, "loss": 0.4338, "step": 195 }, { "Batch Mean": -0.5950565338134766, "accuracy": 0.8203125, "epoch": 0.4875, "step": 195 }, { "epoch": 0.49, "grad_norm": 9.772676467895508, "learning_rate": 1.6105263157894738e-06, "loss": 0.4012, "step": 196 }, { "Batch Mean": -0.40017926692962646, "accuracy": 0.84375, "epoch": 0.49, "step": 196 }, { "epoch": 0.4925, "grad_norm": 7.894969463348389, "learning_rate": 1.6026315789473683e-06, "loss": 0.363, "step": 197 }, { "Batch Mean": 0.4873809814453125, "accuracy": 0.8203125, "epoch": 0.4925, "step": 197 }, { "epoch": 0.495, "grad_norm": 9.627370834350586, "learning_rate": 1.594736842105263e-06, "loss": 0.3737, "step": 198 }, { "Batch Mean": 0.2109694480895996, "accuracy": 0.765625, "epoch": 0.495, "step": 198 }, { "epoch": 0.4975, "grad_norm": 8.528368949890137, "learning_rate": 1.5868421052631578e-06, "loss": 0.4261, "step": 199 }, { "Batch Mean": 0.3447328805923462, "accuracy": 0.8125, "epoch": 0.4975, "step": 199 }, { "epoch": 0.5, "grad_norm": 8.279525756835938, "learning_rate": 1.5789473684210526e-06, "loss": 0.3857, "step": 200 }, { "Batch Mean": 0.6082854270935059, "accuracy": 0.84375, "epoch": 0.5, "step": 200 }, { "epoch": 0.5025, "grad_norm": 9.466031074523926, "learning_rate": 1.5710526315789473e-06, "loss": 0.3566, "step": 201 }, { "Batch Mean": -0.13963961601257324, "accuracy": 0.8125, "epoch": 0.5025, "step": 201 }, { "epoch": 0.505, "grad_norm": 6.909806728363037, "learning_rate": 1.563157894736842e-06, "loss": 0.3895, "step": 202 }, { "Batch Mean": 0.03664124011993408, "accuracy": 0.8828125, "epoch": 0.505, "step": 202 }, { "epoch": 0.5075, "grad_norm": 5.393540859222412, "learning_rate": 1.5552631578947368e-06, "loss": 0.3268, "step": 203 }, { "Batch Mean": -0.344998836517334, "accuracy": 0.8359375, "epoch": 0.5075, "step": 203 }, { "epoch": 0.51, "grad_norm": 8.25485610961914, "learning_rate": 1.5473684210526318e-06, "loss": 0.4146, "step": 204 }, { "Batch Mean": -0.7813053131103516, "accuracy": 0.8203125, "epoch": 0.51, "step": 204 }, { "epoch": 0.5125, "grad_norm": 11.657013893127441, "learning_rate": 1.5394736842105265e-06, "loss": 0.4102, "step": 205 }, { "Batch Mean": -0.26070213317871094, "accuracy": 0.890625, "epoch": 0.5125, "step": 205 }, { "epoch": 0.515, "grad_norm": 7.036439895629883, "learning_rate": 1.5315789473684213e-06, "loss": 0.291, "step": 206 }, { "Batch Mean": 0.03567647933959961, "accuracy": 0.796875, "epoch": 0.515, "step": 206 }, { "epoch": 0.5175, "grad_norm": 8.311566352844238, "learning_rate": 1.5236842105263158e-06, "loss": 0.4407, "step": 207 }, { "Batch Mean": 0.23694515228271484, "accuracy": 0.8984375, "epoch": 0.5175, "step": 207 }, { "epoch": 0.52, "grad_norm": 6.446564674377441, "learning_rate": 1.5157894736842105e-06, "loss": 0.2909, "step": 208 }, { "Batch Mean": 0.6833584308624268, "accuracy": 0.84375, "epoch": 0.52, "step": 208 }, { "epoch": 0.5225, "grad_norm": 10.551989555358887, "learning_rate": 1.5078947368421053e-06, "loss": 0.3614, "step": 209 }, { "Batch Mean": 0.660728931427002, "accuracy": 0.7734375, "epoch": 0.5225, "step": 209 }, { "epoch": 0.525, "grad_norm": 11.83906078338623, "learning_rate": 1.5e-06, "loss": 0.4109, "step": 210 }, { "Batch Mean": 0.08896183967590332, "accuracy": 0.8125, "epoch": 0.525, "step": 210 }, { "epoch": 0.5275, "grad_norm": 8.59549617767334, "learning_rate": 1.4921052631578948e-06, "loss": 0.4581, "step": 211 }, { "Batch Mean": -0.11360204219818115, "accuracy": 0.8203125, "epoch": 0.5275, "step": 211 }, { "epoch": 0.53, "grad_norm": 6.57830810546875, "learning_rate": 1.4842105263157895e-06, "loss": 0.35, "step": 212 }, { "Batch Mean": -0.4756917953491211, "accuracy": 0.859375, "epoch": 0.53, "step": 212 }, { "epoch": 0.5325, "grad_norm": 9.362006187438965, "learning_rate": 1.4763157894736843e-06, "loss": 0.3195, "step": 213 }, { "Batch Mean": -0.2329235076904297, "accuracy": 0.8515625, "epoch": 0.5325, "step": 213 }, { "epoch": 0.535, "grad_norm": 8.77215576171875, "learning_rate": 1.468421052631579e-06, "loss": 0.4045, "step": 214 }, { "Batch Mean": 0.06287053227424622, "accuracy": 0.8359375, "epoch": 0.535, "step": 214 }, { "epoch": 0.5375, "grad_norm": 6.861894607543945, "learning_rate": 1.4605263157894738e-06, "loss": 0.3336, "step": 215 }, { "Batch Mean": -0.24506235122680664, "accuracy": 0.8515625, "epoch": 0.5375, "step": 215 }, { "epoch": 0.54, "grad_norm": 8.678897857666016, "learning_rate": 1.4526315789473685e-06, "loss": 0.3562, "step": 216 }, { "Batch Mean": 0.07905793190002441, "accuracy": 0.8359375, "epoch": 0.54, "step": 216 }, { "epoch": 0.5425, "grad_norm": 7.401151657104492, "learning_rate": 1.4447368421052633e-06, "loss": 0.3627, "step": 217 }, { "Batch Mean": 0.15029168128967285, "accuracy": 0.8046875, "epoch": 0.5425, "step": 217 }, { "epoch": 0.545, "grad_norm": 7.813758373260498, "learning_rate": 1.436842105263158e-06, "loss": 0.3648, "step": 218 }, { "Batch Mean": 0.002633899450302124, "accuracy": 0.8984375, "epoch": 0.545, "step": 218 }, { "epoch": 0.5475, "grad_norm": 7.126307487487793, "learning_rate": 1.4289473684210525e-06, "loss": 0.2894, "step": 219 }, { "Batch Mean": -0.1388775110244751, "accuracy": 0.84375, "epoch": 0.5475, "step": 219 }, { "epoch": 0.55, "grad_norm": 7.792306900024414, "learning_rate": 1.4210526315789473e-06, "loss": 0.3971, "step": 220 }, { "Batch Mean": 0.39450502395629883, "accuracy": 0.890625, "epoch": 0.55, "step": 220 }, { "epoch": 0.5525, "grad_norm": 8.030144691467285, "learning_rate": 1.4131578947368422e-06, "loss": 0.289, "step": 221 }, { "Batch Mean": 0.05488854646682739, "accuracy": 0.8515625, "epoch": 0.5525, "step": 221 }, { "epoch": 0.555, "grad_norm": 7.672267913818359, "learning_rate": 1.405263157894737e-06, "loss": 0.3637, "step": 222 }, { "Batch Mean": -0.231512188911438, "accuracy": 0.8671875, "epoch": 0.555, "step": 222 }, { "epoch": 0.5575, "grad_norm": 8.242493629455566, "learning_rate": 1.3973684210526317e-06, "loss": 0.2882, "step": 223 }, { "Batch Mean": -0.3896869421005249, "accuracy": 0.8515625, "epoch": 0.5575, "step": 223 }, { "epoch": 0.56, "grad_norm": 8.707738876342773, "learning_rate": 1.3894736842105263e-06, "loss": 0.3375, "step": 224 }, { "Batch Mean": -0.06315040588378906, "accuracy": 0.859375, "epoch": 0.56, "step": 224 }, { "epoch": 0.5625, "grad_norm": 7.783517360687256, "learning_rate": 1.381578947368421e-06, "loss": 0.3369, "step": 225 }, { "Batch Mean": 0.7080554962158203, "accuracy": 0.8046875, "epoch": 0.5625, "step": 225 }, { "epoch": 0.565, "grad_norm": 13.766866683959961, "learning_rate": 1.3736842105263158e-06, "loss": 0.4215, "step": 226 }, { "Batch Mean": 0.6378374099731445, "accuracy": 0.8203125, "epoch": 0.565, "step": 226 }, { "epoch": 0.5675, "grad_norm": 11.578181266784668, "learning_rate": 1.3657894736842107e-06, "loss": 0.4144, "step": 227 }, { "Batch Mean": 0.37397003173828125, "accuracy": 0.8359375, "epoch": 0.5675, "step": 227 }, { "epoch": 0.57, "grad_norm": 8.984515190124512, "learning_rate": 1.3578947368421055e-06, "loss": 0.3828, "step": 228 }, { "Batch Mean": -0.1430950164794922, "accuracy": 0.90625, "epoch": 0.57, "step": 228 }, { "epoch": 0.5725, "grad_norm": 6.43174409866333, "learning_rate": 1.35e-06, "loss": 0.3035, "step": 229 }, { "Batch Mean": -0.2629528045654297, "accuracy": 0.875, "epoch": 0.5725, "step": 229 }, { "epoch": 0.575, "grad_norm": 7.405312538146973, "learning_rate": 1.3421052631578947e-06, "loss": 0.3615, "step": 230 }, { "Batch Mean": -0.6128553152084351, "accuracy": 0.7734375, "epoch": 0.575, "step": 230 }, { "epoch": 0.5775, "grad_norm": 12.709471702575684, "learning_rate": 1.3342105263157895e-06, "loss": 0.4879, "step": 231 }, { "Batch Mean": -0.30329418182373047, "accuracy": 0.828125, "epoch": 0.5775, "step": 231 }, { "epoch": 0.58, "grad_norm": 8.190142631530762, "learning_rate": 1.3263157894736842e-06, "loss": 0.3596, "step": 232 }, { "Batch Mean": 0.236228346824646, "accuracy": 0.7734375, "epoch": 0.58, "step": 232 }, { "epoch": 0.5825, "grad_norm": 9.241979598999023, "learning_rate": 1.318421052631579e-06, "loss": 0.4575, "step": 233 }, { "Batch Mean": 0.2605457305908203, "accuracy": 0.8046875, "epoch": 0.5825, "step": 233 }, { "epoch": 0.585, "grad_norm": 7.555475234985352, "learning_rate": 1.3105263157894737e-06, "loss": 0.3833, "step": 234 }, { "Batch Mean": 0.248213529586792, "accuracy": 0.921875, "epoch": 0.585, "step": 234 }, { "epoch": 0.5875, "grad_norm": 7.177881240844727, "learning_rate": 1.3026315789473685e-06, "loss": 0.2338, "step": 235 }, { "Batch Mean": -0.26292717456817627, "accuracy": 0.8828125, "epoch": 0.5875, "step": 235 }, { "epoch": 0.59, "grad_norm": 7.436389446258545, "learning_rate": 1.2947368421052632e-06, "loss": 0.3097, "step": 236 }, { "Batch Mean": -0.24770569801330566, "accuracy": 0.8671875, "epoch": 0.59, "step": 236 }, { "epoch": 0.5925, "grad_norm": 7.95246696472168, "learning_rate": 1.286842105263158e-06, "loss": 0.3513, "step": 237 }, { "Batch Mean": -0.38290679454803467, "accuracy": 0.796875, "epoch": 0.5925, "step": 237 }, { "epoch": 0.595, "grad_norm": 8.727229118347168, "learning_rate": 1.2789473684210527e-06, "loss": 0.4006, "step": 238 }, { "Batch Mean": 0.005364656448364258, "accuracy": 0.8671875, "epoch": 0.595, "step": 238 }, { "epoch": 0.5975, "grad_norm": 5.793431758880615, "learning_rate": 1.2710526315789474e-06, "loss": 0.2995, "step": 239 }, { "Batch Mean": 0.05780482292175293, "accuracy": 0.8828125, "epoch": 0.5975, "step": 239 }, { "epoch": 0.6, "grad_norm": 7.411877632141113, "learning_rate": 1.263157894736842e-06, "loss": 0.3011, "step": 240 }, { "Batch Mean": 0.402042418718338, "accuracy": 0.828125, "epoch": 0.6, "step": 240 }, { "epoch": 0.6025, "grad_norm": 8.23151969909668, "learning_rate": 1.255263157894737e-06, "loss": 0.3518, "step": 241 }, { "Batch Mean": 0.38238781690597534, "accuracy": 0.84375, "epoch": 0.6025, "step": 241 }, { "epoch": 0.605, "grad_norm": 8.42182731628418, "learning_rate": 1.2473684210526317e-06, "loss": 0.3087, "step": 242 }, { "Batch Mean": 0.5328927040100098, "accuracy": 0.8203125, "epoch": 0.605, "step": 242 }, { "epoch": 0.6075, "grad_norm": 9.953737258911133, "learning_rate": 1.2394736842105264e-06, "loss": 0.3409, "step": 243 }, { "Batch Mean": 0.19218015670776367, "accuracy": 0.78125, "epoch": 0.6075, "step": 243 }, { "epoch": 0.61, "grad_norm": 8.72748851776123, "learning_rate": 1.2315789473684212e-06, "loss": 0.4004, "step": 244 }, { "Batch Mean": -0.012094497680664062, "accuracy": 0.828125, "epoch": 0.61, "step": 244 }, { "epoch": 0.6125, "grad_norm": 7.34162712097168, "learning_rate": 1.2236842105263157e-06, "loss": 0.3568, "step": 245 }, { "Batch Mean": -0.23955392837524414, "accuracy": 0.796875, "epoch": 0.6125, "step": 245 }, { "epoch": 0.615, "grad_norm": 9.275582313537598, "learning_rate": 1.2157894736842105e-06, "loss": 0.462, "step": 246 }, { "Batch Mean": -1.1516103744506836, "accuracy": 0.7734375, "epoch": 0.615, "step": 246 }, { "epoch": 0.6175, "grad_norm": 16.910037994384766, "learning_rate": 1.2078947368421052e-06, "loss": 0.4268, "step": 247 }, { "Batch Mean": -0.5064811706542969, "accuracy": 0.8203125, "epoch": 0.6175, "step": 247 }, { "epoch": 0.62, "grad_norm": 10.072269439697266, "learning_rate": 1.2000000000000002e-06, "loss": 0.4106, "step": 248 }, { "Batch Mean": -0.645620584487915, "accuracy": 0.921875, "epoch": 0.62, "step": 248 }, { "epoch": 0.6225, "grad_norm": 9.188313484191895, "learning_rate": 1.192105263157895e-06, "loss": 0.2486, "step": 249 }, { "Batch Mean": -0.1468508243560791, "accuracy": 0.84375, "epoch": 0.6225, "step": 249 }, { "epoch": 0.625, "grad_norm": 7.615783214569092, "learning_rate": 1.1842105263157894e-06, "loss": 0.3676, "step": 250 }, { "Batch Mean": -0.36475658416748047, "accuracy": 0.890625, "epoch": 0.625, "step": 250 }, { "epoch": 0.6275, "grad_norm": 7.731856822967529, "learning_rate": 1.1763157894736842e-06, "loss": 0.3423, "step": 251 }, { "Batch Mean": 0.27223968505859375, "accuracy": 0.8203125, "epoch": 0.6275, "step": 251 }, { "epoch": 0.63, "grad_norm": 8.997645378112793, "learning_rate": 1.168421052631579e-06, "loss": 0.4098, "step": 252 }, { "Batch Mean": 0.9861600399017334, "accuracy": 0.859375, "epoch": 0.63, "step": 252 }, { "epoch": 0.6325, "grad_norm": 13.911986351013184, "learning_rate": 1.1605263157894737e-06, "loss": 0.3761, "step": 253 }, { "Batch Mean": 0.6206140518188477, "accuracy": 0.859375, "epoch": 0.6325, "step": 253 }, { "epoch": 0.635, "grad_norm": 9.769001007080078, "learning_rate": 1.1526315789473684e-06, "loss": 0.3697, "step": 254 }, { "Batch Mean": 0.66489177942276, "accuracy": 0.8125, "epoch": 0.635, "step": 254 }, { "epoch": 0.6375, "grad_norm": 9.977997779846191, "learning_rate": 1.1447368421052632e-06, "loss": 0.3734, "step": 255 }, { "Batch Mean": 0.510383129119873, "accuracy": 0.8515625, "epoch": 0.6375, "step": 255 }, { "epoch": 0.64, "grad_norm": 8.681389808654785, "learning_rate": 1.136842105263158e-06, "loss": 0.3359, "step": 256 }, { "Batch Mean": -0.05070686340332031, "accuracy": 0.765625, "epoch": 0.64, "step": 256 }, { "epoch": 0.6425, "grad_norm": 7.4210205078125, "learning_rate": 1.1289473684210527e-06, "loss": 0.4495, "step": 257 }, { "Batch Mean": -0.22071266174316406, "accuracy": 0.796875, "epoch": 0.6425, "step": 257 }, { "epoch": 0.645, "grad_norm": 7.830999374389648, "learning_rate": 1.1210526315789474e-06, "loss": 0.4321, "step": 258 }, { "Batch Mean": 0.02909064292907715, "accuracy": 0.8359375, "epoch": 0.645, "step": 258 }, { "epoch": 0.6475, "grad_norm": 6.488223552703857, "learning_rate": 1.1131578947368421e-06, "loss": 0.3785, "step": 259 }, { "Batch Mean": -0.4812309741973877, "accuracy": 0.8203125, "epoch": 0.6475, "step": 259 }, { "epoch": 0.65, "grad_norm": 8.951570510864258, "learning_rate": 1.1052631578947369e-06, "loss": 0.3965, "step": 260 }, { "Batch Mean": -0.6864792108535767, "accuracy": 0.8828125, "epoch": 0.65, "step": 260 }, { "epoch": 0.6525, "grad_norm": 9.753576278686523, "learning_rate": 1.0973684210526316e-06, "loss": 0.3241, "step": 261 }, { "Batch Mean": -0.5582137703895569, "accuracy": 0.859375, "epoch": 0.6525, "step": 261 }, { "epoch": 0.655, "grad_norm": 8.607160568237305, "learning_rate": 1.0894736842105264e-06, "loss": 0.3297, "step": 262 }, { "Batch Mean": -0.19955122470855713, "accuracy": 0.796875, "epoch": 0.655, "step": 262 }, { "epoch": 0.6575, "grad_norm": 7.166594505310059, "learning_rate": 1.0815789473684211e-06, "loss": 0.4026, "step": 263 }, { "Batch Mean": 0.01648712158203125, "accuracy": 0.859375, "epoch": 0.6575, "step": 263 }, { "epoch": 0.66, "grad_norm": 6.486114501953125, "learning_rate": 1.0736842105263159e-06, "loss": 0.3557, "step": 264 }, { "Batch Mean": 0.3369255065917969, "accuracy": 0.875, "epoch": 0.66, "step": 264 }, { "epoch": 0.6625, "grad_norm": 7.933577060699463, "learning_rate": 1.0657894736842106e-06, "loss": 0.302, "step": 265 }, { "Batch Mean": 0.37366318702697754, "accuracy": 0.921875, "epoch": 0.6625, "step": 265 }, { "epoch": 0.665, "grad_norm": 7.446761608123779, "learning_rate": 1.0578947368421052e-06, "loss": 0.237, "step": 266 }, { "Batch Mean": 0.4524000883102417, "accuracy": 0.859375, "epoch": 0.665, "step": 266 }, { "epoch": 0.6675, "grad_norm": 8.502205848693848, "learning_rate": 1.05e-06, "loss": 0.3285, "step": 267 }, { "Batch Mean": 0.6751773357391357, "accuracy": 0.8359375, "epoch": 0.6675, "step": 267 }, { "epoch": 0.67, "grad_norm": 10.050773620605469, "learning_rate": 1.0421052631578949e-06, "loss": 0.4006, "step": 268 }, { "Batch Mean": 0.27886199951171875, "accuracy": 0.8671875, "epoch": 0.67, "step": 268 }, { "epoch": 0.6725, "grad_norm": 7.271613597869873, "learning_rate": 1.0342105263157896e-06, "loss": 0.3408, "step": 269 }, { "Batch Mean": 0.09986686706542969, "accuracy": 0.78125, "epoch": 0.6725, "step": 269 }, { "epoch": 0.675, "grad_norm": 8.224504470825195, "learning_rate": 1.0263157894736843e-06, "loss": 0.4315, "step": 270 }, { "Batch Mean": -0.23583006858825684, "accuracy": 0.859375, "epoch": 0.675, "step": 270 }, { "epoch": 0.6775, "grad_norm": 7.686771392822266, "learning_rate": 1.0184210526315789e-06, "loss": 0.3471, "step": 271 }, { "Batch Mean": -0.2585182189941406, "accuracy": 0.8515625, "epoch": 0.6775, "step": 271 }, { "epoch": 0.68, "grad_norm": 6.525619983673096, "learning_rate": 1.0105263157894736e-06, "loss": 0.3153, "step": 272 }, { "Batch Mean": 0.01106107234954834, "accuracy": 0.796875, "epoch": 0.68, "step": 272 }, { "epoch": 0.6825, "grad_norm": 7.4220967292785645, "learning_rate": 1.0026315789473684e-06, "loss": 0.4097, "step": 273 }, { "Batch Mean": -0.17680811882019043, "accuracy": 0.890625, "epoch": 0.6825, "step": 273 }, { "epoch": 0.685, "grad_norm": 6.954795837402344, "learning_rate": 9.947368421052631e-07, "loss": 0.3296, "step": 274 }, { "Batch Mean": -0.10438370704650879, "accuracy": 0.796875, "epoch": 0.685, "step": 274 }, { "epoch": 0.6875, "grad_norm": 8.100950241088867, "learning_rate": 9.86842105263158e-07, "loss": 0.4119, "step": 275 }, { "Batch Mean": -0.2608468532562256, "accuracy": 0.84375, "epoch": 0.6875, "step": 275 }, { "epoch": 0.69, "grad_norm": 7.6091437339782715, "learning_rate": 9.789473684210526e-07, "loss": 0.3792, "step": 276 }, { "Batch Mean": 0.2017640471458435, "accuracy": 0.8515625, "epoch": 0.69, "step": 276 }, { "epoch": 0.6925, "grad_norm": 7.743137836456299, "learning_rate": 9.710526315789474e-07, "loss": 0.3543, "step": 277 }, { "Batch Mean": 0.03964269161224365, "accuracy": 0.8203125, "epoch": 0.6925, "step": 277 }, { "epoch": 0.695, "grad_norm": 7.436593055725098, "learning_rate": 9.63157894736842e-07, "loss": 0.3741, "step": 278 }, { "Batch Mean": 0.1998610496520996, "accuracy": 0.8359375, "epoch": 0.695, "step": 278 }, { "epoch": 0.6975, "grad_norm": 7.1681718826293945, "learning_rate": 9.552631578947368e-07, "loss": 0.3788, "step": 279 }, { "Batch Mean": -0.3235056400299072, "accuracy": 0.8125, "epoch": 0.6975, "step": 279 }, { "epoch": 0.7, "grad_norm": 10.077638626098633, "learning_rate": 9.473684210526316e-07, "loss": 0.4124, "step": 280 }, { "Batch Mean": -0.2581977844238281, "accuracy": 0.8984375, "epoch": 0.7, "step": 280 }, { "epoch": 0.7025, "grad_norm": 6.695285320281982, "learning_rate": 9.394736842105262e-07, "loss": 0.2565, "step": 281 }, { "Batch Mean": 0.007382392883300781, "accuracy": 0.828125, "epoch": 0.7025, "step": 281 }, { "epoch": 0.705, "grad_norm": 7.9390869140625, "learning_rate": 9.315789473684212e-07, "loss": 0.3788, "step": 282 }, { "Batch Mean": -0.2042536735534668, "accuracy": 0.7890625, "epoch": 0.705, "step": 282 }, { "epoch": 0.7075, "grad_norm": 8.94979190826416, "learning_rate": 9.236842105263158e-07, "loss": 0.4032, "step": 283 }, { "Batch Mean": -0.3767300844192505, "accuracy": 0.890625, "epoch": 0.7075, "step": 283 }, { "epoch": 0.71, "grad_norm": 7.715059757232666, "learning_rate": 9.157894736842106e-07, "loss": 0.323, "step": 284 }, { "Batch Mean": 0.3044853210449219, "accuracy": 0.78125, "epoch": 0.71, "step": 284 }, { "epoch": 0.7125, "grad_norm": 7.844649791717529, "learning_rate": 9.078947368421053e-07, "loss": 0.4025, "step": 285 }, { "Batch Mean": -0.04273754358291626, "accuracy": 0.8125, "epoch": 0.7125, "step": 285 }, { "epoch": 0.715, "grad_norm": 7.262689113616943, "learning_rate": 9e-07, "loss": 0.4113, "step": 286 }, { "Batch Mean": -0.2648310661315918, "accuracy": 0.84375, "epoch": 0.715, "step": 286 }, { "epoch": 0.7175, "grad_norm": 7.911703586578369, "learning_rate": 8.921052631578947e-07, "loss": 0.3353, "step": 287 }, { "Batch Mean": 0.1920623779296875, "accuracy": 0.890625, "epoch": 0.7175, "step": 287 }, { "epoch": 0.72, "grad_norm": 6.033590316772461, "learning_rate": 8.842105263157895e-07, "loss": 0.2858, "step": 288 }, { "Batch Mean": 0.02208709716796875, "accuracy": 0.8515625, "epoch": 0.72, "step": 288 }, { "epoch": 0.7225, "grad_norm": 6.255743980407715, "learning_rate": 8.763157894736843e-07, "loss": 0.3499, "step": 289 }, { "Batch Mean": 0.157212495803833, "accuracy": 0.7734375, "epoch": 0.7225, "step": 289 }, { "epoch": 0.725, "grad_norm": 7.73415994644165, "learning_rate": 8.68421052631579e-07, "loss": 0.4015, "step": 290 }, { "Batch Mean": 0.07737350463867188, "accuracy": 0.8515625, "epoch": 0.725, "step": 290 }, { "epoch": 0.7275, "grad_norm": 6.704524040222168, "learning_rate": 8.605263157894737e-07, "loss": 0.3017, "step": 291 }, { "Batch Mean": 0.17530250549316406, "accuracy": 0.8671875, "epoch": 0.7275, "step": 291 }, { "epoch": 0.73, "grad_norm": 6.311378479003906, "learning_rate": 8.526315789473684e-07, "loss": 0.2901, "step": 292 }, { "Batch Mean": 0.10090017318725586, "accuracy": 0.8359375, "epoch": 0.73, "step": 292 }, { "epoch": 0.7325, "grad_norm": 7.105597496032715, "learning_rate": 8.447368421052632e-07, "loss": 0.4102, "step": 293 }, { "Batch Mean": 0.4577817916870117, "accuracy": 0.8671875, "epoch": 0.7325, "step": 293 }, { "epoch": 0.735, "grad_norm": 7.449181079864502, "learning_rate": 8.368421052631578e-07, "loss": 0.3147, "step": 294 }, { "Batch Mean": -0.21117448806762695, "accuracy": 0.8671875, "epoch": 0.735, "step": 294 }, { "epoch": 0.7375, "grad_norm": 7.147716522216797, "learning_rate": 8.289473684210528e-07, "loss": 0.3322, "step": 295 }, { "Batch Mean": -0.09427833557128906, "accuracy": 0.8515625, "epoch": 0.7375, "step": 295 }, { "epoch": 0.74, "grad_norm": 6.367930889129639, "learning_rate": 8.210526315789474e-07, "loss": 0.3257, "step": 296 }, { "Batch Mean": 0.09167495369911194, "accuracy": 0.8203125, "epoch": 0.74, "step": 296 }, { "epoch": 0.7425, "grad_norm": 7.110311031341553, "learning_rate": 8.131578947368422e-07, "loss": 0.3585, "step": 297 }, { "Batch Mean": -0.05771516263484955, "accuracy": 0.890625, "epoch": 0.7425, "step": 297 }, { "epoch": 0.745, "grad_norm": 6.1952314376831055, "learning_rate": 8.052631578947369e-07, "loss": 0.2568, "step": 298 }, { "Batch Mean": 0.2480783462524414, "accuracy": 0.875, "epoch": 0.745, "step": 298 }, { "epoch": 0.7475, "grad_norm": 7.205021381378174, "learning_rate": 7.973684210526315e-07, "loss": 0.3003, "step": 299 }, { "Batch Mean": -0.08726701885461807, "accuracy": 0.8671875, "epoch": 0.7475, "step": 299 }, { "epoch": 0.75, "grad_norm": 7.356017589569092, "learning_rate": 7.894736842105263e-07, "loss": 0.3019, "step": 300 }, { "Batch Mean": -0.008150577545166016, "accuracy": 0.8046875, "epoch": 0.75, "step": 300 }, { "epoch": 0.7525, "grad_norm": 7.7067437171936035, "learning_rate": 7.81578947368421e-07, "loss": 0.4048, "step": 301 }, { "Batch Mean": -0.06519889831542969, "accuracy": 0.859375, "epoch": 0.7525, "step": 301 }, { "epoch": 0.755, "grad_norm": 6.630680561065674, "learning_rate": 7.736842105263159e-07, "loss": 0.2795, "step": 302 }, { "Batch Mean": -0.16483700275421143, "accuracy": 0.8515625, "epoch": 0.755, "step": 302 }, { "epoch": 0.7575, "grad_norm": 7.234076976776123, "learning_rate": 7.657894736842106e-07, "loss": 0.3308, "step": 303 }, { "Batch Mean": -0.21460503339767456, "accuracy": 0.890625, "epoch": 0.7575, "step": 303 }, { "epoch": 0.76, "grad_norm": 6.38484001159668, "learning_rate": 7.578947368421053e-07, "loss": 0.2878, "step": 304 }, { "Batch Mean": 0.5146341323852539, "accuracy": 0.8984375, "epoch": 0.76, "step": 304 }, { "epoch": 0.7625, "grad_norm": 9.872756004333496, "learning_rate": 7.5e-07, "loss": 0.2629, "step": 305 }, { "Batch Mean": 0.4128565788269043, "accuracy": 0.8125, "epoch": 0.7625, "step": 305 }, { "epoch": 0.765, "grad_norm": 10.836166381835938, "learning_rate": 7.421052631578948e-07, "loss": 0.3865, "step": 306 }, { "Batch Mean": -0.3605806827545166, "accuracy": 0.8671875, "epoch": 0.765, "step": 306 }, { "epoch": 0.7675, "grad_norm": 8.887052536010742, "learning_rate": 7.342105263157895e-07, "loss": 0.3567, "step": 307 }, { "Batch Mean": 0.11398649215698242, "accuracy": 0.8046875, "epoch": 0.7675, "step": 307 }, { "epoch": 0.77, "grad_norm": 9.620704650878906, "learning_rate": 7.263157894736843e-07, "loss": 0.396, "step": 308 }, { "Batch Mean": -0.47295188903808594, "accuracy": 0.8515625, "epoch": 0.77, "step": 308 }, { "epoch": 0.7725, "grad_norm": 9.428754806518555, "learning_rate": 7.18421052631579e-07, "loss": 0.3433, "step": 309 }, { "Batch Mean": -0.10897934436798096, "accuracy": 0.8125, "epoch": 0.7725, "step": 309 }, { "epoch": 0.775, "grad_norm": 7.966822147369385, "learning_rate": 7.105263157894736e-07, "loss": 0.3843, "step": 310 }, { "Batch Mean": -0.23631668090820312, "accuracy": 0.8359375, "epoch": 0.775, "step": 310 }, { "epoch": 0.7775, "grad_norm": 8.651246070861816, "learning_rate": 7.026315789473685e-07, "loss": 0.3825, "step": 311 }, { "Batch Mean": 0.07775044441223145, "accuracy": 0.8359375, "epoch": 0.7775, "step": 311 }, { "epoch": 0.78, "grad_norm": 6.632461071014404, "learning_rate": 6.947368421052631e-07, "loss": 0.3705, "step": 312 }, { "Batch Mean": -0.5292290449142456, "accuracy": 0.828125, "epoch": 0.78, "step": 312 }, { "epoch": 0.7825, "grad_norm": 9.416595458984375, "learning_rate": 6.868421052631579e-07, "loss": 0.3134, "step": 313 }, { "Batch Mean": 0.02381765842437744, "accuracy": 0.8359375, "epoch": 0.7825, "step": 313 }, { "epoch": 0.785, "grad_norm": 6.94856595993042, "learning_rate": 6.789473684210527e-07, "loss": 0.3315, "step": 314 }, { "Batch Mean": -0.4679083824157715, "accuracy": 0.8359375, "epoch": 0.785, "step": 314 }, { "epoch": 0.7875, "grad_norm": 9.398565292358398, "learning_rate": 6.710526315789474e-07, "loss": 0.3963, "step": 315 }, { "Batch Mean": 0.055476367473602295, "accuracy": 0.8515625, "epoch": 0.7875, "step": 315 }, { "epoch": 0.79, "grad_norm": 6.928508758544922, "learning_rate": 6.631578947368421e-07, "loss": 0.3243, "step": 316 }, { "Batch Mean": 0.19721317291259766, "accuracy": 0.859375, "epoch": 0.79, "step": 316 }, { "epoch": 0.7925, "grad_norm": 6.585093975067139, "learning_rate": 6.552631578947369e-07, "loss": 0.353, "step": 317 }, { "Batch Mean": 0.402996301651001, "accuracy": 0.8671875, "epoch": 0.7925, "step": 317 }, { "epoch": 0.795, "grad_norm": 8.292461395263672, "learning_rate": 6.473684210526316e-07, "loss": 0.343, "step": 318 }, { "Batch Mean": 0.020545482635498047, "accuracy": 0.890625, "epoch": 0.795, "step": 318 }, { "epoch": 0.7975, "grad_norm": 5.65919303894043, "learning_rate": 6.394736842105264e-07, "loss": 0.3015, "step": 319 }, { "Batch Mean": 0.06431841850280762, "accuracy": 0.8046875, "epoch": 0.7975, "step": 319 }, { "epoch": 0.8, "grad_norm": 6.297842979431152, "learning_rate": 6.31578947368421e-07, "loss": 0.3656, "step": 320 }, { "Batch Mean": 0.5624209642410278, "accuracy": 0.796875, "epoch": 0.8, "step": 320 }, { "epoch": 0.8025, "grad_norm": 9.877172470092773, "learning_rate": 6.236842105263158e-07, "loss": 0.4071, "step": 321 }, { "Batch Mean": 0.49359941482543945, "accuracy": 0.84375, "epoch": 0.8025, "step": 321 }, { "epoch": 0.805, "grad_norm": 8.474444389343262, "learning_rate": 6.157894736842106e-07, "loss": 0.3463, "step": 322 }, { "Batch Mean": -0.38565921783447266, "accuracy": 0.8515625, "epoch": 0.805, "step": 322 }, { "epoch": 0.8075, "grad_norm": 8.97575569152832, "learning_rate": 6.078947368421052e-07, "loss": 0.4083, "step": 323 }, { "Batch Mean": -0.18721318244934082, "accuracy": 0.90625, "epoch": 0.8075, "step": 323 }, { "epoch": 0.81, "grad_norm": 7.4215497970581055, "learning_rate": 6.000000000000001e-07, "loss": 0.3315, "step": 324 }, { "Batch Mean": -0.18316686153411865, "accuracy": 0.875, "epoch": 0.81, "step": 324 }, { "epoch": 0.8125, "grad_norm": 6.686371326446533, "learning_rate": 5.921052631578947e-07, "loss": 0.3194, "step": 325 }, { "Batch Mean": -0.42713236808776855, "accuracy": 0.8203125, "epoch": 0.8125, "step": 325 }, { "epoch": 0.815, "grad_norm": 8.342667579650879, "learning_rate": 5.842105263157895e-07, "loss": 0.3528, "step": 326 }, { "Batch Mean": -0.18327808380126953, "accuracy": 0.8359375, "epoch": 0.815, "step": 326 }, { "epoch": 0.8175, "grad_norm": 9.010228157043457, "learning_rate": 5.763157894736842e-07, "loss": 0.4052, "step": 327 }, { "Batch Mean": -0.2028331756591797, "accuracy": 0.8671875, "epoch": 0.8175, "step": 327 }, { "epoch": 0.82, "grad_norm": 6.920485496520996, "learning_rate": 5.68421052631579e-07, "loss": 0.3506, "step": 328 }, { "Batch Mean": -0.3213520050048828, "accuracy": 0.8125, "epoch": 0.82, "step": 328 }, { "epoch": 0.8225, "grad_norm": 7.344422817230225, "learning_rate": 5.605263157894737e-07, "loss": 0.3336, "step": 329 }, { "Batch Mean": -0.12929964065551758, "accuracy": 0.78125, "epoch": 0.8225, "step": 329 }, { "epoch": 0.825, "grad_norm": 8.22891902923584, "learning_rate": 5.526315789473684e-07, "loss": 0.434, "step": 330 }, { "Batch Mean": -0.12277650833129883, "accuracy": 0.84375, "epoch": 0.825, "step": 330 }, { "epoch": 0.8275, "grad_norm": 7.8546319007873535, "learning_rate": 5.447368421052632e-07, "loss": 0.3436, "step": 331 }, { "Batch Mean": 0.13565659523010254, "accuracy": 0.8671875, "epoch": 0.8275, "step": 331 }, { "epoch": 0.83, "grad_norm": 6.105228900909424, "learning_rate": 5.368421052631579e-07, "loss": 0.2767, "step": 332 }, { "Batch Mean": -0.030557870864868164, "accuracy": 0.7890625, "epoch": 0.83, "step": 332 }, { "epoch": 0.8325, "grad_norm": 7.214451789855957, "learning_rate": 5.289473684210526e-07, "loss": 0.3858, "step": 333 }, { "Batch Mean": 0.2767346501350403, "accuracy": 0.84375, "epoch": 0.8325, "step": 333 }, { "epoch": 0.835, "grad_norm": 7.583705425262451, "learning_rate": 5.210526315789474e-07, "loss": 0.3249, "step": 334 }, { "Batch Mean": 0.6653814315795898, "accuracy": 0.859375, "epoch": 0.835, "step": 334 }, { "epoch": 0.8375, "grad_norm": 9.319592475891113, "learning_rate": 5.131578947368422e-07, "loss": 0.3145, "step": 335 }, { "Batch Mean": 0.39778077602386475, "accuracy": 0.8984375, "epoch": 0.8375, "step": 335 }, { "epoch": 0.84, "grad_norm": 7.308060169219971, "learning_rate": 5.052631578947368e-07, "loss": 0.2994, "step": 336 }, { "Batch Mean": -0.01158761978149414, "accuracy": 0.8828125, "epoch": 0.84, "step": 336 }, { "epoch": 0.8425, "grad_norm": 7.057836532592773, "learning_rate": 4.973684210526316e-07, "loss": 0.2808, "step": 337 }, { "Batch Mean": 0.03655052185058594, "accuracy": 0.8515625, "epoch": 0.8425, "step": 337 }, { "epoch": 0.845, "grad_norm": 6.56977653503418, "learning_rate": 4.894736842105263e-07, "loss": 0.3766, "step": 338 }, { "Batch Mean": 0.43787431716918945, "accuracy": 0.8046875, "epoch": 0.845, "step": 338 }, { "epoch": 0.8475, "grad_norm": 8.1902494430542, "learning_rate": 4.81578947368421e-07, "loss": 0.3563, "step": 339 }, { "Batch Mean": 0.15516972541809082, "accuracy": 0.8515625, "epoch": 0.8475, "step": 339 }, { "epoch": 0.85, "grad_norm": 7.303832054138184, "learning_rate": 4.736842105263158e-07, "loss": 0.3327, "step": 340 }, { "Batch Mean": -0.029469013214111328, "accuracy": 0.8515625, "epoch": 0.85, "step": 340 }, { "epoch": 0.8525, "grad_norm": 6.227102756500244, "learning_rate": 4.657894736842106e-07, "loss": 0.3203, "step": 341 }, { "Batch Mean": -0.1123504638671875, "accuracy": 0.8671875, "epoch": 0.8525, "step": 341 }, { "epoch": 0.855, "grad_norm": 6.519909858703613, "learning_rate": 4.578947368421053e-07, "loss": 0.269, "step": 342 }, { "Batch Mean": -0.3899195194244385, "accuracy": 0.8828125, "epoch": 0.855, "step": 342 }, { "epoch": 0.8575, "grad_norm": 7.967192649841309, "learning_rate": 4.5e-07, "loss": 0.2578, "step": 343 }, { "Batch Mean": -0.024474501609802246, "accuracy": 0.8203125, "epoch": 0.8575, "step": 343 }, { "epoch": 0.86, "grad_norm": 7.005013465881348, "learning_rate": 4.421052631578947e-07, "loss": 0.33, "step": 344 }, { "Batch Mean": 0.4343888759613037, "accuracy": 0.7890625, "epoch": 0.86, "step": 344 }, { "epoch": 0.8625, "grad_norm": 10.061205863952637, "learning_rate": 4.342105263157895e-07, "loss": 0.4538, "step": 345 }, { "Batch Mean": -0.2816317081451416, "accuracy": 0.8671875, "epoch": 0.8625, "step": 345 }, { "epoch": 0.865, "grad_norm": 7.395506381988525, "learning_rate": 4.263157894736842e-07, "loss": 0.3462, "step": 346 }, { "Batch Mean": -0.2941579818725586, "accuracy": 0.8671875, "epoch": 0.865, "step": 346 }, { "epoch": 0.8675, "grad_norm": 8.371923446655273, "learning_rate": 4.184210526315789e-07, "loss": 0.3269, "step": 347 }, { "Batch Mean": -0.673569917678833, "accuracy": 0.84375, "epoch": 0.8675, "step": 347 }, { "epoch": 0.87, "grad_norm": 11.035270690917969, "learning_rate": 4.105263157894737e-07, "loss": 0.3364, "step": 348 }, { "Batch Mean": -0.15933895111083984, "accuracy": 0.8046875, "epoch": 0.87, "step": 348 }, { "epoch": 0.8725, "grad_norm": 8.731343269348145, "learning_rate": 4.0263157894736845e-07, "loss": 0.3801, "step": 349 }, { "Batch Mean": 0.19048595428466797, "accuracy": 0.7890625, "epoch": 0.8725, "step": 349 }, { "epoch": 0.875, "grad_norm": 9.1675386428833, "learning_rate": 3.9473684210526315e-07, "loss": 0.4322, "step": 350 }, { "Batch Mean": 0.27903127670288086, "accuracy": 0.828125, "epoch": 0.875, "step": 350 }, { "epoch": 0.8775, "grad_norm": 12.453442573547363, "learning_rate": 3.8684210526315794e-07, "loss": 0.3991, "step": 351 }, { "Batch Mean": 0.11368811130523682, "accuracy": 0.859375, "epoch": 0.8775, "step": 351 }, { "epoch": 0.88, "grad_norm": 7.374312877655029, "learning_rate": 3.7894736842105264e-07, "loss": 0.342, "step": 352 }, { "Batch Mean": -0.12367916107177734, "accuracy": 0.796875, "epoch": 0.88, "step": 352 }, { "epoch": 0.8825, "grad_norm": 8.004443168640137, "learning_rate": 3.710526315789474e-07, "loss": 0.3866, "step": 353 }, { "Batch Mean": -0.08196258544921875, "accuracy": 0.828125, "epoch": 0.8825, "step": 353 }, { "epoch": 0.885, "grad_norm": 8.011659622192383, "learning_rate": 3.6315789473684213e-07, "loss": 0.3524, "step": 354 }, { "Batch Mean": 0.3378896713256836, "accuracy": 0.828125, "epoch": 0.885, "step": 354 }, { "epoch": 0.8875, "grad_norm": 8.120012283325195, "learning_rate": 3.552631578947368e-07, "loss": 0.3749, "step": 355 }, { "Batch Mean": -0.20023924112319946, "accuracy": 0.8359375, "epoch": 0.8875, "step": 355 }, { "epoch": 0.89, "grad_norm": 8.692459106445312, "learning_rate": 3.4736842105263157e-07, "loss": 0.4222, "step": 356 }, { "Batch Mean": 0.1543264389038086, "accuracy": 0.8828125, "epoch": 0.89, "step": 356 }, { "epoch": 0.8925, "grad_norm": 6.623098850250244, "learning_rate": 3.3947368421052636e-07, "loss": 0.2718, "step": 357 }, { "Batch Mean": 0.2232210636138916, "accuracy": 0.8515625, "epoch": 0.8925, "step": 357 }, { "epoch": 0.895, "grad_norm": 7.03958797454834, "learning_rate": 3.3157894736842106e-07, "loss": 0.3586, "step": 358 }, { "Batch Mean": 0.28446149826049805, "accuracy": 0.8203125, "epoch": 0.895, "step": 358 }, { "epoch": 0.8975, "grad_norm": 8.019937515258789, "learning_rate": 3.236842105263158e-07, "loss": 0.3331, "step": 359 }, { "Batch Mean": 0.007267653942108154, "accuracy": 0.8984375, "epoch": 0.8975, "step": 359 }, { "epoch": 0.9, "grad_norm": 5.776856899261475, "learning_rate": 3.157894736842105e-07, "loss": 0.2273, "step": 360 }, { "Batch Mean": 0.020554065704345703, "accuracy": 0.8359375, "epoch": 0.9, "step": 360 }, { "epoch": 0.9025, "grad_norm": 8.30795669555664, "learning_rate": 3.078947368421053e-07, "loss": 0.4318, "step": 361 }, { "Batch Mean": 0.015145301818847656, "accuracy": 0.84375, "epoch": 0.9025, "step": 361 }, { "epoch": 0.905, "grad_norm": 6.911500930786133, "learning_rate": 3.0000000000000004e-07, "loss": 0.3564, "step": 362 }, { "Batch Mean": -0.37508678436279297, "accuracy": 0.859375, "epoch": 0.905, "step": 362 }, { "epoch": 0.9075, "grad_norm": 7.265951633453369, "learning_rate": 2.9210526315789473e-07, "loss": 0.3149, "step": 363 }, { "Batch Mean": 0.17000269889831543, "accuracy": 0.875, "epoch": 0.9075, "step": 363 }, { "epoch": 0.91, "grad_norm": 6.401723861694336, "learning_rate": 2.842105263157895e-07, "loss": 0.3244, "step": 364 }, { "Batch Mean": -0.11107516288757324, "accuracy": 0.859375, "epoch": 0.91, "step": 364 }, { "epoch": 0.9125, "grad_norm": 7.105179786682129, "learning_rate": 2.763157894736842e-07, "loss": 0.3572, "step": 365 }, { "Batch Mean": -0.5066146850585938, "accuracy": 0.8203125, "epoch": 0.9125, "step": 365 }, { "epoch": 0.915, "grad_norm": 8.87719440460205, "learning_rate": 2.6842105263157897e-07, "loss": 0.342, "step": 366 }, { "Batch Mean": 0.20459461212158203, "accuracy": 0.84375, "epoch": 0.915, "step": 366 }, { "epoch": 0.9175, "grad_norm": 7.022450923919678, "learning_rate": 2.605263157894737e-07, "loss": 0.3367, "step": 367 }, { "Batch Mean": -0.21013212203979492, "accuracy": 0.8828125, "epoch": 0.9175, "step": 367 }, { "epoch": 0.92, "grad_norm": 8.056741714477539, "learning_rate": 2.526315789473684e-07, "loss": 0.3546, "step": 368 }, { "Batch Mean": 0.02896130084991455, "accuracy": 0.8046875, "epoch": 0.92, "step": 368 }, { "epoch": 0.9225, "grad_norm": 7.063536643981934, "learning_rate": 2.4473684210526315e-07, "loss": 0.3883, "step": 369 }, { "Batch Mean": -0.20400434732437134, "accuracy": 0.828125, "epoch": 0.9225, "step": 369 }, { "epoch": 0.925, "grad_norm": 6.210810661315918, "learning_rate": 2.368421052631579e-07, "loss": 0.3502, "step": 370 }, { "Batch Mean": -0.37163734436035156, "accuracy": 0.859375, "epoch": 0.925, "step": 370 }, { "epoch": 0.9275, "grad_norm": 8.070487022399902, "learning_rate": 2.2894736842105264e-07, "loss": 0.3078, "step": 371 }, { "Batch Mean": 0.01798880100250244, "accuracy": 0.8515625, "epoch": 0.9275, "step": 371 }, { "epoch": 0.93, "grad_norm": 6.202131748199463, "learning_rate": 2.2105263157894736e-07, "loss": 0.3478, "step": 372 }, { "Batch Mean": 0.5030186176300049, "accuracy": 0.8671875, "epoch": 0.93, "step": 372 }, { "epoch": 0.9325, "grad_norm": 8.151087760925293, "learning_rate": 2.131578947368421e-07, "loss": 0.3084, "step": 373 }, { "Batch Mean": 0.08543097972869873, "accuracy": 0.90625, "epoch": 0.9325, "step": 373 }, { "epoch": 0.935, "grad_norm": 6.240350246429443, "learning_rate": 2.0526315789473685e-07, "loss": 0.2451, "step": 374 }, { "Batch Mean": -0.13293242454528809, "accuracy": 0.8203125, "epoch": 0.935, "step": 374 }, { "epoch": 0.9375, "grad_norm": 6.996582508087158, "learning_rate": 1.9736842105263157e-07, "loss": 0.3571, "step": 375 }, { "Batch Mean": -0.08049726486206055, "accuracy": 0.8828125, "epoch": 0.9375, "step": 375 }, { "epoch": 0.94, "grad_norm": 7.0056915283203125, "learning_rate": 1.8947368421052632e-07, "loss": 0.3221, "step": 376 }, { "Batch Mean": 0.18318387866020203, "accuracy": 0.7890625, "epoch": 0.94, "step": 376 }, { "epoch": 0.9425, "grad_norm": 8.635159492492676, "learning_rate": 1.8157894736842106e-07, "loss": 0.4011, "step": 377 }, { "Batch Mean": -0.11429977416992188, "accuracy": 0.8671875, "epoch": 0.9425, "step": 377 }, { "epoch": 0.945, "grad_norm": 6.520611763000488, "learning_rate": 1.7368421052631578e-07, "loss": 0.3139, "step": 378 }, { "Batch Mean": -0.15759539604187012, "accuracy": 0.8359375, "epoch": 0.945, "step": 378 }, { "epoch": 0.9475, "grad_norm": 7.6163411140441895, "learning_rate": 1.6578947368421053e-07, "loss": 0.3751, "step": 379 }, { "Batch Mean": 0.2796630859375, "accuracy": 0.8203125, "epoch": 0.9475, "step": 379 }, { "epoch": 0.95, "grad_norm": 7.514281272888184, "learning_rate": 1.5789473684210525e-07, "loss": 0.3402, "step": 380 }, { "Batch Mean": 0.12362116575241089, "accuracy": 0.8359375, "epoch": 0.95, "step": 380 }, { "epoch": 0.9525, "grad_norm": 6.752447128295898, "learning_rate": 1.5000000000000002e-07, "loss": 0.3641, "step": 381 }, { "Batch Mean": -0.2555246353149414, "accuracy": 0.8203125, "epoch": 0.9525, "step": 381 }, { "epoch": 0.955, "grad_norm": 8.897603988647461, "learning_rate": 1.4210526315789474e-07, "loss": 0.4053, "step": 382 }, { "Batch Mean": 0.1314713954925537, "accuracy": 0.859375, "epoch": 0.955, "step": 382 }, { "epoch": 0.9575, "grad_norm": 8.170565605163574, "learning_rate": 1.3421052631578948e-07, "loss": 0.4072, "step": 383 }, { "Batch Mean": 0.19118213653564453, "accuracy": 0.8671875, "epoch": 0.9575, "step": 383 }, { "epoch": 0.96, "grad_norm": 6.606847286224365, "learning_rate": 1.263157894736842e-07, "loss": 0.3082, "step": 384 }, { "Batch Mean": 0.015971660614013672, "accuracy": 0.859375, "epoch": 0.96, "step": 384 }, { "epoch": 0.9625, "grad_norm": 6.187947750091553, "learning_rate": 1.1842105263157895e-07, "loss": 0.3072, "step": 385 }, { "Batch Mean": 0.20549535751342773, "accuracy": 0.8671875, "epoch": 0.9625, "step": 385 }, { "epoch": 0.965, "grad_norm": 5.9379048347473145, "learning_rate": 1.1052631578947368e-07, "loss": 0.3094, "step": 386 }, { "Batch Mean": -0.026795893907546997, "accuracy": 0.8515625, "epoch": 0.965, "step": 386 }, { "epoch": 0.9675, "grad_norm": 6.812534332275391, "learning_rate": 1.0263157894736843e-07, "loss": 0.3254, "step": 387 }, { "Batch Mean": 0.18688905239105225, "accuracy": 0.8203125, "epoch": 0.9675, "step": 387 }, { "epoch": 0.97, "grad_norm": 6.931975841522217, "learning_rate": 9.473684210526316e-08, "loss": 0.3781, "step": 388 }, { "Batch Mean": 0.06776046752929688, "accuracy": 0.8984375, "epoch": 0.97, "step": 388 }, { "epoch": 0.9725, "grad_norm": 5.829480171203613, "learning_rate": 8.684210526315789e-08, "loss": 0.3168, "step": 389 }, { "Batch Mean": 0.14634227752685547, "accuracy": 0.8671875, "epoch": 0.9725, "step": 389 }, { "epoch": 0.975, "grad_norm": 6.641167640686035, "learning_rate": 7.894736842105262e-08, "loss": 0.3271, "step": 390 }, { "Batch Mean": 0.16483211517333984, "accuracy": 0.859375, "epoch": 0.975, "step": 390 }, { "epoch": 0.9775, "grad_norm": 6.973351001739502, "learning_rate": 7.105263157894737e-08, "loss": 0.3307, "step": 391 }, { "Batch Mean": 0.07878565788269043, "accuracy": 0.890625, "epoch": 0.9775, "step": 391 }, { "epoch": 0.98, "grad_norm": 6.42501974105835, "learning_rate": 6.31578947368421e-08, "loss": 0.3352, "step": 392 }, { "Batch Mean": -0.26625585556030273, "accuracy": 0.890625, "epoch": 0.98, "step": 392 }, { "epoch": 0.9825, "grad_norm": 6.366028308868408, "learning_rate": 5.526315789473684e-08, "loss": 0.2924, "step": 393 }, { "Batch Mean": -0.21169519424438477, "accuracy": 0.8984375, "epoch": 0.9825, "step": 393 }, { "epoch": 0.985, "grad_norm": 6.536776542663574, "learning_rate": 4.736842105263158e-08, "loss": 0.2794, "step": 394 }, { "Batch Mean": 0.17948570847511292, "accuracy": 0.859375, "epoch": 0.985, "step": 394 }, { "epoch": 0.9875, "grad_norm": 6.673679351806641, "learning_rate": 3.947368421052631e-08, "loss": 0.304, "step": 395 }, { "Batch Mean": 0.021090269088745117, "accuracy": 0.8671875, "epoch": 0.9875, "step": 395 }, { "epoch": 0.99, "grad_norm": 6.656391143798828, "learning_rate": 3.157894736842105e-08, "loss": 0.2995, "step": 396 }, { "Batch Mean": -0.2189890742301941, "accuracy": 0.859375, "epoch": 0.99, "step": 396 }, { "epoch": 0.9925, "grad_norm": 8.504425048828125, "learning_rate": 2.368421052631579e-08, "loss": 0.3431, "step": 397 }, { "Batch Mean": 0.21610061824321747, "accuracy": 0.8828125, "epoch": 0.9925, "step": 397 }, { "epoch": 0.995, "grad_norm": 6.25787353515625, "learning_rate": 1.5789473684210525e-08, "loss": 0.3001, "step": 398 }, { "Batch Mean": -0.014714241027832031, "accuracy": 0.8203125, "epoch": 0.995, "step": 398 }, { "epoch": 0.9975, "grad_norm": 6.580000877380371, "learning_rate": 7.894736842105263e-09, "loss": 0.3591, "step": 399 }, { "Batch Mean": -0.2728743553161621, "accuracy": 0.8203125, "epoch": 0.9975, "step": 399 }, { "epoch": 1.0, "grad_norm": 7.813142776489258, "learning_rate": 0.0, "loss": 0.4014, "step": 400 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }