diff --git "a/checkpoint-400/trainer_state.json" "b/checkpoint-400/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-400/trainer_state.json" @@ -0,0 +1,5233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "Batch Mean": 0.7002408504486084, + "accuracy": 0.53125, + "epoch": 0, + "step": 0 + }, + { + "epoch": 0.0025, + "grad_norm": 4.865061283111572, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.6999, + "step": 1 + }, + { + "Batch Mean": 0.7006568908691406, + "accuracy": 0.5078125, + "epoch": 0.0025, + "step": 1 + }, + { + "epoch": 0.005, + "grad_norm": 5.116915702819824, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.706, + "step": 2 + }, + { + "Batch Mean": 0.7036838531494141, + "accuracy": 0.5625, + "epoch": 0.005, + "step": 2 + }, + { + "epoch": 0.0075, + "grad_norm": 4.952433109283447, + "learning_rate": 4.5e-07, + "loss": 0.6957, + "step": 3 + }, + { + "Batch Mean": 0.7087974548339844, + "accuracy": 0.5546875, + "epoch": 0.0075, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 5.185088157653809, + "learning_rate": 6.000000000000001e-07, + "loss": 0.6974, + "step": 4 + }, + { + "Batch Mean": 0.7068939208984375, + "accuracy": 0.5234375, + "epoch": 0.01, + "step": 4 + }, + { + "epoch": 0.0125, + "grad_norm": 5.04327392578125, + "learning_rate": 7.5e-07, + "loss": 0.7047, + "step": 5 + }, + { + "Batch Mean": 0.6315479278564453, + "accuracy": 0.5234375, + "epoch": 0.0125, + "step": 5 + }, + { + "epoch": 0.015, + "grad_norm": 4.728168487548828, + "learning_rate": 9e-07, + "loss": 0.6932, + "step": 6 + }, + { + "Batch Mean": 0.5291571617126465, + "accuracy": 0.5234375, + "epoch": 0.015, + "step": 6 + }, + { + "epoch": 0.0175, + "grad_norm": 4.0146050453186035, + "learning_rate": 1.05e-06, + "loss": 0.691, + "step": 7 + }, + { + "Batch Mean": 0.42229437828063965, + "accuracy": 0.5546875, + "epoch": 0.0175, + "step": 7 + }, + { + "epoch": 0.02, + "grad_norm": 3.488922119140625, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.6947, + "step": 8 + }, + { + "Batch Mean": 0.07428494095802307, + "accuracy": 0.515625, + "epoch": 0.02, + "step": 8 + }, + { + "epoch": 0.0225, + "grad_norm": 2.5626139640808105, + "learning_rate": 1.35e-06, + "loss": 0.6965, + "step": 9 + }, + { + "Batch Mean": -0.1413576304912567, + "accuracy": 0.5859375, + "epoch": 0.0225, + "step": 9 + }, + { + "epoch": 0.025, + "grad_norm": 2.776390790939331, + "learning_rate": 1.5e-06, + "loss": 0.6784, + "step": 10 + }, + { + "Batch Mean": -0.2741769552230835, + "accuracy": 0.5234375, + "epoch": 0.025, + "step": 10 + }, + { + "epoch": 0.0275, + "grad_norm": 3.4468233585357666, + "learning_rate": 1.65e-06, + "loss": 0.6844, + "step": 11 + }, + { + "Batch Mean": -0.45577430725097656, + "accuracy": 0.5859375, + "epoch": 0.0275, + "step": 11 + }, + { + "epoch": 0.03, + "grad_norm": 4.281182765960693, + "learning_rate": 1.8e-06, + "loss": 0.6753, + "step": 12 + }, + { + "Batch Mean": -1.0260009765625, + "accuracy": 0.65625, + "epoch": 0.03, + "step": 12 + }, + { + "epoch": 0.0325, + "grad_norm": 6.948849201202393, + "learning_rate": 1.95e-06, + "loss": 0.6734, + "step": 13 + }, + { + "Batch Mean": -1.1147994995117188, + "accuracy": 0.6171875, + "epoch": 0.0325, + "step": 13 + }, + { + "epoch": 0.035, + "grad_norm": 7.589419841766357, + "learning_rate": 2.1e-06, + "loss": 0.677, + "step": 14 + }, + { + "Batch Mean": -1.0315093994140625, + "accuracy": 0.65625, + "epoch": 0.035, + "step": 14 + }, + { + "epoch": 0.0375, + "grad_norm": 7.2146382331848145, + "learning_rate": 2.25e-06, + "loss": 0.677, + "step": 15 + }, + { + "Batch Mean": -0.8089427947998047, + "accuracy": 0.65625, + "epoch": 0.0375, + "step": 15 + }, + { + "epoch": 0.04, + "grad_norm": 6.308068752288818, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.6503, + "step": 16 + }, + { + "Batch Mean": -0.3669371008872986, + "accuracy": 0.671875, + "epoch": 0.04, + "step": 16 + }, + { + "epoch": 0.0425, + "grad_norm": 4.996833324432373, + "learning_rate": 2.55e-06, + "loss": 0.617, + "step": 17 + }, + { + "Batch Mean": 0.2619136571884155, + "accuracy": 0.6796875, + "epoch": 0.0425, + "step": 17 + }, + { + "epoch": 0.045, + "grad_norm": 4.9718918800354, + "learning_rate": 2.7e-06, + "loss": 0.5981, + "step": 18 + }, + { + "Batch Mean": 1.0361242294311523, + "accuracy": 0.625, + "epoch": 0.045, + "step": 18 + }, + { + "epoch": 0.0475, + "grad_norm": 9.635746955871582, + "learning_rate": 2.85e-06, + "loss": 0.6236, + "step": 19 + }, + { + "Batch Mean": 1.5605192184448242, + "accuracy": 0.65625, + "epoch": 0.0475, + "step": 19 + }, + { + "epoch": 0.05, + "grad_norm": 14.90079116821289, + "learning_rate": 3e-06, + "loss": 0.6511, + "step": 20 + }, + { + "Batch Mean": 1.7601673603057861, + "accuracy": 0.6796875, + "epoch": 0.05, + "step": 20 + }, + { + "epoch": 0.0525, + "grad_norm": 16.801076889038086, + "learning_rate": 2.992105263157895e-06, + "loss": 0.6712, + "step": 21 + }, + { + "Batch Mean": 1.1987818479537964, + "accuracy": 0.7109375, + "epoch": 0.0525, + "step": 21 + }, + { + "epoch": 0.055, + "grad_norm": 11.280301094055176, + "learning_rate": 2.9842105263157896e-06, + "loss": 0.6033, + "step": 22 + }, + { + "Batch Mean": 0.5978413820266724, + "accuracy": 0.6875, + "epoch": 0.055, + "step": 22 + }, + { + "epoch": 0.0575, + "grad_norm": 7.414430141448975, + "learning_rate": 2.9763157894736843e-06, + "loss": 0.5819, + "step": 23 + }, + { + "Batch Mean": -0.20422333478927612, + "accuracy": 0.6640625, + "epoch": 0.0575, + "step": 23 + }, + { + "epoch": 0.06, + "grad_norm": 5.717103004455566, + "learning_rate": 2.968421052631579e-06, + "loss": 0.638, + "step": 24 + }, + { + "Batch Mean": -0.7162615060806274, + "accuracy": 0.75, + "epoch": 0.06, + "step": 24 + }, + { + "epoch": 0.0625, + "grad_norm": 7.513117790222168, + "learning_rate": 2.960526315789474e-06, + "loss": 0.5851, + "step": 25 + }, + { + "Batch Mean": -1.054888129234314, + "accuracy": 0.6796875, + "epoch": 0.0625, + "step": 25 + }, + { + "epoch": 0.065, + "grad_norm": 8.300326347351074, + "learning_rate": 2.9526315789473685e-06, + "loss": 0.6286, + "step": 26 + }, + { + "Batch Mean": -1.1319353580474854, + "accuracy": 0.703125, + "epoch": 0.065, + "step": 26 + }, + { + "epoch": 0.0675, + "grad_norm": 9.116662979125977, + "learning_rate": 2.9447368421052633e-06, + "loss": 0.6003, + "step": 27 + }, + { + "Batch Mean": -0.7403234839439392, + "accuracy": 0.7265625, + "epoch": 0.0675, + "step": 27 + }, + { + "epoch": 0.07, + "grad_norm": 6.799551486968994, + "learning_rate": 2.936842105263158e-06, + "loss": 0.5383, + "step": 28 + }, + { + "Batch Mean": -0.25867950916290283, + "accuracy": 0.703125, + "epoch": 0.07, + "step": 28 + }, + { + "epoch": 0.0725, + "grad_norm": 5.493288040161133, + "learning_rate": 2.9289473684210528e-06, + "loss": 0.5683, + "step": 29 + }, + { + "Batch Mean": 0.4318051338195801, + "accuracy": 0.6796875, + "epoch": 0.0725, + "step": 29 + }, + { + "epoch": 0.075, + "grad_norm": 6.373619556427002, + "learning_rate": 2.9210526315789475e-06, + "loss": 0.6126, + "step": 30 + }, + { + "Batch Mean": 0.8685734272003174, + "accuracy": 0.7265625, + "epoch": 0.075, + "step": 30 + }, + { + "epoch": 0.0775, + "grad_norm": 7.750959396362305, + "learning_rate": 2.9131578947368423e-06, + "loss": 0.5984, + "step": 31 + }, + { + "Batch Mean": 1.1348557472229004, + "accuracy": 0.6484375, + "epoch": 0.0775, + "step": 31 + }, + { + "epoch": 0.08, + "grad_norm": 8.742609977722168, + "learning_rate": 2.905263157894737e-06, + "loss": 0.6016, + "step": 32 + }, + { + "Batch Mean": 0.6863436698913574, + "accuracy": 0.7421875, + "epoch": 0.08, + "step": 32 + }, + { + "epoch": 0.0825, + "grad_norm": 6.586100101470947, + "learning_rate": 2.8973684210526318e-06, + "loss": 0.5485, + "step": 33 + }, + { + "Batch Mean": 0.23023658990859985, + "accuracy": 0.765625, + "epoch": 0.0825, + "step": 33 + }, + { + "epoch": 0.085, + "grad_norm": 5.231872081756592, + "learning_rate": 2.8894736842105265e-06, + "loss": 0.4999, + "step": 34 + }, + { + "Batch Mean": -0.2048710286617279, + "accuracy": 0.734375, + "epoch": 0.085, + "step": 34 + }, + { + "epoch": 0.0875, + "grad_norm": 5.551333427429199, + "learning_rate": 2.8815789473684213e-06, + "loss": 0.5541, + "step": 35 + }, + { + "Batch Mean": -0.4559789299964905, + "accuracy": 0.7421875, + "epoch": 0.0875, + "step": 35 + }, + { + "epoch": 0.09, + "grad_norm": 5.972931385040283, + "learning_rate": 2.873684210526316e-06, + "loss": 0.5698, + "step": 36 + }, + { + "Batch Mean": -0.7627459764480591, + "accuracy": 0.7734375, + "epoch": 0.09, + "step": 36 + }, + { + "epoch": 0.0925, + "grad_norm": 7.165839672088623, + "learning_rate": 2.8657894736842103e-06, + "loss": 0.5157, + "step": 37 + }, + { + "Batch Mean": -0.6894429326057434, + "accuracy": 0.671875, + "epoch": 0.0925, + "step": 37 + }, + { + "epoch": 0.095, + "grad_norm": 6.630836009979248, + "learning_rate": 2.857894736842105e-06, + "loss": 0.5955, + "step": 38 + }, + { + "Batch Mean": -0.2740752696990967, + "accuracy": 0.734375, + "epoch": 0.095, + "step": 38 + }, + { + "epoch": 0.0975, + "grad_norm": 6.181252956390381, + "learning_rate": 2.85e-06, + "loss": 0.5037, + "step": 39 + }, + { + "Batch Mean": -0.09357815980911255, + "accuracy": 0.734375, + "epoch": 0.0975, + "step": 39 + }, + { + "epoch": 0.1, + "grad_norm": 6.351346969604492, + "learning_rate": 2.8421052631578946e-06, + "loss": 0.5152, + "step": 40 + }, + { + "Batch Mean": 0.33483320474624634, + "accuracy": 0.796875, + "epoch": 0.1, + "step": 40 + }, + { + "epoch": 0.1025, + "grad_norm": 6.3938117027282715, + "learning_rate": 2.8342105263157897e-06, + "loss": 0.4573, + "step": 41 + }, + { + "Batch Mean": 0.503718376159668, + "accuracy": 0.7578125, + "epoch": 0.1025, + "step": 41 + }, + { + "epoch": 0.105, + "grad_norm": 6.795258522033691, + "learning_rate": 2.8263157894736845e-06, + "loss": 0.4923, + "step": 42 + }, + { + "Batch Mean": 0.4224682152271271, + "accuracy": 0.6875, + "epoch": 0.105, + "step": 42 + }, + { + "epoch": 0.1075, + "grad_norm": 9.03122329711914, + "learning_rate": 2.8184210526315792e-06, + "loss": 0.5803, + "step": 43 + }, + { + "Batch Mean": 0.11721980571746826, + "accuracy": 0.8046875, + "epoch": 0.1075, + "step": 43 + }, + { + "epoch": 0.11, + "grad_norm": 6.323685646057129, + "learning_rate": 2.810526315789474e-06, + "loss": 0.4518, + "step": 44 + }, + { + "Batch Mean": 0.11213397979736328, + "accuracy": 0.7734375, + "epoch": 0.11, + "step": 44 + }, + { + "epoch": 0.1125, + "grad_norm": 6.636176586151123, + "learning_rate": 2.8026315789473687e-06, + "loss": 0.4747, + "step": 45 + }, + { + "Batch Mean": -0.19074296951293945, + "accuracy": 0.7421875, + "epoch": 0.1125, + "step": 45 + }, + { + "epoch": 0.115, + "grad_norm": 8.862186431884766, + "learning_rate": 2.7947368421052635e-06, + "loss": 0.5295, + "step": 46 + }, + { + "Batch Mean": -0.26628145575523376, + "accuracy": 0.7578125, + "epoch": 0.115, + "step": 46 + }, + { + "epoch": 0.1175, + "grad_norm": 9.089022636413574, + "learning_rate": 2.7868421052631578e-06, + "loss": 0.5132, + "step": 47 + }, + { + "Batch Mean": -0.11882030963897705, + "accuracy": 0.765625, + "epoch": 0.1175, + "step": 47 + }, + { + "epoch": 0.12, + "grad_norm": 8.50251293182373, + "learning_rate": 2.7789473684210525e-06, + "loss": 0.4904, + "step": 48 + }, + { + "Batch Mean": -0.09545481204986572, + "accuracy": 0.7890625, + "epoch": 0.12, + "step": 48 + }, + { + "epoch": 0.1225, + "grad_norm": 8.091400146484375, + "learning_rate": 2.7710526315789473e-06, + "loss": 0.4615, + "step": 49 + }, + { + "Batch Mean": -0.01603543758392334, + "accuracy": 0.6328125, + "epoch": 0.1225, + "step": 49 + }, + { + "epoch": 0.125, + "grad_norm": 11.12177562713623, + "learning_rate": 2.763157894736842e-06, + "loss": 0.6245, + "step": 50 + }, + { + "Batch Mean": -0.03519377112388611, + "accuracy": 0.6953125, + "epoch": 0.125, + "step": 50 + }, + { + "epoch": 0.1275, + "grad_norm": 8.476814270019531, + "learning_rate": 2.7552631578947368e-06, + "loss": 0.5094, + "step": 51 + }, + { + "Batch Mean": -0.07445716857910156, + "accuracy": 0.765625, + "epoch": 0.1275, + "step": 51 + }, + { + "epoch": 0.13, + "grad_norm": 8.475494384765625, + "learning_rate": 2.7473684210526315e-06, + "loss": 0.464, + "step": 52 + }, + { + "Batch Mean": -0.08127522468566895, + "accuracy": 0.7890625, + "epoch": 0.13, + "step": 52 + }, + { + "epoch": 0.1325, + "grad_norm": 7.754316806793213, + "learning_rate": 2.7394736842105263e-06, + "loss": 0.4618, + "step": 53 + }, + { + "Batch Mean": 0.05236625671386719, + "accuracy": 0.75, + "epoch": 0.1325, + "step": 53 + }, + { + "epoch": 0.135, + "grad_norm": 7.3552751541137695, + "learning_rate": 2.7315789473684214e-06, + "loss": 0.5202, + "step": 54 + }, + { + "Batch Mean": -0.08030200004577637, + "accuracy": 0.765625, + "epoch": 0.135, + "step": 54 + }, + { + "epoch": 0.1375, + "grad_norm": 8.413111686706543, + "learning_rate": 2.723684210526316e-06, + "loss": 0.4902, + "step": 55 + }, + { + "Batch Mean": 0.20010590553283691, + "accuracy": 0.765625, + "epoch": 0.1375, + "step": 55 + }, + { + "epoch": 0.14, + "grad_norm": 9.49488353729248, + "learning_rate": 2.715789473684211e-06, + "loss": 0.4945, + "step": 56 + }, + { + "Batch Mean": 0.1068873256444931, + "accuracy": 0.765625, + "epoch": 0.14, + "step": 56 + }, + { + "epoch": 0.1425, + "grad_norm": 9.978996276855469, + "learning_rate": 2.7078947368421052e-06, + "loss": 0.5936, + "step": 57 + }, + { + "Batch Mean": 0.12400929629802704, + "accuracy": 0.8046875, + "epoch": 0.1425, + "step": 57 + }, + { + "epoch": 0.145, + "grad_norm": 8.332430839538574, + "learning_rate": 2.7e-06, + "loss": 0.4549, + "step": 58 + }, + { + "Batch Mean": 0.05874582380056381, + "accuracy": 0.8125, + "epoch": 0.145, + "step": 58 + }, + { + "epoch": 0.1475, + "grad_norm": 7.287342548370361, + "learning_rate": 2.6921052631578947e-06, + "loss": 0.3996, + "step": 59 + }, + { + "Batch Mean": 0.17911481857299805, + "accuracy": 0.828125, + "epoch": 0.1475, + "step": 59 + }, + { + "epoch": 0.15, + "grad_norm": 7.197307109832764, + "learning_rate": 2.6842105263157895e-06, + "loss": 0.4486, + "step": 60 + }, + { + "Batch Mean": -0.0680088996887207, + "accuracy": 0.7265625, + "epoch": 0.15, + "step": 60 + }, + { + "epoch": 0.1525, + "grad_norm": 8.422717094421387, + "learning_rate": 2.6763157894736842e-06, + "loss": 0.5283, + "step": 61 + }, + { + "Batch Mean": -0.1332111358642578, + "accuracy": 0.7890625, + "epoch": 0.1525, + "step": 61 + }, + { + "epoch": 0.155, + "grad_norm": 8.836848258972168, + "learning_rate": 2.668421052631579e-06, + "loss": 0.4547, + "step": 62 + }, + { + "Batch Mean": -0.005272388458251953, + "accuracy": 0.734375, + "epoch": 0.155, + "step": 62 + }, + { + "epoch": 0.1575, + "grad_norm": 7.995077133178711, + "learning_rate": 2.6605263157894737e-06, + "loss": 0.5227, + "step": 63 + }, + { + "Batch Mean": 0.3416769504547119, + "accuracy": 0.8359375, + "epoch": 0.1575, + "step": 63 + }, + { + "epoch": 0.16, + "grad_norm": 8.131783485412598, + "learning_rate": 2.6526315789473685e-06, + "loss": 0.4191, + "step": 64 + }, + { + "Batch Mean": 0.2797560691833496, + "accuracy": 0.765625, + "epoch": 0.16, + "step": 64 + }, + { + "epoch": 0.1625, + "grad_norm": 9.04648208618164, + "learning_rate": 2.644736842105263e-06, + "loss": 0.4893, + "step": 65 + }, + { + "Batch Mean": -0.14584161341190338, + "accuracy": 0.7734375, + "epoch": 0.1625, + "step": 65 + }, + { + "epoch": 0.165, + "grad_norm": 8.039698600769043, + "learning_rate": 2.636842105263158e-06, + "loss": 0.4753, + "step": 66 + }, + { + "Batch Mean": -0.37075191736221313, + "accuracy": 0.7421875, + "epoch": 0.165, + "step": 66 + }, + { + "epoch": 0.1675, + "grad_norm": 7.643855094909668, + "learning_rate": 2.6289473684210527e-06, + "loss": 0.4424, + "step": 67 + }, + { + "Batch Mean": -0.24360448122024536, + "accuracy": 0.828125, + "epoch": 0.1675, + "step": 67 + }, + { + "epoch": 0.17, + "grad_norm": 6.892768383026123, + "learning_rate": 2.6210526315789474e-06, + "loss": 0.4126, + "step": 68 + }, + { + "Batch Mean": -0.0484846830368042, + "accuracy": 0.734375, + "epoch": 0.17, + "step": 68 + }, + { + "epoch": 0.1725, + "grad_norm": 10.497048377990723, + "learning_rate": 2.613157894736842e-06, + "loss": 0.5242, + "step": 69 + }, + { + "Batch Mean": 0.07249626517295837, + "accuracy": 0.8125, + "epoch": 0.1725, + "step": 69 + }, + { + "epoch": 0.175, + "grad_norm": 6.990925312042236, + "learning_rate": 2.605263157894737e-06, + "loss": 0.4206, + "step": 70 + }, + { + "Batch Mean": -0.06337249279022217, + "accuracy": 0.8125, + "epoch": 0.175, + "step": 70 + }, + { + "epoch": 0.1775, + "grad_norm": 7.180996417999268, + "learning_rate": 2.5973684210526317e-06, + "loss": 0.4412, + "step": 71 + }, + { + "Batch Mean": 0.06033170223236084, + "accuracy": 0.8515625, + "epoch": 0.1775, + "step": 71 + }, + { + "epoch": 0.18, + "grad_norm": 6.108893871307373, + "learning_rate": 2.5894736842105264e-06, + "loss": 0.3576, + "step": 72 + }, + { + "Batch Mean": -0.04290449619293213, + "accuracy": 0.765625, + "epoch": 0.18, + "step": 72 + }, + { + "epoch": 0.1825, + "grad_norm": 7.272516250610352, + "learning_rate": 2.581578947368421e-06, + "loss": 0.487, + "step": 73 + }, + { + "Batch Mean": 0.09309220314025879, + "accuracy": 0.796875, + "epoch": 0.1825, + "step": 73 + }, + { + "epoch": 0.185, + "grad_norm": 6.689022064208984, + "learning_rate": 2.573684210526316e-06, + "loss": 0.4576, + "step": 74 + }, + { + "Batch Mean": -0.3521728515625, + "accuracy": 0.8046875, + "epoch": 0.185, + "step": 74 + }, + { + "epoch": 0.1875, + "grad_norm": 7.446789264678955, + "learning_rate": 2.5657894736842107e-06, + "loss": 0.4537, + "step": 75 + }, + { + "Batch Mean": -0.1267460584640503, + "accuracy": 0.7265625, + "epoch": 0.1875, + "step": 75 + }, + { + "epoch": 0.19, + "grad_norm": 8.600646018981934, + "learning_rate": 2.5578947368421054e-06, + "loss": 0.5066, + "step": 76 + }, + { + "Batch Mean": 0.02230433002114296, + "accuracy": 0.7578125, + "epoch": 0.19, + "step": 76 + }, + { + "epoch": 0.1925, + "grad_norm": 8.842095375061035, + "learning_rate": 2.55e-06, + "loss": 0.4823, + "step": 77 + }, + { + "Batch Mean": 0.17012596130371094, + "accuracy": 0.7890625, + "epoch": 0.1925, + "step": 77 + }, + { + "epoch": 0.195, + "grad_norm": 6.993001937866211, + "learning_rate": 2.542105263157895e-06, + "loss": 0.3959, + "step": 78 + }, + { + "Batch Mean": -0.11112558841705322, + "accuracy": 0.78125, + "epoch": 0.195, + "step": 78 + }, + { + "epoch": 0.1975, + "grad_norm": 8.71827220916748, + "learning_rate": 2.5342105263157892e-06, + "loss": 0.4407, + "step": 79 + }, + { + "Batch Mean": 0.11298668384552002, + "accuracy": 0.8515625, + "epoch": 0.1975, + "step": 79 + }, + { + "epoch": 0.2, + "grad_norm": 7.829047679901123, + "learning_rate": 2.526315789473684e-06, + "loss": 0.3905, + "step": 80 + }, + { + "Batch Mean": 0.010145187377929688, + "accuracy": 0.78125, + "epoch": 0.2, + "step": 80 + }, + { + "epoch": 0.2025, + "grad_norm": 10.1448335647583, + "learning_rate": 2.5184210526315787e-06, + "loss": 0.4667, + "step": 81 + }, + { + "Batch Mean": 0.1327219009399414, + "accuracy": 0.7890625, + "epoch": 0.2025, + "step": 81 + }, + { + "epoch": 0.205, + "grad_norm": 7.784322738647461, + "learning_rate": 2.510526315789474e-06, + "loss": 0.408, + "step": 82 + }, + { + "Batch Mean": 0.3474133014678955, + "accuracy": 0.7421875, + "epoch": 0.205, + "step": 82 + }, + { + "epoch": 0.2075, + "grad_norm": 8.116996765136719, + "learning_rate": 2.5026315789473686e-06, + "loss": 0.438, + "step": 83 + }, + { + "Batch Mean": 0.2660253047943115, + "accuracy": 0.75, + "epoch": 0.2075, + "step": 83 + }, + { + "epoch": 0.21, + "grad_norm": 7.8381452560424805, + "learning_rate": 2.4947368421052634e-06, + "loss": 0.4639, + "step": 84 + }, + { + "Batch Mean": 0.004191964864730835, + "accuracy": 0.8125, + "epoch": 0.21, + "step": 84 + }, + { + "epoch": 0.2125, + "grad_norm": 6.7916412353515625, + "learning_rate": 2.486842105263158e-06, + "loss": 0.3937, + "step": 85 + }, + { + "Batch Mean": -0.4819910526275635, + "accuracy": 0.75, + "epoch": 0.2125, + "step": 85 + }, + { + "epoch": 0.215, + "grad_norm": 8.073772430419922, + "learning_rate": 2.478947368421053e-06, + "loss": 0.4866, + "step": 86 + }, + { + "Batch Mean": -0.5541934967041016, + "accuracy": 0.75, + "epoch": 0.215, + "step": 86 + }, + { + "epoch": 0.2175, + "grad_norm": 8.677484512329102, + "learning_rate": 2.4710526315789476e-06, + "loss": 0.503, + "step": 87 + }, + { + "Batch Mean": 0.14371705055236816, + "accuracy": 0.7890625, + "epoch": 0.2175, + "step": 87 + }, + { + "epoch": 0.22, + "grad_norm": 7.931331634521484, + "learning_rate": 2.4631578947368424e-06, + "loss": 0.4483, + "step": 88 + }, + { + "Batch Mean": 0.013556957244873047, + "accuracy": 0.859375, + "epoch": 0.22, + "step": 88 + }, + { + "epoch": 0.2225, + "grad_norm": 6.920699119567871, + "learning_rate": 2.4552631578947367e-06, + "loss": 0.3674, + "step": 89 + }, + { + "Batch Mean": -0.11897921562194824, + "accuracy": 0.8203125, + "epoch": 0.2225, + "step": 89 + }, + { + "epoch": 0.225, + "grad_norm": 7.135400295257568, + "learning_rate": 2.4473684210526314e-06, + "loss": 0.3953, + "step": 90 + }, + { + "Batch Mean": 0.33566761016845703, + "accuracy": 0.7890625, + "epoch": 0.225, + "step": 90 + }, + { + "epoch": 0.2275, + "grad_norm": 10.429315567016602, + "learning_rate": 2.439473684210526e-06, + "loss": 0.4943, + "step": 91 + }, + { + "Batch Mean": 0.4449765682220459, + "accuracy": 0.84375, + "epoch": 0.2275, + "step": 91 + }, + { + "epoch": 0.23, + "grad_norm": 8.419116020202637, + "learning_rate": 2.431578947368421e-06, + "loss": 0.3647, + "step": 92 + }, + { + "Batch Mean": -0.06833112239837646, + "accuracy": 0.8359375, + "epoch": 0.23, + "step": 92 + }, + { + "epoch": 0.2325, + "grad_norm": 7.288305282592773, + "learning_rate": 2.4236842105263157e-06, + "loss": 0.3719, + "step": 93 + }, + { + "Batch Mean": -0.36795759201049805, + "accuracy": 0.8046875, + "epoch": 0.2325, + "step": 93 + }, + { + "epoch": 0.235, + "grad_norm": 8.255443572998047, + "learning_rate": 2.4157894736842104e-06, + "loss": 0.3983, + "step": 94 + }, + { + "Batch Mean": -0.3381902277469635, + "accuracy": 0.8515625, + "epoch": 0.235, + "step": 94 + }, + { + "epoch": 0.2375, + "grad_norm": 8.25379467010498, + "learning_rate": 2.4078947368421056e-06, + "loss": 0.3602, + "step": 95 + }, + { + "Batch Mean": -0.2966504693031311, + "accuracy": 0.75, + "epoch": 0.2375, + "step": 95 + }, + { + "epoch": 0.24, + "grad_norm": 9.554913520812988, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.4499, + "step": 96 + }, + { + "Batch Mean": -0.02951526641845703, + "accuracy": 0.796875, + "epoch": 0.24, + "step": 96 + }, + { + "epoch": 0.2425, + "grad_norm": 8.300037384033203, + "learning_rate": 2.392105263157895e-06, + "loss": 0.41, + "step": 97 + }, + { + "Batch Mean": 0.4994839131832123, + "accuracy": 0.84375, + "epoch": 0.2425, + "step": 97 + }, + { + "epoch": 0.245, + "grad_norm": 9.282505989074707, + "learning_rate": 2.38421052631579e-06, + "loss": 0.3631, + "step": 98 + }, + { + "Batch Mean": 0.5059927701950073, + "accuracy": 0.84375, + "epoch": 0.245, + "step": 98 + }, + { + "epoch": 0.2475, + "grad_norm": 10.97496509552002, + "learning_rate": 2.376315789473684e-06, + "loss": 0.4266, + "step": 99 + }, + { + "Batch Mean": 0.5012010335922241, + "accuracy": 0.7890625, + "epoch": 0.2475, + "step": 99 + }, + { + "epoch": 0.25, + "grad_norm": 10.482470512390137, + "learning_rate": 2.368421052631579e-06, + "loss": 0.4402, + "step": 100 + }, + { + "Batch Mean": -0.34704887866973877, + "accuracy": 0.7890625, + "epoch": 0.25, + "step": 100 + }, + { + "epoch": 0.2525, + "grad_norm": 10.026016235351562, + "learning_rate": 2.3605263157894736e-06, + "loss": 0.4576, + "step": 101 + }, + { + "Batch Mean": -0.5854883193969727, + "accuracy": 0.7421875, + "epoch": 0.2525, + "step": 101 + }, + { + "epoch": 0.255, + "grad_norm": 11.113299369812012, + "learning_rate": 2.3526315789473684e-06, + "loss": 0.5302, + "step": 102 + }, + { + "Batch Mean": -0.6017556190490723, + "accuracy": 0.8125, + "epoch": 0.255, + "step": 102 + }, + { + "epoch": 0.2575, + "grad_norm": 9.242813110351562, + "learning_rate": 2.344736842105263e-06, + "loss": 0.3451, + "step": 103 + }, + { + "Batch Mean": -0.14799189567565918, + "accuracy": 0.78125, + "epoch": 0.2575, + "step": 103 + }, + { + "epoch": 0.26, + "grad_norm": 9.658228874206543, + "learning_rate": 2.336842105263158e-06, + "loss": 0.442, + "step": 104 + }, + { + "Batch Mean": 0.2693725824356079, + "accuracy": 0.8203125, + "epoch": 0.26, + "step": 104 + }, + { + "epoch": 0.2625, + "grad_norm": 7.290228366851807, + "learning_rate": 2.3289473684210526e-06, + "loss": 0.3808, + "step": 105 + }, + { + "Batch Mean": 0.3339729309082031, + "accuracy": 0.8828125, + "epoch": 0.2625, + "step": 105 + }, + { + "epoch": 0.265, + "grad_norm": 6.965331554412842, + "learning_rate": 2.3210526315789473e-06, + "loss": 0.326, + "step": 106 + }, + { + "Batch Mean": 0.03934955596923828, + "accuracy": 0.765625, + "epoch": 0.265, + "step": 106 + }, + { + "epoch": 0.2675, + "grad_norm": 8.904206275939941, + "learning_rate": 2.313157894736842e-06, + "loss": 0.4804, + "step": 107 + }, + { + "Batch Mean": 0.48924458026885986, + "accuracy": 0.7734375, + "epoch": 0.2675, + "step": 107 + }, + { + "epoch": 0.27, + "grad_norm": 11.731331825256348, + "learning_rate": 2.305263157894737e-06, + "loss": 0.4496, + "step": 108 + }, + { + "Batch Mean": 0.3033190965652466, + "accuracy": 0.8671875, + "epoch": 0.27, + "step": 108 + }, + { + "epoch": 0.2725, + "grad_norm": 7.58432149887085, + "learning_rate": 2.2973684210526316e-06, + "loss": 0.3272, + "step": 109 + }, + { + "Batch Mean": 0.3258399963378906, + "accuracy": 0.9296875, + "epoch": 0.2725, + "step": 109 + }, + { + "epoch": 0.275, + "grad_norm": 8.01420783996582, + "learning_rate": 2.2894736842105263e-06, + "loss": 0.2889, + "step": 110 + }, + { + "Batch Mean": -0.21785682439804077, + "accuracy": 0.7734375, + "epoch": 0.275, + "step": 110 + }, + { + "epoch": 0.2775, + "grad_norm": 10.714860916137695, + "learning_rate": 2.281578947368421e-06, + "loss": 0.4938, + "step": 111 + }, + { + "Batch Mean": -0.23068951070308685, + "accuracy": 0.7734375, + "epoch": 0.2775, + "step": 111 + }, + { + "epoch": 0.28, + "grad_norm": 9.543609619140625, + "learning_rate": 2.273684210526316e-06, + "loss": 0.4813, + "step": 112 + }, + { + "Batch Mean": -0.24743294715881348, + "accuracy": 0.8515625, + "epoch": 0.28, + "step": 112 + }, + { + "epoch": 0.2825, + "grad_norm": 7.655090808868408, + "learning_rate": 2.2657894736842106e-06, + "loss": 0.3365, + "step": 113 + }, + { + "Batch Mean": -0.4790257215499878, + "accuracy": 0.828125, + "epoch": 0.2825, + "step": 113 + }, + { + "epoch": 0.285, + "grad_norm": 9.270421028137207, + "learning_rate": 2.2578947368421053e-06, + "loss": 0.3654, + "step": 114 + }, + { + "Batch Mean": 0.39268895983695984, + "accuracy": 0.7890625, + "epoch": 0.285, + "step": 114 + }, + { + "epoch": 0.2875, + "grad_norm": 10.442706108093262, + "learning_rate": 2.25e-06, + "loss": 0.4592, + "step": 115 + }, + { + "Batch Mean": 0.6092771291732788, + "accuracy": 0.8515625, + "epoch": 0.2875, + "step": 115 + }, + { + "epoch": 0.29, + "grad_norm": 9.203109741210938, + "learning_rate": 2.242105263157895e-06, + "loss": 0.3579, + "step": 116 + }, + { + "Batch Mean": -0.1381063461303711, + "accuracy": 0.7734375, + "epoch": 0.29, + "step": 116 + }, + { + "epoch": 0.2925, + "grad_norm": 11.316916465759277, + "learning_rate": 2.2342105263157895e-06, + "loss": 0.459, + "step": 117 + }, + { + "Batch Mean": -0.18764352798461914, + "accuracy": 0.8125, + "epoch": 0.2925, + "step": 117 + }, + { + "epoch": 0.295, + "grad_norm": 9.715188026428223, + "learning_rate": 2.2263157894736843e-06, + "loss": 0.448, + "step": 118 + }, + { + "Batch Mean": -0.4390474557876587, + "accuracy": 0.8359375, + "epoch": 0.295, + "step": 118 + }, + { + "epoch": 0.2975, + "grad_norm": 8.53513240814209, + "learning_rate": 2.218421052631579e-06, + "loss": 0.353, + "step": 119 + }, + { + "Batch Mean": -0.49342280626296997, + "accuracy": 0.8203125, + "epoch": 0.2975, + "step": 119 + }, + { + "epoch": 0.3, + "grad_norm": 8.739898681640625, + "learning_rate": 2.2105263157894738e-06, + "loss": 0.3871, + "step": 120 + }, + { + "Batch Mean": -0.16289004683494568, + "accuracy": 0.84375, + "epoch": 0.3, + "step": 120 + }, + { + "epoch": 0.3025, + "grad_norm": 8.066034317016602, + "learning_rate": 2.2026315789473685e-06, + "loss": 0.3778, + "step": 121 + }, + { + "Batch Mean": 0.11179280281066895, + "accuracy": 0.8359375, + "epoch": 0.3025, + "step": 121 + }, + { + "epoch": 0.305, + "grad_norm": 8.85276985168457, + "learning_rate": 2.1947368421052633e-06, + "loss": 0.3953, + "step": 122 + }, + { + "Batch Mean": 0.6813234686851501, + "accuracy": 0.828125, + "epoch": 0.305, + "step": 122 + }, + { + "epoch": 0.3075, + "grad_norm": 10.226161003112793, + "learning_rate": 2.186842105263158e-06, + "loss": 0.3787, + "step": 123 + }, + { + "Batch Mean": 0.08529806137084961, + "accuracy": 0.8515625, + "epoch": 0.3075, + "step": 123 + }, + { + "epoch": 0.31, + "grad_norm": 9.13556957244873, + "learning_rate": 2.1789473684210528e-06, + "loss": 0.3313, + "step": 124 + }, + { + "Batch Mean": 0.24107074737548828, + "accuracy": 0.7890625, + "epoch": 0.31, + "step": 124 + }, + { + "epoch": 0.3125, + "grad_norm": 12.641064643859863, + "learning_rate": 2.1710526315789475e-06, + "loss": 0.5176, + "step": 125 + }, + { + "Batch Mean": -0.1636943817138672, + "accuracy": 0.7890625, + "epoch": 0.3125, + "step": 125 + }, + { + "epoch": 0.315, + "grad_norm": 9.701798439025879, + "learning_rate": 2.1631578947368423e-06, + "loss": 0.42, + "step": 126 + }, + { + "Batch Mean": -0.2857036590576172, + "accuracy": 0.8515625, + "epoch": 0.315, + "step": 126 + }, + { + "epoch": 0.3175, + "grad_norm": 9.48480224609375, + "learning_rate": 2.155263157894737e-06, + "loss": 0.3539, + "step": 127 + }, + { + "Batch Mean": -0.7563860416412354, + "accuracy": 0.765625, + "epoch": 0.3175, + "step": 127 + }, + { + "epoch": 0.32, + "grad_norm": 12.008138656616211, + "learning_rate": 2.1473684210526317e-06, + "loss": 0.4875, + "step": 128 + }, + { + "Batch Mean": 0.1351943016052246, + "accuracy": 0.84375, + "epoch": 0.32, + "step": 128 + }, + { + "epoch": 0.3225, + "grad_norm": 8.202943801879883, + "learning_rate": 2.1394736842105265e-06, + "loss": 0.3432, + "step": 129 + }, + { + "Batch Mean": -0.002408236265182495, + "accuracy": 0.8125, + "epoch": 0.3225, + "step": 129 + }, + { + "epoch": 0.325, + "grad_norm": 8.81209659576416, + "learning_rate": 2.1315789473684212e-06, + "loss": 0.418, + "step": 130 + }, + { + "Batch Mean": 0.00351560115814209, + "accuracy": 0.7734375, + "epoch": 0.325, + "step": 130 + }, + { + "epoch": 0.3275, + "grad_norm": 9.197092056274414, + "learning_rate": 2.123684210526316e-06, + "loss": 0.4158, + "step": 131 + }, + { + "Batch Mean": 0.26228976249694824, + "accuracy": 0.8515625, + "epoch": 0.3275, + "step": 131 + }, + { + "epoch": 0.33, + "grad_norm": 8.00036334991455, + "learning_rate": 2.1157894736842103e-06, + "loss": 0.3283, + "step": 132 + }, + { + "Batch Mean": 0.10788178443908691, + "accuracy": 0.8125, + "epoch": 0.33, + "step": 132 + }, + { + "epoch": 0.3325, + "grad_norm": 7.9092116355896, + "learning_rate": 2.107894736842105e-06, + "loss": 0.3622, + "step": 133 + }, + { + "Batch Mean": 0.09739017486572266, + "accuracy": 0.8515625, + "epoch": 0.3325, + "step": 133 + }, + { + "epoch": 0.335, + "grad_norm": 7.7380218505859375, + "learning_rate": 2.1e-06, + "loss": 0.3399, + "step": 134 + }, + { + "Batch Mean": -0.03578406572341919, + "accuracy": 0.8203125, + "epoch": 0.335, + "step": 134 + }, + { + "epoch": 0.3375, + "grad_norm": 7.9607930183410645, + "learning_rate": 2.0921052631578945e-06, + "loss": 0.3681, + "step": 135 + }, + { + "Batch Mean": 0.2095022201538086, + "accuracy": 0.7734375, + "epoch": 0.3375, + "step": 135 + }, + { + "epoch": 0.34, + "grad_norm": 9.16215705871582, + "learning_rate": 2.0842105263157897e-06, + "loss": 0.4474, + "step": 136 + }, + { + "Batch Mean": -0.12871003150939941, + "accuracy": 0.78125, + "epoch": 0.34, + "step": 136 + }, + { + "epoch": 0.3425, + "grad_norm": 9.543075561523438, + "learning_rate": 2.0763157894736845e-06, + "loss": 0.4641, + "step": 137 + }, + { + "Batch Mean": -0.045391350984573364, + "accuracy": 0.828125, + "epoch": 0.3425, + "step": 137 + }, + { + "epoch": 0.345, + "grad_norm": 8.620835304260254, + "learning_rate": 2.068421052631579e-06, + "loss": 0.388, + "step": 138 + }, + { + "Batch Mean": 0.023508906364440918, + "accuracy": 0.7578125, + "epoch": 0.345, + "step": 138 + }, + { + "epoch": 0.3475, + "grad_norm": 10.10091781616211, + "learning_rate": 2.060526315789474e-06, + "loss": 0.4391, + "step": 139 + }, + { + "Batch Mean": -0.0003312826156616211, + "accuracy": 0.8359375, + "epoch": 0.3475, + "step": 139 + }, + { + "epoch": 0.35, + "grad_norm": 8.200050354003906, + "learning_rate": 2.0526315789473687e-06, + "loss": 0.4196, + "step": 140 + }, + { + "Batch Mean": -0.304778516292572, + "accuracy": 0.8359375, + "epoch": 0.35, + "step": 140 + }, + { + "epoch": 0.3525, + "grad_norm": 7.937561511993408, + "learning_rate": 2.0447368421052634e-06, + "loss": 0.3394, + "step": 141 + }, + { + "Batch Mean": -0.22019100189208984, + "accuracy": 0.84375, + "epoch": 0.3525, + "step": 141 + }, + { + "epoch": 0.355, + "grad_norm": 8.111381530761719, + "learning_rate": 2.0368421052631578e-06, + "loss": 0.3878, + "step": 142 + }, + { + "Batch Mean": 0.4440329670906067, + "accuracy": 0.8125, + "epoch": 0.355, + "step": 142 + }, + { + "epoch": 0.3575, + "grad_norm": 10.333617210388184, + "learning_rate": 2.0289473684210525e-06, + "loss": 0.4124, + "step": 143 + }, + { + "Batch Mean": 0.45398879051208496, + "accuracy": 0.90625, + "epoch": 0.3575, + "step": 143 + }, + { + "epoch": 0.36, + "grad_norm": 7.195791721343994, + "learning_rate": 2.0210526315789473e-06, + "loss": 0.2869, + "step": 144 + }, + { + "Batch Mean": 0.13556790351867676, + "accuracy": 0.8359375, + "epoch": 0.36, + "step": 144 + }, + { + "epoch": 0.3625, + "grad_norm": 8.249251365661621, + "learning_rate": 2.013157894736842e-06, + "loss": 0.3767, + "step": 145 + }, + { + "Batch Mean": 0.03792291879653931, + "accuracy": 0.828125, + "epoch": 0.3625, + "step": 145 + }, + { + "epoch": 0.365, + "grad_norm": 10.333145141601562, + "learning_rate": 2.0052631578947367e-06, + "loss": 0.3784, + "step": 146 + }, + { + "Batch Mean": 0.05816793441772461, + "accuracy": 0.828125, + "epoch": 0.365, + "step": 146 + }, + { + "epoch": 0.3675, + "grad_norm": 9.091706275939941, + "learning_rate": 1.9973684210526315e-06, + "loss": 0.4217, + "step": 147 + }, + { + "Batch Mean": 0.016840219497680664, + "accuracy": 0.765625, + "epoch": 0.3675, + "step": 147 + }, + { + "epoch": 0.37, + "grad_norm": 12.988033294677734, + "learning_rate": 1.9894736842105262e-06, + "loss": 0.537, + "step": 148 + }, + { + "Batch Mean": -0.6307969093322754, + "accuracy": 0.8671875, + "epoch": 0.37, + "step": 148 + }, + { + "epoch": 0.3725, + "grad_norm": 8.663951873779297, + "learning_rate": 1.9815789473684214e-06, + "loss": 0.2973, + "step": 149 + }, + { + "Batch Mean": -0.41460466384887695, + "accuracy": 0.8359375, + "epoch": 0.3725, + "step": 149 + }, + { + "epoch": 0.375, + "grad_norm": 8.891082763671875, + "learning_rate": 1.973684210526316e-06, + "loss": 0.3553, + "step": 150 + }, + { + "Batch Mean": 0.045943260192871094, + "accuracy": 0.859375, + "epoch": 0.375, + "step": 150 + }, + { + "epoch": 0.3775, + "grad_norm": 8.152697563171387, + "learning_rate": 1.965789473684211e-06, + "loss": 0.3515, + "step": 151 + }, + { + "Batch Mean": -0.18106698989868164, + "accuracy": 0.796875, + "epoch": 0.3775, + "step": 151 + }, + { + "epoch": 0.38, + "grad_norm": 10.632974624633789, + "learning_rate": 1.9578947368421052e-06, + "loss": 0.4978, + "step": 152 + }, + { + "Batch Mean": 0.1395092010498047, + "accuracy": 0.859375, + "epoch": 0.38, + "step": 152 + }, + { + "epoch": 0.3825, + "grad_norm": 8.301017761230469, + "learning_rate": 1.95e-06, + "loss": 0.3268, + "step": 153 + }, + { + "Batch Mean": 0.011521339416503906, + "accuracy": 0.8046875, + "epoch": 0.3825, + "step": 153 + }, + { + "epoch": 0.385, + "grad_norm": 8.253169059753418, + "learning_rate": 1.9421052631578947e-06, + "loss": 0.3407, + "step": 154 + }, + { + "Batch Mean": -0.11399388313293457, + "accuracy": 0.8515625, + "epoch": 0.385, + "step": 154 + }, + { + "epoch": 0.3875, + "grad_norm": 9.271583557128906, + "learning_rate": 1.9342105263157895e-06, + "loss": 0.3919, + "step": 155 + }, + { + "Batch Mean": 0.3950004577636719, + "accuracy": 0.7890625, + "epoch": 0.3875, + "step": 155 + }, + { + "epoch": 0.39, + "grad_norm": 10.932522773742676, + "learning_rate": 1.926315789473684e-06, + "loss": 0.4111, + "step": 156 + }, + { + "Batch Mean": 0.24418425559997559, + "accuracy": 0.8125, + "epoch": 0.39, + "step": 156 + }, + { + "epoch": 0.3925, + "grad_norm": 9.256501197814941, + "learning_rate": 1.918421052631579e-06, + "loss": 0.4292, + "step": 157 + }, + { + "Batch Mean": -0.1265101432800293, + "accuracy": 0.8125, + "epoch": 0.3925, + "step": 157 + }, + { + "epoch": 0.395, + "grad_norm": 8.907751083374023, + "learning_rate": 1.9105263157894737e-06, + "loss": 0.3853, + "step": 158 + }, + { + "Batch Mean": -0.35911059379577637, + "accuracy": 0.828125, + "epoch": 0.395, + "step": 158 + }, + { + "epoch": 0.3975, + "grad_norm": 9.353957176208496, + "learning_rate": 1.9026315789473684e-06, + "loss": 0.4177, + "step": 159 + }, + { + "Batch Mean": -0.49236249923706055, + "accuracy": 0.828125, + "epoch": 0.3975, + "step": 159 + }, + { + "epoch": 0.4, + "grad_norm": 8.749178886413574, + "learning_rate": 1.8947368421052632e-06, + "loss": 0.3711, + "step": 160 + }, + { + "Batch Mean": 0.1534252166748047, + "accuracy": 0.8671875, + "epoch": 0.4, + "step": 160 + }, + { + "epoch": 0.4025, + "grad_norm": 8.529258728027344, + "learning_rate": 1.8868421052631577e-06, + "loss": 0.3523, + "step": 161 + }, + { + "Batch Mean": 0.11423969268798828, + "accuracy": 0.8125, + "epoch": 0.4025, + "step": 161 + }, + { + "epoch": 0.405, + "grad_norm": 8.786529541015625, + "learning_rate": 1.8789473684210525e-06, + "loss": 0.3942, + "step": 162 + }, + { + "Batch Mean": 0.15482479333877563, + "accuracy": 0.828125, + "epoch": 0.405, + "step": 162 + }, + { + "epoch": 0.4075, + "grad_norm": 8.650775909423828, + "learning_rate": 1.8710526315789476e-06, + "loss": 0.3971, + "step": 163 + }, + { + "Batch Mean": 0.19225120544433594, + "accuracy": 0.859375, + "epoch": 0.4075, + "step": 163 + }, + { + "epoch": 0.41, + "grad_norm": 7.860261917114258, + "learning_rate": 1.8631578947368424e-06, + "loss": 0.3098, + "step": 164 + }, + { + "Batch Mean": 0.570763111114502, + "accuracy": 0.828125, + "epoch": 0.41, + "step": 164 + }, + { + "epoch": 0.4125, + "grad_norm": 9.458237648010254, + "learning_rate": 1.855263157894737e-06, + "loss": 0.4007, + "step": 165 + }, + { + "Batch Mean": 0.1940155029296875, + "accuracy": 0.796875, + "epoch": 0.4125, + "step": 165 + }, + { + "epoch": 0.415, + "grad_norm": 8.419119834899902, + "learning_rate": 1.8473684210526317e-06, + "loss": 0.3679, + "step": 166 + }, + { + "Batch Mean": 0.13667702674865723, + "accuracy": 0.8125, + "epoch": 0.415, + "step": 166 + }, + { + "epoch": 0.4175, + "grad_norm": 11.180956840515137, + "learning_rate": 1.8394736842105264e-06, + "loss": 0.4043, + "step": 167 + }, + { + "Batch Mean": -0.47870922088623047, + "accuracy": 0.796875, + "epoch": 0.4175, + "step": 167 + }, + { + "epoch": 0.42, + "grad_norm": 11.080053329467773, + "learning_rate": 1.8315789473684211e-06, + "loss": 0.4806, + "step": 168 + }, + { + "Batch Mean": -0.07008326053619385, + "accuracy": 0.8125, + "epoch": 0.42, + "step": 168 + }, + { + "epoch": 0.4225, + "grad_norm": 9.977706909179688, + "learning_rate": 1.8236842105263159e-06, + "loss": 0.3711, + "step": 169 + }, + { + "Batch Mean": -0.21228408813476562, + "accuracy": 0.78125, + "epoch": 0.4225, + "step": 169 + }, + { + "epoch": 0.425, + "grad_norm": 9.771838188171387, + "learning_rate": 1.8157894736842106e-06, + "loss": 0.4154, + "step": 170 + }, + { + "Batch Mean": -0.014397621154785156, + "accuracy": 0.8671875, + "epoch": 0.425, + "step": 170 + }, + { + "epoch": 0.4275, + "grad_norm": 7.314267158508301, + "learning_rate": 1.8078947368421052e-06, + "loss": 0.2636, + "step": 171 + }, + { + "Batch Mean": 0.3306160569190979, + "accuracy": 0.828125, + "epoch": 0.4275, + "step": 171 + }, + { + "epoch": 0.43, + "grad_norm": 8.739150047302246, + "learning_rate": 1.8e-06, + "loss": 0.3611, + "step": 172 + }, + { + "Batch Mean": -0.0874967873096466, + "accuracy": 0.7890625, + "epoch": 0.43, + "step": 172 + }, + { + "epoch": 0.4325, + "grad_norm": 9.247507095336914, + "learning_rate": 1.7921052631578947e-06, + "loss": 0.4175, + "step": 173 + }, + { + "Batch Mean": 0.2975928783416748, + "accuracy": 0.84375, + "epoch": 0.4325, + "step": 173 + }, + { + "epoch": 0.435, + "grad_norm": 9.041180610656738, + "learning_rate": 1.7842105263157894e-06, + "loss": 0.3671, + "step": 174 + }, + { + "Batch Mean": 0.015955865383148193, + "accuracy": 0.7421875, + "epoch": 0.435, + "step": 174 + }, + { + "epoch": 0.4375, + "grad_norm": 11.472379684448242, + "learning_rate": 1.7763157894736842e-06, + "loss": 0.4833, + "step": 175 + }, + { + "Batch Mean": -0.36949777603149414, + "accuracy": 0.7421875, + "epoch": 0.4375, + "step": 175 + }, + { + "epoch": 0.44, + "grad_norm": 11.286201477050781, + "learning_rate": 1.768421052631579e-06, + "loss": 0.4855, + "step": 176 + }, + { + "Batch Mean": -0.3930339813232422, + "accuracy": 0.828125, + "epoch": 0.44, + "step": 176 + }, + { + "epoch": 0.4425, + "grad_norm": 10.522198677062988, + "learning_rate": 1.7605263157894739e-06, + "loss": 0.3819, + "step": 177 + }, + { + "Batch Mean": 0.08716198801994324, + "accuracy": 0.828125, + "epoch": 0.4425, + "step": 177 + }, + { + "epoch": 0.445, + "grad_norm": 9.067276000976562, + "learning_rate": 1.7526315789473686e-06, + "loss": 0.3921, + "step": 178 + }, + { + "Batch Mean": -0.25193285942077637, + "accuracy": 0.8203125, + "epoch": 0.445, + "step": 178 + }, + { + "epoch": 0.4475, + "grad_norm": 9.11840534210205, + "learning_rate": 1.7447368421052633e-06, + "loss": 0.4172, + "step": 179 + }, + { + "Batch Mean": -0.23209238052368164, + "accuracy": 0.8203125, + "epoch": 0.4475, + "step": 179 + }, + { + "epoch": 0.45, + "grad_norm": 11.945143699645996, + "learning_rate": 1.736842105263158e-06, + "loss": 0.3795, + "step": 180 + }, + { + "Batch Mean": -0.21220535039901733, + "accuracy": 0.8984375, + "epoch": 0.45, + "step": 180 + }, + { + "epoch": 0.4525, + "grad_norm": 8.286118507385254, + "learning_rate": 1.7289473684210526e-06, + "loss": 0.2964, + "step": 181 + }, + { + "Batch Mean": 0.15385687351226807, + "accuracy": 0.8203125, + "epoch": 0.4525, + "step": 181 + }, + { + "epoch": 0.455, + "grad_norm": 8.66666316986084, + "learning_rate": 1.7210526315789474e-06, + "loss": 0.3717, + "step": 182 + }, + { + "Batch Mean": 0.47098982334136963, + "accuracy": 0.8125, + "epoch": 0.455, + "step": 182 + }, + { + "epoch": 0.4575, + "grad_norm": 9.041359901428223, + "learning_rate": 1.7131578947368421e-06, + "loss": 0.3627, + "step": 183 + }, + { + "Batch Mean": 0.2258845567703247, + "accuracy": 0.8359375, + "epoch": 0.4575, + "step": 183 + }, + { + "epoch": 0.46, + "grad_norm": 10.031190872192383, + "learning_rate": 1.7052631578947369e-06, + "loss": 0.4308, + "step": 184 + }, + { + "Batch Mean": 0.1376625895500183, + "accuracy": 0.7890625, + "epoch": 0.46, + "step": 184 + }, + { + "epoch": 0.4625, + "grad_norm": 8.849730491638184, + "learning_rate": 1.6973684210526316e-06, + "loss": 0.3612, + "step": 185 + }, + { + "Batch Mean": -0.25082847476005554, + "accuracy": 0.8359375, + "epoch": 0.4625, + "step": 185 + }, + { + "epoch": 0.465, + "grad_norm": 9.715136528015137, + "learning_rate": 1.6894736842105264e-06, + "loss": 0.4192, + "step": 186 + }, + { + "Batch Mean": -0.43356090784072876, + "accuracy": 0.859375, + "epoch": 0.465, + "step": 186 + }, + { + "epoch": 0.4675, + "grad_norm": 8.193946838378906, + "learning_rate": 1.6815789473684209e-06, + "loss": 0.3504, + "step": 187 + }, + { + "Batch Mean": -0.013092756271362305, + "accuracy": 0.875, + "epoch": 0.4675, + "step": 187 + }, + { + "epoch": 0.47, + "grad_norm": 8.71420955657959, + "learning_rate": 1.6736842105263156e-06, + "loss": 0.3437, + "step": 188 + }, + { + "Batch Mean": 0.13508832454681396, + "accuracy": 0.78125, + "epoch": 0.47, + "step": 188 + }, + { + "epoch": 0.4725, + "grad_norm": 10.5387544631958, + "learning_rate": 1.6657894736842104e-06, + "loss": 0.4141, + "step": 189 + }, + { + "Batch Mean": 0.12293338775634766, + "accuracy": 0.84375, + "epoch": 0.4725, + "step": 189 + }, + { + "epoch": 0.475, + "grad_norm": 8.671295166015625, + "learning_rate": 1.6578947368421056e-06, + "loss": 0.3752, + "step": 190 + }, + { + "Batch Mean": 0.47929099202156067, + "accuracy": 0.8203125, + "epoch": 0.475, + "step": 190 + }, + { + "epoch": 0.4775, + "grad_norm": 11.653754234313965, + "learning_rate": 1.65e-06, + "loss": 0.4854, + "step": 191 + }, + { + "Batch Mean": 0.49117136001586914, + "accuracy": 0.84375, + "epoch": 0.4775, + "step": 191 + }, + { + "epoch": 0.48, + "grad_norm": 8.04267406463623, + "learning_rate": 1.6421052631578948e-06, + "loss": 0.3098, + "step": 192 + }, + { + "Batch Mean": 0.5062227249145508, + "accuracy": 0.8828125, + "epoch": 0.48, + "step": 192 + }, + { + "epoch": 0.4825, + "grad_norm": 8.732284545898438, + "learning_rate": 1.6342105263157896e-06, + "loss": 0.3193, + "step": 193 + }, + { + "Batch Mean": -0.3749980926513672, + "accuracy": 0.859375, + "epoch": 0.4825, + "step": 193 + }, + { + "epoch": 0.485, + "grad_norm": 8.27978515625, + "learning_rate": 1.6263157894736843e-06, + "loss": 0.337, + "step": 194 + }, + { + "Batch Mean": -0.6132583618164062, + "accuracy": 0.765625, + "epoch": 0.485, + "step": 194 + }, + { + "epoch": 0.4875, + "grad_norm": 9.447763442993164, + "learning_rate": 1.618421052631579e-06, + "loss": 0.4433, + "step": 195 + }, + { + "Batch Mean": -0.452798068523407, + "accuracy": 0.859375, + "epoch": 0.4875, + "step": 195 + }, + { + "epoch": 0.49, + "grad_norm": 8.492661476135254, + "learning_rate": 1.6105263157894738e-06, + "loss": 0.3995, + "step": 196 + }, + { + "Batch Mean": -0.3771591782569885, + "accuracy": 0.828125, + "epoch": 0.49, + "step": 196 + }, + { + "epoch": 0.4925, + "grad_norm": 8.175331115722656, + "learning_rate": 1.6026315789473683e-06, + "loss": 0.3677, + "step": 197 + }, + { + "Batch Mean": 0.2654750347137451, + "accuracy": 0.84375, + "epoch": 0.4925, + "step": 197 + }, + { + "epoch": 0.495, + "grad_norm": 8.926229476928711, + "learning_rate": 1.594736842105263e-06, + "loss": 0.3555, + "step": 198 + }, + { + "Batch Mean": 0.09074139595031738, + "accuracy": 0.765625, + "epoch": 0.495, + "step": 198 + }, + { + "epoch": 0.4975, + "grad_norm": 10.059514999389648, + "learning_rate": 1.5868421052631578e-06, + "loss": 0.4346, + "step": 199 + }, + { + "Batch Mean": 0.15778541564941406, + "accuracy": 0.7578125, + "epoch": 0.4975, + "step": 199 + }, + { + "epoch": 0.5, + "grad_norm": 8.965482711791992, + "learning_rate": 1.5789473684210526e-06, + "loss": 0.4118, + "step": 200 + }, + { + "Batch Mean": 0.4453287124633789, + "accuracy": 0.8359375, + "epoch": 0.5, + "step": 200 + }, + { + "epoch": 0.5025, + "grad_norm": 9.427848815917969, + "learning_rate": 1.5710526315789473e-06, + "loss": 0.4052, + "step": 201 + }, + { + "Batch Mean": 0.006826639175415039, + "accuracy": 0.828125, + "epoch": 0.5025, + "step": 201 + }, + { + "epoch": 0.505, + "grad_norm": 8.063475608825684, + "learning_rate": 1.563157894736842e-06, + "loss": 0.3993, + "step": 202 + }, + { + "Batch Mean": 0.09803962707519531, + "accuracy": 0.90625, + "epoch": 0.505, + "step": 202 + }, + { + "epoch": 0.5075, + "grad_norm": 7.630380153656006, + "learning_rate": 1.5552631578947368e-06, + "loss": 0.3072, + "step": 203 + }, + { + "Batch Mean": -0.07749080657958984, + "accuracy": 0.765625, + "epoch": 0.5075, + "step": 203 + }, + { + "epoch": 0.51, + "grad_norm": 9.845494270324707, + "learning_rate": 1.5473684210526318e-06, + "loss": 0.4411, + "step": 204 + }, + { + "Batch Mean": -0.6465473175048828, + "accuracy": 0.8046875, + "epoch": 0.51, + "step": 204 + }, + { + "epoch": 0.5125, + "grad_norm": 9.862820625305176, + "learning_rate": 1.5394736842105265e-06, + "loss": 0.4042, + "step": 205 + }, + { + "Batch Mean": -0.25893068313598633, + "accuracy": 0.875, + "epoch": 0.5125, + "step": 205 + }, + { + "epoch": 0.515, + "grad_norm": 9.104493141174316, + "learning_rate": 1.5315789473684213e-06, + "loss": 0.3414, + "step": 206 + }, + { + "Batch Mean": -0.07256972789764404, + "accuracy": 0.78125, + "epoch": 0.515, + "step": 206 + }, + { + "epoch": 0.5175, + "grad_norm": 10.120800971984863, + "learning_rate": 1.5236842105263158e-06, + "loss": 0.3988, + "step": 207 + }, + { + "Batch Mean": 0.04418504238128662, + "accuracy": 0.8828125, + "epoch": 0.5175, + "step": 207 + }, + { + "epoch": 0.52, + "grad_norm": 7.554128646850586, + "learning_rate": 1.5157894736842105e-06, + "loss": 0.3051, + "step": 208 + }, + { + "Batch Mean": 0.6764199733734131, + "accuracy": 0.8515625, + "epoch": 0.52, + "step": 208 + }, + { + "epoch": 0.5225, + "grad_norm": 10.835310935974121, + "learning_rate": 1.5078947368421053e-06, + "loss": 0.3987, + "step": 209 + }, + { + "Batch Mean": 0.7588990926742554, + "accuracy": 0.78125, + "epoch": 0.5225, + "step": 209 + }, + { + "epoch": 0.525, + "grad_norm": 11.688884735107422, + "learning_rate": 1.5e-06, + "loss": 0.4125, + "step": 210 + }, + { + "Batch Mean": 0.1539628505706787, + "accuracy": 0.78125, + "epoch": 0.525, + "step": 210 + }, + { + "epoch": 0.5275, + "grad_norm": 10.703190803527832, + "learning_rate": 1.4921052631578948e-06, + "loss": 0.4351, + "step": 211 + }, + { + "Batch Mean": 0.1315913200378418, + "accuracy": 0.828125, + "epoch": 0.5275, + "step": 211 + }, + { + "epoch": 0.53, + "grad_norm": 10.159830093383789, + "learning_rate": 1.4842105263157895e-06, + "loss": 0.4016, + "step": 212 + }, + { + "Batch Mean": -0.20069313049316406, + "accuracy": 0.8671875, + "epoch": 0.53, + "step": 212 + }, + { + "epoch": 0.5325, + "grad_norm": 9.962963104248047, + "learning_rate": 1.4763157894736843e-06, + "loss": 0.3707, + "step": 213 + }, + { + "Batch Mean": -0.14012432098388672, + "accuracy": 0.8046875, + "epoch": 0.5325, + "step": 213 + }, + { + "epoch": 0.535, + "grad_norm": 10.636829376220703, + "learning_rate": 1.468421052631579e-06, + "loss": 0.4394, + "step": 214 + }, + { + "Batch Mean": 0.03688657283782959, + "accuracy": 0.8515625, + "epoch": 0.535, + "step": 214 + }, + { + "epoch": 0.5375, + "grad_norm": 9.342628479003906, + "learning_rate": 1.4605263157894738e-06, + "loss": 0.3592, + "step": 215 + }, + { + "Batch Mean": -0.2712291479110718, + "accuracy": 0.875, + "epoch": 0.5375, + "step": 215 + }, + { + "epoch": 0.54, + "grad_norm": 8.388570785522461, + "learning_rate": 1.4526315789473685e-06, + "loss": 0.3113, + "step": 216 + }, + { + "Batch Mean": -0.07314193248748779, + "accuracy": 0.8203125, + "epoch": 0.54, + "step": 216 + }, + { + "epoch": 0.5425, + "grad_norm": 9.443964958190918, + "learning_rate": 1.4447368421052633e-06, + "loss": 0.4019, + "step": 217 + }, + { + "Batch Mean": -0.042542457580566406, + "accuracy": 0.8203125, + "epoch": 0.5425, + "step": 217 + }, + { + "epoch": 0.545, + "grad_norm": 10.919589042663574, + "learning_rate": 1.436842105263158e-06, + "loss": 0.3703, + "step": 218 + }, + { + "Batch Mean": -0.03744637966156006, + "accuracy": 0.828125, + "epoch": 0.545, + "step": 218 + }, + { + "epoch": 0.5475, + "grad_norm": 9.716089248657227, + "learning_rate": 1.4289473684210525e-06, + "loss": 0.3527, + "step": 219 + }, + { + "Batch Mean": -0.209802508354187, + "accuracy": 0.8046875, + "epoch": 0.5475, + "step": 219 + }, + { + "epoch": 0.55, + "grad_norm": 10.598649024963379, + "learning_rate": 1.4210526315789473e-06, + "loss": 0.4273, + "step": 220 + }, + { + "Batch Mean": 0.34712672233581543, + "accuracy": 0.828125, + "epoch": 0.55, + "step": 220 + }, + { + "epoch": 0.5525, + "grad_norm": 9.486217498779297, + "learning_rate": 1.4131578947368422e-06, + "loss": 0.341, + "step": 221 + }, + { + "Batch Mean": 0.03266429901123047, + "accuracy": 0.8203125, + "epoch": 0.5525, + "step": 221 + }, + { + "epoch": 0.555, + "grad_norm": 10.724418640136719, + "learning_rate": 1.405263157894737e-06, + "loss": 0.3997, + "step": 222 + }, + { + "Batch Mean": -0.18854069709777832, + "accuracy": 0.828125, + "epoch": 0.555, + "step": 222 + }, + { + "epoch": 0.5575, + "grad_norm": 9.75826358795166, + "learning_rate": 1.3973684210526317e-06, + "loss": 0.3694, + "step": 223 + }, + { + "Batch Mean": -0.41150808334350586, + "accuracy": 0.859375, + "epoch": 0.5575, + "step": 223 + }, + { + "epoch": 0.56, + "grad_norm": 9.46369743347168, + "learning_rate": 1.3894736842105263e-06, + "loss": 0.3251, + "step": 224 + }, + { + "Batch Mean": -0.0464327335357666, + "accuracy": 0.8203125, + "epoch": 0.56, + "step": 224 + }, + { + "epoch": 0.5625, + "grad_norm": 10.088607788085938, + "learning_rate": 1.381578947368421e-06, + "loss": 0.4025, + "step": 225 + }, + { + "Batch Mean": 0.4004335403442383, + "accuracy": 0.7890625, + "epoch": 0.5625, + "step": 225 + }, + { + "epoch": 0.565, + "grad_norm": 12.501627922058105, + "learning_rate": 1.3736842105263158e-06, + "loss": 0.4476, + "step": 226 + }, + { + "Batch Mean": 0.3384718894958496, + "accuracy": 0.7734375, + "epoch": 0.565, + "step": 226 + }, + { + "epoch": 0.5675, + "grad_norm": 11.507113456726074, + "learning_rate": 1.3657894736842107e-06, + "loss": 0.4509, + "step": 227 + }, + { + "Batch Mean": 0.10722827911376953, + "accuracy": 0.8125, + "epoch": 0.5675, + "step": 227 + }, + { + "epoch": 0.57, + "grad_norm": 8.944846153259277, + "learning_rate": 1.3578947368421055e-06, + "loss": 0.4159, + "step": 228 + }, + { + "Batch Mean": -0.30047035217285156, + "accuracy": 0.8359375, + "epoch": 0.57, + "step": 228 + }, + { + "epoch": 0.5725, + "grad_norm": 8.207252502441406, + "learning_rate": 1.35e-06, + "loss": 0.3531, + "step": 229 + }, + { + "Batch Mean": -0.2651713490486145, + "accuracy": 0.8515625, + "epoch": 0.5725, + "step": 229 + }, + { + "epoch": 0.575, + "grad_norm": 8.584181785583496, + "learning_rate": 1.3421052631578947e-06, + "loss": 0.363, + "step": 230 + }, + { + "Batch Mean": -0.44047990441322327, + "accuracy": 0.8203125, + "epoch": 0.575, + "step": 230 + }, + { + "epoch": 0.5775, + "grad_norm": 9.892334938049316, + "learning_rate": 1.3342105263157895e-06, + "loss": 0.3964, + "step": 231 + }, + { + "Batch Mean": -0.1500718593597412, + "accuracy": 0.7890625, + "epoch": 0.5775, + "step": 231 + }, + { + "epoch": 0.58, + "grad_norm": 9.095638275146484, + "learning_rate": 1.3263157894736842e-06, + "loss": 0.3979, + "step": 232 + }, + { + "Batch Mean": 0.2158275842666626, + "accuracy": 0.7578125, + "epoch": 0.58, + "step": 232 + }, + { + "epoch": 0.5825, + "grad_norm": 11.318427085876465, + "learning_rate": 1.318421052631579e-06, + "loss": 0.476, + "step": 233 + }, + { + "Batch Mean": 0.25293970108032227, + "accuracy": 0.8125, + "epoch": 0.5825, + "step": 233 + }, + { + "epoch": 0.585, + "grad_norm": 8.678732872009277, + "learning_rate": 1.3105263157894737e-06, + "loss": 0.4024, + "step": 234 + }, + { + "Batch Mean": 0.30934906005859375, + "accuracy": 0.8671875, + "epoch": 0.585, + "step": 234 + }, + { + "epoch": 0.5875, + "grad_norm": 7.908719062805176, + "learning_rate": 1.3026315789473685e-06, + "loss": 0.3039, + "step": 235 + }, + { + "Batch Mean": -0.057540178298950195, + "accuracy": 0.8515625, + "epoch": 0.5875, + "step": 235 + }, + { + "epoch": 0.59, + "grad_norm": 7.79960823059082, + "learning_rate": 1.2947368421052632e-06, + "loss": 0.3345, + "step": 236 + }, + { + "Batch Mean": -0.08140373229980469, + "accuracy": 0.796875, + "epoch": 0.59, + "step": 236 + }, + { + "epoch": 0.5925, + "grad_norm": 9.922690391540527, + "learning_rate": 1.286842105263158e-06, + "loss": 0.4089, + "step": 237 + }, + { + "Batch Mean": -0.31038880348205566, + "accuracy": 0.8203125, + "epoch": 0.5925, + "step": 237 + }, + { + "epoch": 0.595, + "grad_norm": 9.403748512268066, + "learning_rate": 1.2789473684210527e-06, + "loss": 0.392, + "step": 238 + }, + { + "Batch Mean": -0.025382041931152344, + "accuracy": 0.828125, + "epoch": 0.595, + "step": 238 + }, + { + "epoch": 0.5975, + "grad_norm": 9.586471557617188, + "learning_rate": 1.2710526315789474e-06, + "loss": 0.3332, + "step": 239 + }, + { + "Batch Mean": -0.09691180288791656, + "accuracy": 0.890625, + "epoch": 0.5975, + "step": 239 + }, + { + "epoch": 0.6, + "grad_norm": 8.952404975891113, + "learning_rate": 1.263157894736842e-06, + "loss": 0.2919, + "step": 240 + }, + { + "Batch Mean": 0.24474626779556274, + "accuracy": 0.796875, + "epoch": 0.6, + "step": 240 + }, + { + "epoch": 0.6025, + "grad_norm": 9.637041091918945, + "learning_rate": 1.255263157894737e-06, + "loss": 0.3659, + "step": 241 + }, + { + "Batch Mean": 0.3834969997406006, + "accuracy": 0.84375, + "epoch": 0.6025, + "step": 241 + }, + { + "epoch": 0.605, + "grad_norm": 10.450284957885742, + "learning_rate": 1.2473684210526317e-06, + "loss": 0.3735, + "step": 242 + }, + { + "Batch Mean": 0.41518211364746094, + "accuracy": 0.828125, + "epoch": 0.605, + "step": 242 + }, + { + "epoch": 0.6075, + "grad_norm": 13.025629043579102, + "learning_rate": 1.2394736842105264e-06, + "loss": 0.3474, + "step": 243 + }, + { + "Batch Mean": 0.43804705142974854, + "accuracy": 0.8125, + "epoch": 0.6075, + "step": 243 + }, + { + "epoch": 0.61, + "grad_norm": 11.754925727844238, + "learning_rate": 1.2315789473684212e-06, + "loss": 0.4055, + "step": 244 + }, + { + "Batch Mean": 0.26791906356811523, + "accuracy": 0.8125, + "epoch": 0.61, + "step": 244 + }, + { + "epoch": 0.6125, + "grad_norm": 8.931976318359375, + "learning_rate": 1.2236842105263157e-06, + "loss": 0.3545, + "step": 245 + }, + { + "Batch Mean": -0.04610109329223633, + "accuracy": 0.8203125, + "epoch": 0.6125, + "step": 245 + }, + { + "epoch": 0.615, + "grad_norm": 10.400755882263184, + "learning_rate": 1.2157894736842105e-06, + "loss": 0.4299, + "step": 246 + }, + { + "Batch Mean": -0.7998272180557251, + "accuracy": 0.8203125, + "epoch": 0.615, + "step": 246 + }, + { + "epoch": 0.6175, + "grad_norm": 11.944729804992676, + "learning_rate": 1.2078947368421052e-06, + "loss": 0.4358, + "step": 247 + }, + { + "Batch Mean": -0.2411341667175293, + "accuracy": 0.828125, + "epoch": 0.6175, + "step": 247 + }, + { + "epoch": 0.62, + "grad_norm": 10.680310249328613, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.3575, + "step": 248 + }, + { + "Batch Mean": -0.47270846366882324, + "accuracy": 0.859375, + "epoch": 0.62, + "step": 248 + }, + { + "epoch": 0.6225, + "grad_norm": 9.311689376831055, + "learning_rate": 1.192105263157895e-06, + "loss": 0.3146, + "step": 249 + }, + { + "Batch Mean": -0.023276805877685547, + "accuracy": 0.8203125, + "epoch": 0.6225, + "step": 249 + }, + { + "epoch": 0.625, + "grad_norm": 10.916861534118652, + "learning_rate": 1.1842105263157894e-06, + "loss": 0.3885, + "step": 250 + }, + { + "Batch Mean": -0.2923322916030884, + "accuracy": 0.828125, + "epoch": 0.625, + "step": 250 + }, + { + "epoch": 0.6275, + "grad_norm": 9.724871635437012, + "learning_rate": 1.1763157894736842e-06, + "loss": 0.3544, + "step": 251 + }, + { + "Batch Mean": 0.03692519664764404, + "accuracy": 0.828125, + "epoch": 0.6275, + "step": 251 + }, + { + "epoch": 0.63, + "grad_norm": 9.2437744140625, + "learning_rate": 1.168421052631579e-06, + "loss": 0.3548, + "step": 252 + }, + { + "Batch Mean": 0.9230005741119385, + "accuracy": 0.8515625, + "epoch": 0.63, + "step": 252 + }, + { + "epoch": 0.6325, + "grad_norm": 14.65259075164795, + "learning_rate": 1.1605263157894737e-06, + "loss": 0.4183, + "step": 253 + }, + { + "Batch Mean": 0.45890212059020996, + "accuracy": 0.828125, + "epoch": 0.6325, + "step": 253 + }, + { + "epoch": 0.635, + "grad_norm": 11.038723945617676, + "learning_rate": 1.1526315789473684e-06, + "loss": 0.4122, + "step": 254 + }, + { + "Batch Mean": 0.38666415214538574, + "accuracy": 0.828125, + "epoch": 0.635, + "step": 254 + }, + { + "epoch": 0.6375, + "grad_norm": 11.32805061340332, + "learning_rate": 1.1447368421052632e-06, + "loss": 0.3961, + "step": 255 + }, + { + "Batch Mean": 0.4147815704345703, + "accuracy": 0.828125, + "epoch": 0.6375, + "step": 255 + }, + { + "epoch": 0.64, + "grad_norm": 9.602386474609375, + "learning_rate": 1.136842105263158e-06, + "loss": 0.3741, + "step": 256 + }, + { + "Batch Mean": 0.006276130676269531, + "accuracy": 0.78125, + "epoch": 0.64, + "step": 256 + }, + { + "epoch": 0.6425, + "grad_norm": 9.531057357788086, + "learning_rate": 1.1289473684210527e-06, + "loss": 0.4329, + "step": 257 + }, + { + "Batch Mean": -0.23307615518569946, + "accuracy": 0.828125, + "epoch": 0.6425, + "step": 257 + }, + { + "epoch": 0.645, + "grad_norm": 8.362524032592773, + "learning_rate": 1.1210526315789474e-06, + "loss": 0.382, + "step": 258 + }, + { + "Batch Mean": 0.04139423370361328, + "accuracy": 0.796875, + "epoch": 0.645, + "step": 258 + }, + { + "epoch": 0.6475, + "grad_norm": 9.12584114074707, + "learning_rate": 1.1131578947368421e-06, + "loss": 0.3852, + "step": 259 + }, + { + "Batch Mean": -0.3190183639526367, + "accuracy": 0.8046875, + "epoch": 0.6475, + "step": 259 + }, + { + "epoch": 0.65, + "grad_norm": 9.95345401763916, + "learning_rate": 1.1052631578947369e-06, + "loss": 0.3966, + "step": 260 + }, + { + "Batch Mean": -0.8157808780670166, + "accuracy": 0.828125, + "epoch": 0.65, + "step": 260 + }, + { + "epoch": 0.6525, + "grad_norm": 10.541975021362305, + "learning_rate": 1.0973684210526316e-06, + "loss": 0.367, + "step": 261 + }, + { + "Batch Mean": -0.5502346754074097, + "accuracy": 0.796875, + "epoch": 0.6525, + "step": 261 + }, + { + "epoch": 0.655, + "grad_norm": 10.180998802185059, + "learning_rate": 1.0894736842105264e-06, + "loss": 0.4012, + "step": 262 + }, + { + "Batch Mean": -0.31228208541870117, + "accuracy": 0.796875, + "epoch": 0.655, + "step": 262 + }, + { + "epoch": 0.6575, + "grad_norm": 8.454268455505371, + "learning_rate": 1.0815789473684211e-06, + "loss": 0.3794, + "step": 263 + }, + { + "Batch Mean": 0.023279190063476562, + "accuracy": 0.8359375, + "epoch": 0.6575, + "step": 263 + }, + { + "epoch": 0.66, + "grad_norm": 8.542173385620117, + "learning_rate": 1.0736842105263159e-06, + "loss": 0.3667, + "step": 264 + }, + { + "Batch Mean": 0.25201869010925293, + "accuracy": 0.8984375, + "epoch": 0.66, + "step": 264 + }, + { + "epoch": 0.6625, + "grad_norm": 9.752306938171387, + "learning_rate": 1.0657894736842106e-06, + "loss": 0.2916, + "step": 265 + }, + { + "Batch Mean": 0.21469709277153015, + "accuracy": 0.890625, + "epoch": 0.6625, + "step": 265 + }, + { + "epoch": 0.665, + "grad_norm": 6.977869510650635, + "learning_rate": 1.0578947368421052e-06, + "loss": 0.2659, + "step": 266 + }, + { + "Batch Mean": 0.393679141998291, + "accuracy": 0.8515625, + "epoch": 0.665, + "step": 266 + }, + { + "epoch": 0.6675, + "grad_norm": 9.701105117797852, + "learning_rate": 1.05e-06, + "loss": 0.3567, + "step": 267 + }, + { + "Batch Mean": 0.6953978538513184, + "accuracy": 0.8046875, + "epoch": 0.6675, + "step": 267 + }, + { + "epoch": 0.67, + "grad_norm": 10.121991157531738, + "learning_rate": 1.0421052631578949e-06, + "loss": 0.404, + "step": 268 + }, + { + "Batch Mean": 0.3401451110839844, + "accuracy": 0.8671875, + "epoch": 0.67, + "step": 268 + }, + { + "epoch": 0.6725, + "grad_norm": 8.189865112304688, + "learning_rate": 1.0342105263157896e-06, + "loss": 0.3378, + "step": 269 + }, + { + "Batch Mean": 0.2478356957435608, + "accuracy": 0.75, + "epoch": 0.6725, + "step": 269 + }, + { + "epoch": 0.675, + "grad_norm": 10.778964042663574, + "learning_rate": 1.0263157894736843e-06, + "loss": 0.4703, + "step": 270 + }, + { + "Batch Mean": -0.22268009185791016, + "accuracy": 0.8046875, + "epoch": 0.675, + "step": 270 + }, + { + "epoch": 0.6775, + "grad_norm": 9.94092845916748, + "learning_rate": 1.0184210526315789e-06, + "loss": 0.412, + "step": 271 + }, + { + "Batch Mean": -0.19172096252441406, + "accuracy": 0.8671875, + "epoch": 0.6775, + "step": 271 + }, + { + "epoch": 0.68, + "grad_norm": 7.948995113372803, + "learning_rate": 1.0105263157894736e-06, + "loss": 0.3079, + "step": 272 + }, + { + "Batch Mean": -0.01717686653137207, + "accuracy": 0.84375, + "epoch": 0.68, + "step": 272 + }, + { + "epoch": 0.6825, + "grad_norm": 8.31904125213623, + "learning_rate": 1.0026315789473684e-06, + "loss": 0.3627, + "step": 273 + }, + { + "Batch Mean": -0.17321079969406128, + "accuracy": 0.8203125, + "epoch": 0.6825, + "step": 273 + }, + { + "epoch": 0.685, + "grad_norm": 9.820837020874023, + "learning_rate": 9.947368421052631e-07, + "loss": 0.4136, + "step": 274 + }, + { + "Batch Mean": -0.1444711685180664, + "accuracy": 0.7421875, + "epoch": 0.685, + "step": 274 + }, + { + "epoch": 0.6875, + "grad_norm": 10.36066722869873, + "learning_rate": 9.86842105263158e-07, + "loss": 0.4474, + "step": 275 + }, + { + "Batch Mean": -0.26374632120132446, + "accuracy": 0.796875, + "epoch": 0.6875, + "step": 275 + }, + { + "epoch": 0.69, + "grad_norm": 9.943070411682129, + "learning_rate": 9.789473684210526e-07, + "loss": 0.3974, + "step": 276 + }, + { + "Batch Mean": 0.056405067443847656, + "accuracy": 0.828125, + "epoch": 0.69, + "step": 276 + }, + { + "epoch": 0.6925, + "grad_norm": 8.994078636169434, + "learning_rate": 9.710526315789474e-07, + "loss": 0.3722, + "step": 277 + }, + { + "Batch Mean": -0.03705739974975586, + "accuracy": 0.84375, + "epoch": 0.6925, + "step": 277 + }, + { + "epoch": 0.695, + "grad_norm": 9.382537841796875, + "learning_rate": 9.63157894736842e-07, + "loss": 0.3764, + "step": 278 + }, + { + "Batch Mean": 0.1647498905658722, + "accuracy": 0.84375, + "epoch": 0.695, + "step": 278 + }, + { + "epoch": 0.6975, + "grad_norm": 8.693201065063477, + "learning_rate": 9.552631578947368e-07, + "loss": 0.3984, + "step": 279 + }, + { + "Batch Mean": -0.195298433303833, + "accuracy": 0.7890625, + "epoch": 0.6975, + "step": 279 + }, + { + "epoch": 0.7, + "grad_norm": 11.22012710571289, + "learning_rate": 9.473684210526316e-07, + "loss": 0.4338, + "step": 280 + }, + { + "Batch Mean": -0.1467556357383728, + "accuracy": 0.84375, + "epoch": 0.7, + "step": 280 + }, + { + "epoch": 0.7025, + "grad_norm": 7.977049827575684, + "learning_rate": 9.394736842105262e-07, + "loss": 0.3062, + "step": 281 + }, + { + "Batch Mean": 0.15868717432022095, + "accuracy": 0.8515625, + "epoch": 0.7025, + "step": 281 + }, + { + "epoch": 0.705, + "grad_norm": 8.255351066589355, + "learning_rate": 9.315789473684212e-07, + "loss": 0.3726, + "step": 282 + }, + { + "Batch Mean": -0.15160489082336426, + "accuracy": 0.8125, + "epoch": 0.705, + "step": 282 + }, + { + "epoch": 0.7075, + "grad_norm": 9.976411819458008, + "learning_rate": 9.236842105263158e-07, + "loss": 0.3723, + "step": 283 + }, + { + "Batch Mean": -0.27870988845825195, + "accuracy": 0.828125, + "epoch": 0.7075, + "step": 283 + }, + { + "epoch": 0.71, + "grad_norm": 8.676006317138672, + "learning_rate": 9.157894736842106e-07, + "loss": 0.3357, + "step": 284 + }, + { + "Batch Mean": 0.25218677520751953, + "accuracy": 0.8046875, + "epoch": 0.71, + "step": 284 + }, + { + "epoch": 0.7125, + "grad_norm": 9.949384689331055, + "learning_rate": 9.078947368421053e-07, + "loss": 0.4071, + "step": 285 + }, + { + "Batch Mean": -0.13117164373397827, + "accuracy": 0.8046875, + "epoch": 0.7125, + "step": 285 + }, + { + "epoch": 0.715, + "grad_norm": 9.771759033203125, + "learning_rate": 9e-07, + "loss": 0.3825, + "step": 286 + }, + { + "Batch Mean": -0.2633533477783203, + "accuracy": 0.828125, + "epoch": 0.715, + "step": 286 + }, + { + "epoch": 0.7175, + "grad_norm": 9.035362243652344, + "learning_rate": 8.921052631578947e-07, + "loss": 0.3266, + "step": 287 + }, + { + "Batch Mean": 0.20660001039505005, + "accuracy": 0.90625, + "epoch": 0.7175, + "step": 287 + }, + { + "epoch": 0.72, + "grad_norm": 8.132331848144531, + "learning_rate": 8.842105263157895e-07, + "loss": 0.2879, + "step": 288 + }, + { + "Batch Mean": 0.07574963569641113, + "accuracy": 0.8828125, + "epoch": 0.72, + "step": 288 + }, + { + "epoch": 0.7225, + "grad_norm": 8.73331356048584, + "learning_rate": 8.763157894736843e-07, + "loss": 0.333, + "step": 289 + }, + { + "Batch Mean": -0.017863690853118896, + "accuracy": 0.78125, + "epoch": 0.7225, + "step": 289 + }, + { + "epoch": 0.725, + "grad_norm": 10.603270530700684, + "learning_rate": 8.68421052631579e-07, + "loss": 0.4039, + "step": 290 + }, + { + "Batch Mean": 0.0752725601196289, + "accuracy": 0.8515625, + "epoch": 0.725, + "step": 290 + }, + { + "epoch": 0.7275, + "grad_norm": 9.06254768371582, + "learning_rate": 8.605263157894737e-07, + "loss": 0.3526, + "step": 291 + }, + { + "Batch Mean": 0.02768230438232422, + "accuracy": 0.8828125, + "epoch": 0.7275, + "step": 291 + }, + { + "epoch": 0.73, + "grad_norm": 8.968415260314941, + "learning_rate": 8.526315789473684e-07, + "loss": 0.291, + "step": 292 + }, + { + "Batch Mean": 0.21530699729919434, + "accuracy": 0.8203125, + "epoch": 0.73, + "step": 292 + }, + { + "epoch": 0.7325, + "grad_norm": 10.839483261108398, + "learning_rate": 8.447368421052632e-07, + "loss": 0.4106, + "step": 293 + }, + { + "Batch Mean": 0.5640954971313477, + "accuracy": 0.84375, + "epoch": 0.7325, + "step": 293 + }, + { + "epoch": 0.735, + "grad_norm": 9.728655815124512, + "learning_rate": 8.368421052631578e-07, + "loss": 0.3442, + "step": 294 + }, + { + "Batch Mean": -0.018077611923217773, + "accuracy": 0.859375, + "epoch": 0.735, + "step": 294 + }, + { + "epoch": 0.7375, + "grad_norm": 10.114471435546875, + "learning_rate": 8.289473684210528e-07, + "loss": 0.3098, + "step": 295 + }, + { + "Batch Mean": 0.09554731845855713, + "accuracy": 0.8671875, + "epoch": 0.7375, + "step": 295 + }, + { + "epoch": 0.74, + "grad_norm": 9.599811553955078, + "learning_rate": 8.210526315789474e-07, + "loss": 0.356, + "step": 296 + }, + { + "Batch Mean": 0.12642377614974976, + "accuracy": 0.8359375, + "epoch": 0.74, + "step": 296 + }, + { + "epoch": 0.7425, + "grad_norm": 9.411256790161133, + "learning_rate": 8.131578947368422e-07, + "loss": 0.3861, + "step": 297 + }, + { + "Batch Mean": -0.05638265609741211, + "accuracy": 0.828125, + "epoch": 0.7425, + "step": 297 + }, + { + "epoch": 0.745, + "grad_norm": 9.531231880187988, + "learning_rate": 8.052631578947369e-07, + "loss": 0.3093, + "step": 298 + }, + { + "Batch Mean": 0.13509082794189453, + "accuracy": 0.8671875, + "epoch": 0.745, + "step": 298 + }, + { + "epoch": 0.7475, + "grad_norm": 8.823690414428711, + "learning_rate": 7.973684210526315e-07, + "loss": 0.3221, + "step": 299 + }, + { + "Batch Mean": -0.17625296115875244, + "accuracy": 0.890625, + "epoch": 0.7475, + "step": 299 + }, + { + "epoch": 0.75, + "grad_norm": 8.370561599731445, + "learning_rate": 7.894736842105263e-07, + "loss": 0.2848, + "step": 300 + }, + { + "Batch Mean": -0.3022923469543457, + "accuracy": 0.8203125, + "epoch": 0.75, + "step": 300 + }, + { + "epoch": 0.7525, + "grad_norm": 10.149959564208984, + "learning_rate": 7.81578947368421e-07, + "loss": 0.4057, + "step": 301 + }, + { + "Batch Mean": -0.12894195318222046, + "accuracy": 0.8671875, + "epoch": 0.7525, + "step": 301 + }, + { + "epoch": 0.755, + "grad_norm": 8.99760913848877, + "learning_rate": 7.736842105263159e-07, + "loss": 0.2967, + "step": 302 + }, + { + "Batch Mean": -0.034453392028808594, + "accuracy": 0.8359375, + "epoch": 0.755, + "step": 302 + }, + { + "epoch": 0.7575, + "grad_norm": 9.568157196044922, + "learning_rate": 7.657894736842106e-07, + "loss": 0.3744, + "step": 303 + }, + { + "Batch Mean": -0.12914586067199707, + "accuracy": 0.8671875, + "epoch": 0.7575, + "step": 303 + }, + { + "epoch": 0.76, + "grad_norm": 8.443596839904785, + "learning_rate": 7.578947368421053e-07, + "loss": 0.2851, + "step": 304 + }, + { + "Batch Mean": 0.4375237822532654, + "accuracy": 0.90625, + "epoch": 0.76, + "step": 304 + }, + { + "epoch": 0.7625, + "grad_norm": 9.44564151763916, + "learning_rate": 7.5e-07, + "loss": 0.2535, + "step": 305 + }, + { + "Batch Mean": 0.14234724640846252, + "accuracy": 0.828125, + "epoch": 0.7625, + "step": 305 + }, + { + "epoch": 0.765, + "grad_norm": 11.979601860046387, + "learning_rate": 7.421052631578948e-07, + "loss": 0.3964, + "step": 306 + }, + { + "Batch Mean": -0.2668642997741699, + "accuracy": 0.859375, + "epoch": 0.765, + "step": 306 + }, + { + "epoch": 0.7675, + "grad_norm": 9.393048286437988, + "learning_rate": 7.342105263157895e-07, + "loss": 0.3521, + "step": 307 + }, + { + "Batch Mean": 0.2523055076599121, + "accuracy": 0.828125, + "epoch": 0.7675, + "step": 307 + }, + { + "epoch": 0.77, + "grad_norm": 12.084721565246582, + "learning_rate": 7.263157894736843e-07, + "loss": 0.363, + "step": 308 + }, + { + "Batch Mean": -0.3096902370452881, + "accuracy": 0.859375, + "epoch": 0.77, + "step": 308 + }, + { + "epoch": 0.7725, + "grad_norm": 9.678906440734863, + "learning_rate": 7.18421052631579e-07, + "loss": 0.3527, + "step": 309 + }, + { + "Batch Mean": -0.16327643394470215, + "accuracy": 0.796875, + "epoch": 0.7725, + "step": 309 + }, + { + "epoch": 0.775, + "grad_norm": 10.697284698486328, + "learning_rate": 7.105263157894736e-07, + "loss": 0.3921, + "step": 310 + }, + { + "Batch Mean": -0.20280182361602783, + "accuracy": 0.859375, + "epoch": 0.775, + "step": 310 + }, + { + "epoch": 0.7775, + "grad_norm": 10.731630325317383, + "learning_rate": 7.026315789473685e-07, + "loss": 0.3758, + "step": 311 + }, + { + "Batch Mean": 0.16492557525634766, + "accuracy": 0.859375, + "epoch": 0.7775, + "step": 311 + }, + { + "epoch": 0.78, + "grad_norm": 9.216254234313965, + "learning_rate": 6.947368421052631e-07, + "loss": 0.3501, + "step": 312 + }, + { + "Batch Mean": -0.5113391876220703, + "accuracy": 0.875, + "epoch": 0.78, + "step": 312 + }, + { + "epoch": 0.7825, + "grad_norm": 9.918815612792969, + "learning_rate": 6.868421052631579e-07, + "loss": 0.3294, + "step": 313 + }, + { + "Batch Mean": -0.0627584457397461, + "accuracy": 0.8203125, + "epoch": 0.7825, + "step": 313 + }, + { + "epoch": 0.785, + "grad_norm": 10.955821990966797, + "learning_rate": 6.789473684210527e-07, + "loss": 0.3523, + "step": 314 + }, + { + "Batch Mean": -0.4180750846862793, + "accuracy": 0.7890625, + "epoch": 0.785, + "step": 314 + }, + { + "epoch": 0.7875, + "grad_norm": 10.963501930236816, + "learning_rate": 6.710526315789474e-07, + "loss": 0.4507, + "step": 315 + }, + { + "Batch Mean": -0.10809827595949173, + "accuracy": 0.8203125, + "epoch": 0.7875, + "step": 315 + }, + { + "epoch": 0.79, + "grad_norm": 9.842696189880371, + "learning_rate": 6.631578947368421e-07, + "loss": 0.3107, + "step": 316 + }, + { + "Batch Mean": -0.07777953147888184, + "accuracy": 0.875, + "epoch": 0.79, + "step": 316 + }, + { + "epoch": 0.7925, + "grad_norm": 8.503801345825195, + "learning_rate": 6.552631578947369e-07, + "loss": 0.3231, + "step": 317 + }, + { + "Batch Mean": 0.34143921732902527, + "accuracy": 0.84375, + "epoch": 0.7925, + "step": 317 + }, + { + "epoch": 0.795, + "grad_norm": 10.606958389282227, + "learning_rate": 6.473684210526316e-07, + "loss": 0.3549, + "step": 318 + }, + { + "Batch Mean": 0.1322188377380371, + "accuracy": 0.8984375, + "epoch": 0.795, + "step": 318 + }, + { + "epoch": 0.7975, + "grad_norm": 8.602923393249512, + "learning_rate": 6.394736842105264e-07, + "loss": 0.309, + "step": 319 + }, + { + "Batch Mean": 0.27764415740966797, + "accuracy": 0.796875, + "epoch": 0.7975, + "step": 319 + }, + { + "epoch": 0.8, + "grad_norm": 10.285405158996582, + "learning_rate": 6.31578947368421e-07, + "loss": 0.367, + "step": 320 + }, + { + "Batch Mean": 0.512723445892334, + "accuracy": 0.796875, + "epoch": 0.8, + "step": 320 + }, + { + "epoch": 0.8025, + "grad_norm": 12.33840274810791, + "learning_rate": 6.236842105263158e-07, + "loss": 0.4105, + "step": 321 + }, + { + "Batch Mean": 0.5196051597595215, + "accuracy": 0.8828125, + "epoch": 0.8025, + "step": 321 + }, + { + "epoch": 0.805, + "grad_norm": 12.079765319824219, + "learning_rate": 6.157894736842106e-07, + "loss": 0.3555, + "step": 322 + }, + { + "Batch Mean": -0.3112964630126953, + "accuracy": 0.796875, + "epoch": 0.805, + "step": 322 + }, + { + "epoch": 0.8075, + "grad_norm": 11.693730354309082, + "learning_rate": 6.078947368421052e-07, + "loss": 0.4492, + "step": 323 + }, + { + "Batch Mean": -0.11909055709838867, + "accuracy": 0.84375, + "epoch": 0.8075, + "step": 323 + }, + { + "epoch": 0.81, + "grad_norm": 10.58245849609375, + "learning_rate": 6.000000000000001e-07, + "loss": 0.3541, + "step": 324 + }, + { + "Batch Mean": -0.04676675796508789, + "accuracy": 0.8359375, + "epoch": 0.81, + "step": 324 + }, + { + "epoch": 0.8125, + "grad_norm": 10.23089599609375, + "learning_rate": 5.921052631578947e-07, + "loss": 0.3618, + "step": 325 + }, + { + "Batch Mean": -0.34575414657592773, + "accuracy": 0.8203125, + "epoch": 0.8125, + "step": 325 + }, + { + "epoch": 0.815, + "grad_norm": 10.796978950500488, + "learning_rate": 5.842105263157895e-07, + "loss": 0.3415, + "step": 326 + }, + { + "Batch Mean": -0.18766283988952637, + "accuracy": 0.828125, + "epoch": 0.815, + "step": 326 + }, + { + "epoch": 0.8175, + "grad_norm": 10.497952461242676, + "learning_rate": 5.763157894736842e-07, + "loss": 0.3881, + "step": 327 + }, + { + "Batch Mean": -0.18593168258666992, + "accuracy": 0.8671875, + "epoch": 0.8175, + "step": 327 + }, + { + "epoch": 0.82, + "grad_norm": 9.840806007385254, + "learning_rate": 5.68421052631579e-07, + "loss": 0.3928, + "step": 328 + }, + { + "Batch Mean": -0.23124361038208008, + "accuracy": 0.84375, + "epoch": 0.82, + "step": 328 + }, + { + "epoch": 0.8225, + "grad_norm": 9.90493106842041, + "learning_rate": 5.605263157894737e-07, + "loss": 0.331, + "step": 329 + }, + { + "Batch Mean": -0.0444793701171875, + "accuracy": 0.78125, + "epoch": 0.8225, + "step": 329 + }, + { + "epoch": 0.825, + "grad_norm": 11.78012752532959, + "learning_rate": 5.526315789473684e-07, + "loss": 0.4493, + "step": 330 + }, + { + "Batch Mean": -0.25496864318847656, + "accuracy": 0.8515625, + "epoch": 0.825, + "step": 330 + }, + { + "epoch": 0.8275, + "grad_norm": 10.282852172851562, + "learning_rate": 5.447368421052632e-07, + "loss": 0.3761, + "step": 331 + }, + { + "Batch Mean": 0.3048366904258728, + "accuracy": 0.8828125, + "epoch": 0.8275, + "step": 331 + }, + { + "epoch": 0.83, + "grad_norm": 8.63224983215332, + "learning_rate": 5.368421052631579e-07, + "loss": 0.2974, + "step": 332 + }, + { + "Batch Mean": -0.023471057415008545, + "accuracy": 0.796875, + "epoch": 0.83, + "step": 332 + }, + { + "epoch": 0.8325, + "grad_norm": 10.05276870727539, + "learning_rate": 5.289473684210526e-07, + "loss": 0.4025, + "step": 333 + }, + { + "Batch Mean": 0.2183828353881836, + "accuracy": 0.8125, + "epoch": 0.8325, + "step": 333 + }, + { + "epoch": 0.835, + "grad_norm": 10.247188568115234, + "learning_rate": 5.210526315789474e-07, + "loss": 0.3603, + "step": 334 + }, + { + "Batch Mean": 0.5247657299041748, + "accuracy": 0.8671875, + "epoch": 0.835, + "step": 334 + }, + { + "epoch": 0.8375, + "grad_norm": 9.070343971252441, + "learning_rate": 5.131578947368422e-07, + "loss": 0.3427, + "step": 335 + }, + { + "Batch Mean": 0.20985054969787598, + "accuracy": 0.8828125, + "epoch": 0.8375, + "step": 335 + }, + { + "epoch": 0.84, + "grad_norm": 9.452494621276855, + "learning_rate": 5.052631578947368e-07, + "loss": 0.3282, + "step": 336 + }, + { + "Batch Mean": -0.20295953750610352, + "accuracy": 0.890625, + "epoch": 0.84, + "step": 336 + }, + { + "epoch": 0.8425, + "grad_norm": 8.882509231567383, + "learning_rate": 4.973684210526316e-07, + "loss": 0.2789, + "step": 337 + }, + { + "Batch Mean": -0.06368577480316162, + "accuracy": 0.8125, + "epoch": 0.8425, + "step": 337 + }, + { + "epoch": 0.845, + "grad_norm": 9.023061752319336, + "learning_rate": 4.894736842105263e-07, + "loss": 0.3806, + "step": 338 + }, + { + "Batch Mean": 0.5258607864379883, + "accuracy": 0.7890625, + "epoch": 0.845, + "step": 338 + }, + { + "epoch": 0.8475, + "grad_norm": 10.501274108886719, + "learning_rate": 4.81578947368421e-07, + "loss": 0.3861, + "step": 339 + }, + { + "Batch Mean": 0.05961108207702637, + "accuracy": 0.828125, + "epoch": 0.8475, + "step": 339 + }, + { + "epoch": 0.85, + "grad_norm": 8.876795768737793, + "learning_rate": 4.736842105263158e-07, + "loss": 0.3463, + "step": 340 + }, + { + "Batch Mean": 0.1795494556427002, + "accuracy": 0.8125, + "epoch": 0.85, + "step": 340 + }, + { + "epoch": 0.8525, + "grad_norm": 9.935751914978027, + "learning_rate": 4.657894736842106e-07, + "loss": 0.3716, + "step": 341 + }, + { + "Batch Mean": -0.021910667419433594, + "accuracy": 0.9140625, + "epoch": 0.8525, + "step": 341 + }, + { + "epoch": 0.855, + "grad_norm": 7.700226306915283, + "learning_rate": 4.578947368421053e-07, + "loss": 0.249, + "step": 342 + }, + { + "Batch Mean": -0.15176844596862793, + "accuracy": 0.8828125, + "epoch": 0.855, + "step": 342 + }, + { + "epoch": 0.8575, + "grad_norm": 8.577019691467285, + "learning_rate": 4.5e-07, + "loss": 0.2931, + "step": 343 + }, + { + "Batch Mean": 0.19070473313331604, + "accuracy": 0.8203125, + "epoch": 0.8575, + "step": 343 + }, + { + "epoch": 0.86, + "grad_norm": 8.576455116271973, + "learning_rate": 4.421052631578947e-07, + "loss": 0.3461, + "step": 344 + }, + { + "Batch Mean": 0.49016714096069336, + "accuracy": 0.78125, + "epoch": 0.86, + "step": 344 + }, + { + "epoch": 0.8625, + "grad_norm": 11.025247573852539, + "learning_rate": 4.342105263157895e-07, + "loss": 0.4233, + "step": 345 + }, + { + "Batch Mean": -0.20942234992980957, + "accuracy": 0.875, + "epoch": 0.8625, + "step": 345 + }, + { + "epoch": 0.865, + "grad_norm": 8.730184555053711, + "learning_rate": 4.263157894736842e-07, + "loss": 0.3475, + "step": 346 + }, + { + "Batch Mean": -0.3581113815307617, + "accuracy": 0.84375, + "epoch": 0.865, + "step": 346 + }, + { + "epoch": 0.8675, + "grad_norm": 9.417312622070312, + "learning_rate": 4.184210526315789e-07, + "loss": 0.3565, + "step": 347 + }, + { + "Batch Mean": -0.6340574026107788, + "accuracy": 0.84375, + "epoch": 0.8675, + "step": 347 + }, + { + "epoch": 0.87, + "grad_norm": 10.449695587158203, + "learning_rate": 4.105263157894737e-07, + "loss": 0.3764, + "step": 348 + }, + { + "Batch Mean": -0.21253252029418945, + "accuracy": 0.875, + "epoch": 0.87, + "step": 348 + }, + { + "epoch": 0.8725, + "grad_norm": 9.437952995300293, + "learning_rate": 4.0263157894736845e-07, + "loss": 0.3224, + "step": 349 + }, + { + "Batch Mean": 0.11686336994171143, + "accuracy": 0.8203125, + "epoch": 0.8725, + "step": 349 + }, + { + "epoch": 0.875, + "grad_norm": 10.553494453430176, + "learning_rate": 3.9473684210526315e-07, + "loss": 0.4058, + "step": 350 + }, + { + "Batch Mean": 0.18667292594909668, + "accuracy": 0.8515625, + "epoch": 0.875, + "step": 350 + }, + { + "epoch": 0.8775, + "grad_norm": 10.997474670410156, + "learning_rate": 3.8684210526315794e-07, + "loss": 0.3562, + "step": 351 + }, + { + "Batch Mean": 0.09920608997344971, + "accuracy": 0.8203125, + "epoch": 0.8775, + "step": 351 + }, + { + "epoch": 0.88, + "grad_norm": 9.650683403015137, + "learning_rate": 3.7894736842105264e-07, + "loss": 0.3747, + "step": 352 + }, + { + "Batch Mean": -0.2453855276107788, + "accuracy": 0.8515625, + "epoch": 0.88, + "step": 352 + }, + { + "epoch": 0.8825, + "grad_norm": 9.367212295532227, + "learning_rate": 3.710526315789474e-07, + "loss": 0.3509, + "step": 353 + }, + { + "Batch Mean": -0.14837932586669922, + "accuracy": 0.8515625, + "epoch": 0.8825, + "step": 353 + }, + { + "epoch": 0.885, + "grad_norm": 10.099615097045898, + "learning_rate": 3.6315789473684213e-07, + "loss": 0.3888, + "step": 354 + }, + { + "Batch Mean": 0.28180456161499023, + "accuracy": 0.8515625, + "epoch": 0.885, + "step": 354 + }, + { + "epoch": 0.8875, + "grad_norm": 9.667671203613281, + "learning_rate": 3.552631578947368e-07, + "loss": 0.3854, + "step": 355 + }, + { + "Batch Mean": -0.13739043474197388, + "accuracy": 0.8125, + "epoch": 0.8875, + "step": 355 + }, + { + "epoch": 0.89, + "grad_norm": 10.022262573242188, + "learning_rate": 3.4736842105263157e-07, + "loss": 0.3919, + "step": 356 + }, + { + "Batch Mean": 0.25037017464637756, + "accuracy": 0.8671875, + "epoch": 0.89, + "step": 356 + }, + { + "epoch": 0.8925, + "grad_norm": 8.40639877319336, + "learning_rate": 3.3947368421052636e-07, + "loss": 0.3131, + "step": 357 + }, + { + "Batch Mean": 0.44237828254699707, + "accuracy": 0.8046875, + "epoch": 0.8925, + "step": 357 + }, + { + "epoch": 0.895, + "grad_norm": 10.576713562011719, + "learning_rate": 3.3157894736842106e-07, + "loss": 0.4037, + "step": 358 + }, + { + "Batch Mean": 0.28910255432128906, + "accuracy": 0.8515625, + "epoch": 0.895, + "step": 358 + }, + { + "epoch": 0.8975, + "grad_norm": 9.242695808410645, + "learning_rate": 3.236842105263158e-07, + "loss": 0.3498, + "step": 359 + }, + { + "Batch Mean": -0.020318835973739624, + "accuracy": 0.8515625, + "epoch": 0.8975, + "step": 359 + }, + { + "epoch": 0.9, + "grad_norm": 8.021788597106934, + "learning_rate": 3.157894736842105e-07, + "loss": 0.2526, + "step": 360 + }, + { + "Batch Mean": -0.011986970901489258, + "accuracy": 0.8203125, + "epoch": 0.9, + "step": 360 + }, + { + "epoch": 0.9025, + "grad_norm": 9.140953063964844, + "learning_rate": 3.078947368421053e-07, + "loss": 0.4096, + "step": 361 + }, + { + "Batch Mean": 0.10518503189086914, + "accuracy": 0.8046875, + "epoch": 0.9025, + "step": 361 + }, + { + "epoch": 0.905, + "grad_norm": 10.265999794006348, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.3787, + "step": 362 + }, + { + "Batch Mean": -0.40680599212646484, + "accuracy": 0.8515625, + "epoch": 0.905, + "step": 362 + }, + { + "epoch": 0.9075, + "grad_norm": 9.021233558654785, + "learning_rate": 2.9210526315789473e-07, + "loss": 0.3171, + "step": 363 + }, + { + "Batch Mean": 0.18383502960205078, + "accuracy": 0.78125, + "epoch": 0.9075, + "step": 363 + }, + { + "epoch": 0.91, + "grad_norm": 9.411810874938965, + "learning_rate": 2.842105263157895e-07, + "loss": 0.377, + "step": 364 + }, + { + "Batch Mean": -0.3058032989501953, + "accuracy": 0.8515625, + "epoch": 0.91, + "step": 364 + }, + { + "epoch": 0.9125, + "grad_norm": 8.536703109741211, + "learning_rate": 2.763157894736842e-07, + "loss": 0.3636, + "step": 365 + }, + { + "Batch Mean": -0.454792857170105, + "accuracy": 0.859375, + "epoch": 0.9125, + "step": 365 + }, + { + "epoch": 0.915, + "grad_norm": 9.486909866333008, + "learning_rate": 2.6842105263157897e-07, + "loss": 0.3759, + "step": 366 + }, + { + "Batch Mean": 0.13639020919799805, + "accuracy": 0.8359375, + "epoch": 0.915, + "step": 366 + }, + { + "epoch": 0.9175, + "grad_norm": 8.468119621276855, + "learning_rate": 2.605263157894737e-07, + "loss": 0.3447, + "step": 367 + }, + { + "Batch Mean": -0.28373217582702637, + "accuracy": 0.84375, + "epoch": 0.9175, + "step": 367 + }, + { + "epoch": 0.92, + "grad_norm": 9.961069107055664, + "learning_rate": 2.526315789473684e-07, + "loss": 0.3972, + "step": 368 + }, + { + "Batch Mean": -0.08122384548187256, + "accuracy": 0.7890625, + "epoch": 0.92, + "step": 368 + }, + { + "epoch": 0.9225, + "grad_norm": 10.039396286010742, + "learning_rate": 2.4473684210526315e-07, + "loss": 0.4382, + "step": 369 + }, + { + "Batch Mean": -0.18444490432739258, + "accuracy": 0.8125, + "epoch": 0.9225, + "step": 369 + }, + { + "epoch": 0.925, + "grad_norm": 8.742066383361816, + "learning_rate": 2.368421052631579e-07, + "loss": 0.3676, + "step": 370 + }, + { + "Batch Mean": -0.3456565737724304, + "accuracy": 0.796875, + "epoch": 0.925, + "step": 370 + }, + { + "epoch": 0.9275, + "grad_norm": 9.731671333312988, + "learning_rate": 2.2894736842105264e-07, + "loss": 0.3645, + "step": 371 + }, + { + "Batch Mean": -0.016453981399536133, + "accuracy": 0.8515625, + "epoch": 0.9275, + "step": 371 + }, + { + "epoch": 0.93, + "grad_norm": 8.926627159118652, + "learning_rate": 2.2105263157894736e-07, + "loss": 0.3358, + "step": 372 + }, + { + "Batch Mean": 0.3115067481994629, + "accuracy": 0.890625, + "epoch": 0.93, + "step": 372 + }, + { + "epoch": 0.9325, + "grad_norm": 7.854696273803711, + "learning_rate": 2.131578947368421e-07, + "loss": 0.2956, + "step": 373 + }, + { + "Batch Mean": -0.0024530887603759766, + "accuracy": 0.875, + "epoch": 0.9325, + "step": 373 + }, + { + "epoch": 0.935, + "grad_norm": 8.164934158325195, + "learning_rate": 2.0526315789473685e-07, + "loss": 0.284, + "step": 374 + }, + { + "Batch Mean": -0.35596323013305664, + "accuracy": 0.8359375, + "epoch": 0.935, + "step": 374 + }, + { + "epoch": 0.9375, + "grad_norm": 10.020709991455078, + "learning_rate": 1.9736842105263157e-07, + "loss": 0.3899, + "step": 375 + }, + { + "Batch Mean": -0.31003671884536743, + "accuracy": 0.8203125, + "epoch": 0.9375, + "step": 375 + }, + { + "epoch": 0.94, + "grad_norm": 10.256973266601562, + "learning_rate": 1.8947368421052632e-07, + "loss": 0.3544, + "step": 376 + }, + { + "Batch Mean": 0.1095513105392456, + "accuracy": 0.796875, + "epoch": 0.94, + "step": 376 + }, + { + "epoch": 0.9425, + "grad_norm": 10.901346206665039, + "learning_rate": 1.8157894736842106e-07, + "loss": 0.4286, + "step": 377 + }, + { + "Batch Mean": -0.2355579137802124, + "accuracy": 0.8671875, + "epoch": 0.9425, + "step": 377 + }, + { + "epoch": 0.945, + "grad_norm": 9.383118629455566, + "learning_rate": 1.7368421052631578e-07, + "loss": 0.2951, + "step": 378 + }, + { + "Batch Mean": -0.15269994735717773, + "accuracy": 0.84375, + "epoch": 0.945, + "step": 378 + }, + { + "epoch": 0.9475, + "grad_norm": 9.133818626403809, + "learning_rate": 1.6578947368421053e-07, + "loss": 0.3858, + "step": 379 + }, + { + "Batch Mean": 0.28601565957069397, + "accuracy": 0.859375, + "epoch": 0.9475, + "step": 379 + }, + { + "epoch": 0.95, + "grad_norm": 9.083587646484375, + "learning_rate": 1.5789473684210525e-07, + "loss": 0.3113, + "step": 380 + }, + { + "Batch Mean": 0.18660545349121094, + "accuracy": 0.8046875, + "epoch": 0.95, + "step": 380 + }, + { + "epoch": 0.9525, + "grad_norm": 9.448458671569824, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.4171, + "step": 381 + }, + { + "Batch Mean": -0.03425157070159912, + "accuracy": 0.8203125, + "epoch": 0.9525, + "step": 381 + }, + { + "epoch": 0.955, + "grad_norm": 8.985979080200195, + "learning_rate": 1.4210526315789474e-07, + "loss": 0.364, + "step": 382 + }, + { + "Batch Mean": 0.2637450695037842, + "accuracy": 0.7890625, + "epoch": 0.955, + "step": 382 + }, + { + "epoch": 0.9575, + "grad_norm": 10.121076583862305, + "learning_rate": 1.3421052631578948e-07, + "loss": 0.4165, + "step": 383 + }, + { + "Batch Mean": 0.31875157356262207, + "accuracy": 0.890625, + "epoch": 0.9575, + "step": 383 + }, + { + "epoch": 0.96, + "grad_norm": 7.5626397132873535, + "learning_rate": 1.263157894736842e-07, + "loss": 0.2767, + "step": 384 + }, + { + "Batch Mean": 0.15826892852783203, + "accuracy": 0.8359375, + "epoch": 0.96, + "step": 384 + }, + { + "epoch": 0.9625, + "grad_norm": 7.888492107391357, + "learning_rate": 1.1842105263157895e-07, + "loss": 0.319, + "step": 385 + }, + { + "Batch Mean": 0.21660977602005005, + "accuracy": 0.8671875, + "epoch": 0.9625, + "step": 385 + }, + { + "epoch": 0.965, + "grad_norm": 7.688926696777344, + "learning_rate": 1.1052631578947368e-07, + "loss": 0.3065, + "step": 386 + }, + { + "Batch Mean": -0.029039382934570312, + "accuracy": 0.84375, + "epoch": 0.965, + "step": 386 + }, + { + "epoch": 0.9675, + "grad_norm": 9.584554672241211, + "learning_rate": 1.0263157894736843e-07, + "loss": 0.3247, + "step": 387 + }, + { + "Batch Mean": 0.2024543285369873, + "accuracy": 0.828125, + "epoch": 0.9675, + "step": 387 + }, + { + "epoch": 0.97, + "grad_norm": 8.92916202545166, + "learning_rate": 9.473684210526316e-08, + "loss": 0.3898, + "step": 388 + }, + { + "Batch Mean": 0.157745361328125, + "accuracy": 0.8828125, + "epoch": 0.97, + "step": 388 + }, + { + "epoch": 0.9725, + "grad_norm": 7.953405857086182, + "learning_rate": 8.684210526315789e-08, + "loss": 0.3219, + "step": 389 + }, + { + "Batch Mean": 0.22934383153915405, + "accuracy": 0.84375, + "epoch": 0.9725, + "step": 389 + }, + { + "epoch": 0.975, + "grad_norm": 7.865681171417236, + "learning_rate": 7.894736842105262e-08, + "loss": 0.3059, + "step": 390 + }, + { + "Batch Mean": 0.1985170841217041, + "accuracy": 0.8671875, + "epoch": 0.975, + "step": 390 + }, + { + "epoch": 0.9775, + "grad_norm": 9.250611305236816, + "learning_rate": 7.105263157894737e-08, + "loss": 0.3561, + "step": 391 + }, + { + "Batch Mean": 0.2049732208251953, + "accuracy": 0.8828125, + "epoch": 0.9775, + "step": 391 + }, + { + "epoch": 0.98, + "grad_norm": 7.836735725402832, + "learning_rate": 6.31578947368421e-08, + "loss": 0.3195, + "step": 392 + }, + { + "Batch Mean": -0.21650028228759766, + "accuracy": 0.84375, + "epoch": 0.98, + "step": 392 + }, + { + "epoch": 0.9825, + "grad_norm": 8.78288745880127, + "learning_rate": 5.526315789473684e-08, + "loss": 0.3307, + "step": 393 + }, + { + "Batch Mean": -0.16384530067443848, + "accuracy": 0.859375, + "epoch": 0.9825, + "step": 393 + }, + { + "epoch": 0.985, + "grad_norm": 9.413653373718262, + "learning_rate": 4.736842105263158e-08, + "loss": 0.3168, + "step": 394 + }, + { + "Batch Mean": 0.39468860626220703, + "accuracy": 0.890625, + "epoch": 0.985, + "step": 394 + }, + { + "epoch": 0.9875, + "grad_norm": 8.482548713684082, + "learning_rate": 3.947368421052631e-08, + "loss": 0.3072, + "step": 395 + }, + { + "Batch Mean": -0.008903980255126953, + "accuracy": 0.8828125, + "epoch": 0.9875, + "step": 395 + }, + { + "epoch": 0.99, + "grad_norm": 7.37254524230957, + "learning_rate": 3.157894736842105e-08, + "loss": 0.2792, + "step": 396 + }, + { + "Batch Mean": -0.081329345703125, + "accuracy": 0.828125, + "epoch": 0.99, + "step": 396 + }, + { + "epoch": 0.9925, + "grad_norm": 11.35734748840332, + "learning_rate": 2.368421052631579e-08, + "loss": 0.3622, + "step": 397 + }, + { + "Batch Mean": 0.2623863220214844, + "accuracy": 0.8515625, + "epoch": 0.9925, + "step": 397 + }, + { + "epoch": 0.995, + "grad_norm": 8.10012149810791, + "learning_rate": 1.5789473684210525e-08, + "loss": 0.306, + "step": 398 + }, + { + "Batch Mean": 0.06012392044067383, + "accuracy": 0.828125, + "epoch": 0.995, + "step": 398 + }, + { + "epoch": 0.9975, + "grad_norm": 9.454927444458008, + "learning_rate": 7.894736842105263e-09, + "loss": 0.3882, + "step": 399 + }, + { + "Batch Mean": -0.25020742416381836, + "accuracy": 0.8125, + "epoch": 0.9975, + "step": 399 + }, + { + "epoch": 1.0, + "grad_norm": 9.042609214782715, + "learning_rate": 0.0, + "loss": 0.3826, + "step": 400 + } + ], + "logging_steps": 1, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}