diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7087 @@ +{ + "best_metric": 0.14280347526073456, + "best_model_checkpoint": "d:\\\\whisper-medium-pt-cv16-fleurs2\\checkpoint-15000", + "epoch": 11.671335200746965, + "eval_steps": 5000, + "global_step": 25000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011671335200746966, + "grad_norm": 18.58954620361328, + "learning_rate": 4.6e-10, + "loss": 0.7382, + "step": 25 + }, + { + "epoch": 0.02334267040149393, + "grad_norm": 37.525917053222656, + "learning_rate": 9.399999999999999e-10, + "loss": 1.2845, + "step": 50 + }, + { + "epoch": 0.0350140056022409, + "grad_norm": 15.194890975952148, + "learning_rate": 1.44e-09, + "loss": 0.7588, + "step": 75 + }, + { + "epoch": 0.04668534080298786, + "grad_norm": 40.771392822265625, + "learning_rate": 1.94e-09, + "loss": 1.2737, + "step": 100 + }, + { + "epoch": 0.05835667600373483, + "grad_norm": 16.082420349121094, + "learning_rate": 2.44e-09, + "loss": 0.7373, + "step": 125 + }, + { + "epoch": 0.0700280112044818, + "grad_norm": 38.88285446166992, + "learning_rate": 2.9399999999999998e-09, + "loss": 1.3064, + "step": 150 + }, + { + "epoch": 0.08169934640522876, + "grad_norm": 17.771854400634766, + "learning_rate": 3.4399999999999997e-09, + "loss": 0.7341, + "step": 175 + }, + { + "epoch": 0.09337068160597572, + "grad_norm": 41.72404861450195, + "learning_rate": 3.94e-09, + "loss": 1.3691, + "step": 200 + }, + { + "epoch": 0.10504201680672269, + "grad_norm": 16.00535774230957, + "learning_rate": 4.44e-09, + "loss": 0.7228, + "step": 225 + }, + { + "epoch": 0.11671335200746966, + "grad_norm": 41.851478576660156, + "learning_rate": 4.94e-09, + "loss": 1.3768, + "step": 250 + }, + { + "epoch": 0.1283846872082166, + "grad_norm": 20.789945602416992, + "learning_rate": 5.44e-09, + "loss": 0.7399, + "step": 275 + }, + { + "epoch": 0.1400560224089636, + "grad_norm": 39.85365295410156, + "learning_rate": 5.94e-09, + "loss": 1.359, + "step": 300 + }, + { + "epoch": 0.15172735760971054, + "grad_norm": 16.680112838745117, + "learning_rate": 6.44e-09, + "loss": 0.7578, + "step": 325 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 41.59889221191406, + "learning_rate": 6.94e-09, + "loss": 1.3793, + "step": 350 + }, + { + "epoch": 0.17507002801120447, + "grad_norm": 16.55697250366211, + "learning_rate": 7.44e-09, + "loss": 0.7345, + "step": 375 + }, + { + "epoch": 0.18674136321195145, + "grad_norm": 37.21436309814453, + "learning_rate": 7.939999999999999e-09, + "loss": 1.3301, + "step": 400 + }, + { + "epoch": 0.1984126984126984, + "grad_norm": 18.573705673217773, + "learning_rate": 8.44e-09, + "loss": 0.7678, + "step": 425 + }, + { + "epoch": 0.21008403361344538, + "grad_norm": 42.38700866699219, + "learning_rate": 8.94e-09, + "loss": 1.3582, + "step": 450 + }, + { + "epoch": 0.22175536881419233, + "grad_norm": 16.054298400878906, + "learning_rate": 9.44e-09, + "loss": 0.7475, + "step": 475 + }, + { + "epoch": 0.2334267040149393, + "grad_norm": 40.42662811279297, + "learning_rate": 9.939999999999998e-09, + "loss": 1.3728, + "step": 500 + }, + { + "epoch": 0.24509803921568626, + "grad_norm": 17.739364624023438, + "learning_rate": 1.044e-08, + "loss": 0.7678, + "step": 525 + }, + { + "epoch": 0.2567693744164332, + "grad_norm": 38.32596206665039, + "learning_rate": 1.0939999999999999e-08, + "loss": 1.3873, + "step": 550 + }, + { + "epoch": 0.2684407096171802, + "grad_norm": 14.915902137756348, + "learning_rate": 1.144e-08, + "loss": 0.7033, + "step": 575 + }, + { + "epoch": 0.2801120448179272, + "grad_norm": 34.46598434448242, + "learning_rate": 1.1939999999999998e-08, + "loss": 1.2503, + "step": 600 + }, + { + "epoch": 0.29178338001867415, + "grad_norm": 14.2736177444458, + "learning_rate": 1.244e-08, + "loss": 0.7253, + "step": 625 + }, + { + "epoch": 0.3034547152194211, + "grad_norm": 40.36186981201172, + "learning_rate": 1.2939999999999999e-08, + "loss": 1.3159, + "step": 650 + }, + { + "epoch": 0.31512605042016806, + "grad_norm": 14.474146842956543, + "learning_rate": 1.344e-08, + "loss": 0.7097, + "step": 675 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 39.71982192993164, + "learning_rate": 1.394e-08, + "loss": 1.3331, + "step": 700 + }, + { + "epoch": 0.338468720821662, + "grad_norm": 13.428034782409668, + "learning_rate": 1.4439999999999999e-08, + "loss": 0.7001, + "step": 725 + }, + { + "epoch": 0.35014005602240894, + "grad_norm": 38.90840148925781, + "learning_rate": 1.494e-08, + "loss": 1.2718, + "step": 750 + }, + { + "epoch": 0.3618113912231559, + "grad_norm": 15.919449806213379, + "learning_rate": 1.544e-08, + "loss": 0.6877, + "step": 775 + }, + { + "epoch": 0.3734827264239029, + "grad_norm": 37.94025802612305, + "learning_rate": 1.594e-08, + "loss": 1.2798, + "step": 800 + }, + { + "epoch": 0.3851540616246499, + "grad_norm": 14.55276107788086, + "learning_rate": 1.644e-08, + "loss": 0.7123, + "step": 825 + }, + { + "epoch": 0.3968253968253968, + "grad_norm": 33.79072952270508, + "learning_rate": 1.6939999999999998e-08, + "loss": 1.2842, + "step": 850 + }, + { + "epoch": 0.4084967320261438, + "grad_norm": 14.676527976989746, + "learning_rate": 1.744e-08, + "loss": 0.7064, + "step": 875 + }, + { + "epoch": 0.42016806722689076, + "grad_norm": 36.242069244384766, + "learning_rate": 1.794e-08, + "loss": 1.2235, + "step": 900 + }, + { + "epoch": 0.43183940242763774, + "grad_norm": 12.617734909057617, + "learning_rate": 1.8440000000000002e-08, + "loss": 0.6694, + "step": 925 + }, + { + "epoch": 0.44351073762838467, + "grad_norm": 40.069305419921875, + "learning_rate": 1.8939999999999996e-08, + "loss": 1.2126, + "step": 950 + }, + { + "epoch": 0.45518207282913165, + "grad_norm": 15.211618423461914, + "learning_rate": 1.9439999999999997e-08, + "loss": 0.6739, + "step": 975 + }, + { + "epoch": 0.4668534080298786, + "grad_norm": 32.790863037109375, + "learning_rate": 1.994e-08, + "loss": 1.1737, + "step": 1000 + }, + { + "epoch": 0.4785247432306256, + "grad_norm": 12.688447952270508, + "learning_rate": 2.044e-08, + "loss": 0.6731, + "step": 1025 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 34.38262176513672, + "learning_rate": 2.094e-08, + "loss": 1.1325, + "step": 1050 + }, + { + "epoch": 0.5018674136321195, + "grad_norm": 12.72283935546875, + "learning_rate": 2.144e-08, + "loss": 0.6504, + "step": 1075 + }, + { + "epoch": 0.5135387488328664, + "grad_norm": 28.521909713745117, + "learning_rate": 2.194e-08, + "loss": 1.1427, + "step": 1100 + }, + { + "epoch": 0.5252100840336135, + "grad_norm": 14.081682205200195, + "learning_rate": 2.244e-08, + "loss": 0.6429, + "step": 1125 + }, + { + "epoch": 0.5368814192343604, + "grad_norm": 33.354591369628906, + "learning_rate": 2.294e-08, + "loss": 1.124, + "step": 1150 + }, + { + "epoch": 0.5485527544351074, + "grad_norm": 13.210142135620117, + "learning_rate": 2.3439999999999997e-08, + "loss": 0.6365, + "step": 1175 + }, + { + "epoch": 0.5602240896358543, + "grad_norm": 40.134281158447266, + "learning_rate": 2.3939999999999998e-08, + "loss": 1.1303, + "step": 1200 + }, + { + "epoch": 0.5718954248366013, + "grad_norm": 12.516732215881348, + "learning_rate": 2.444e-08, + "loss": 0.5871, + "step": 1225 + }, + { + "epoch": 0.5835667600373483, + "grad_norm": 30.771167755126953, + "learning_rate": 2.494e-08, + "loss": 1.0342, + "step": 1250 + }, + { + "epoch": 0.5952380952380952, + "grad_norm": 11.771331787109375, + "learning_rate": 2.5439999999999998e-08, + "loss": 0.5727, + "step": 1275 + }, + { + "epoch": 0.6069094304388422, + "grad_norm": 32.63950729370117, + "learning_rate": 2.594e-08, + "loss": 0.9901, + "step": 1300 + }, + { + "epoch": 0.6185807656395892, + "grad_norm": 15.06674575805664, + "learning_rate": 2.644e-08, + "loss": 0.5162, + "step": 1325 + }, + { + "epoch": 0.6302521008403361, + "grad_norm": 34.53097152709961, + "learning_rate": 2.694e-08, + "loss": 0.9596, + "step": 1350 + }, + { + "epoch": 0.6419234360410832, + "grad_norm": 13.923140525817871, + "learning_rate": 2.7439999999999996e-08, + "loss": 0.5145, + "step": 1375 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 34.99801254272461, + "learning_rate": 2.7939999999999997e-08, + "loss": 0.8, + "step": 1400 + }, + { + "epoch": 0.665266106442577, + "grad_norm": 13.633746147155762, + "learning_rate": 2.844e-08, + "loss": 0.4484, + "step": 1425 + }, + { + "epoch": 0.676937441643324, + "grad_norm": 27.909713745117188, + "learning_rate": 2.894e-08, + "loss": 0.7578, + "step": 1450 + }, + { + "epoch": 0.688608776844071, + "grad_norm": 10.687728881835938, + "learning_rate": 2.944e-08, + "loss": 0.4327, + "step": 1475 + }, + { + "epoch": 0.7002801120448179, + "grad_norm": 25.57269859313965, + "learning_rate": 2.994e-08, + "loss": 0.6606, + "step": 1500 + }, + { + "epoch": 0.7119514472455649, + "grad_norm": 11.038127899169922, + "learning_rate": 3.044e-08, + "loss": 0.3744, + "step": 1525 + }, + { + "epoch": 0.7236227824463118, + "grad_norm": 32.23295974731445, + "learning_rate": 3.094e-08, + "loss": 0.5246, + "step": 1550 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 10.442867279052734, + "learning_rate": 3.144e-08, + "loss": 0.3276, + "step": 1575 + }, + { + "epoch": 0.7469654528478058, + "grad_norm": 30.245128631591797, + "learning_rate": 3.194e-08, + "loss": 0.5341, + "step": 1600 + }, + { + "epoch": 0.7586367880485527, + "grad_norm": 7.633006572723389, + "learning_rate": 3.244e-08, + "loss": 0.3076, + "step": 1625 + }, + { + "epoch": 0.7703081232492998, + "grad_norm": 25.9896297454834, + "learning_rate": 3.2939999999999996e-08, + "loss": 0.5054, + "step": 1650 + }, + { + "epoch": 0.7819794584500467, + "grad_norm": 11.558256149291992, + "learning_rate": 3.3439999999999994e-08, + "loss": 0.2894, + "step": 1675 + }, + { + "epoch": 0.7936507936507936, + "grad_norm": 25.086002349853516, + "learning_rate": 3.394e-08, + "loss": 0.4637, + "step": 1700 + }, + { + "epoch": 0.8053221288515406, + "grad_norm": 9.39806079864502, + "learning_rate": 3.4439999999999996e-08, + "loss": 0.2774, + "step": 1725 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 25.4090518951416, + "learning_rate": 3.494e-08, + "loss": 0.4205, + "step": 1750 + }, + { + "epoch": 0.8286647992530346, + "grad_norm": 7.335741996765137, + "learning_rate": 3.544e-08, + "loss": 0.2438, + "step": 1775 + }, + { + "epoch": 0.8403361344537815, + "grad_norm": 24.587902069091797, + "learning_rate": 3.5939999999999996e-08, + "loss": 0.4571, + "step": 1800 + }, + { + "epoch": 0.8520074696545284, + "grad_norm": 7.93494176864624, + "learning_rate": 3.644e-08, + "loss": 0.2692, + "step": 1825 + }, + { + "epoch": 0.8636788048552755, + "grad_norm": 25.627216339111328, + "learning_rate": 3.694e-08, + "loss": 0.3872, + "step": 1850 + }, + { + "epoch": 0.8753501400560224, + "grad_norm": 6.054137229919434, + "learning_rate": 3.7439999999999996e-08, + "loss": 0.2613, + "step": 1875 + }, + { + "epoch": 0.8870214752567693, + "grad_norm": 22.876371383666992, + "learning_rate": 3.794e-08, + "loss": 0.3745, + "step": 1900 + }, + { + "epoch": 0.8986928104575164, + "grad_norm": 9.932693481445312, + "learning_rate": 3.844e-08, + "loss": 0.2459, + "step": 1925 + }, + { + "epoch": 0.9103641456582633, + "grad_norm": 24.33729362487793, + "learning_rate": 3.894e-08, + "loss": 0.3707, + "step": 1950 + }, + { + "epoch": 0.9220354808590103, + "grad_norm": 5.043721675872803, + "learning_rate": 3.944e-08, + "loss": 0.2594, + "step": 1975 + }, + { + "epoch": 0.9337068160597572, + "grad_norm": 23.499347686767578, + "learning_rate": 3.994e-08, + "loss": 0.3559, + "step": 2000 + }, + { + "epoch": 0.9453781512605042, + "grad_norm": 9.935140609741211, + "learning_rate": 4.044e-08, + "loss": 0.2537, + "step": 2025 + }, + { + "epoch": 0.9570494864612512, + "grad_norm": 21.89238929748535, + "learning_rate": 4.0939999999999995e-08, + "loss": 0.3577, + "step": 2050 + }, + { + "epoch": 0.9687208216619981, + "grad_norm": 7.369849681854248, + "learning_rate": 4.143999999999999e-08, + "loss": 0.2506, + "step": 2075 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 21.387100219726562, + "learning_rate": 4.194e-08, + "loss": 0.3548, + "step": 2100 + }, + { + "epoch": 0.9920634920634921, + "grad_norm": 9.189516067504883, + "learning_rate": 4.2439999999999995e-08, + "loss": 0.2296, + "step": 2125 + }, + { + "epoch": 1.003734827264239, + "grad_norm": 6.029189109802246, + "learning_rate": 4.294e-08, + "loss": 0.3258, + "step": 2150 + }, + { + "epoch": 1.015406162464986, + "grad_norm": 9.663504600524902, + "learning_rate": 4.344e-08, + "loss": 0.1926, + "step": 2175 + }, + { + "epoch": 1.0270774976657329, + "grad_norm": 5.585232734680176, + "learning_rate": 4.3939999999999995e-08, + "loss": 0.3378, + "step": 2200 + }, + { + "epoch": 1.03874883286648, + "grad_norm": 8.463289260864258, + "learning_rate": 4.444e-08, + "loss": 0.2082, + "step": 2225 + }, + { + "epoch": 1.050420168067227, + "grad_norm": 5.860575199127197, + "learning_rate": 4.494e-08, + "loss": 0.3448, + "step": 2250 + }, + { + "epoch": 1.0620915032679739, + "grad_norm": 6.821081161499023, + "learning_rate": 4.544e-08, + "loss": 0.2441, + "step": 2275 + }, + { + "epoch": 1.0737628384687208, + "grad_norm": 6.957500457763672, + "learning_rate": 4.594e-08, + "loss": 0.3243, + "step": 2300 + }, + { + "epoch": 1.0854341736694677, + "grad_norm": 9.871063232421875, + "learning_rate": 4.644e-08, + "loss": 0.2316, + "step": 2325 + }, + { + "epoch": 1.0971055088702149, + "grad_norm": 5.59705114364624, + "learning_rate": 4.694e-08, + "loss": 0.3128, + "step": 2350 + }, + { + "epoch": 1.1087768440709618, + "grad_norm": 11.109825134277344, + "learning_rate": 4.744e-08, + "loss": 0.2319, + "step": 2375 + }, + { + "epoch": 1.1204481792717087, + "grad_norm": 6.252768039703369, + "learning_rate": 4.7940000000000004e-08, + "loss": 0.2975, + "step": 2400 + }, + { + "epoch": 1.1321195144724556, + "grad_norm": 9.143139839172363, + "learning_rate": 4.8439999999999996e-08, + "loss": 0.2043, + "step": 2425 + }, + { + "epoch": 1.1437908496732025, + "grad_norm": 5.3806657791137695, + "learning_rate": 4.8939999999999994e-08, + "loss": 0.3358, + "step": 2450 + }, + { + "epoch": 1.1554621848739495, + "grad_norm": 5.736015796661377, + "learning_rate": 4.944e-08, + "loss": 0.1975, + "step": 2475 + }, + { + "epoch": 1.1671335200746966, + "grad_norm": 8.606856346130371, + "learning_rate": 4.9939999999999996e-08, + "loss": 0.3118, + "step": 2500 + }, + { + "epoch": 1.1788048552754435, + "grad_norm": 8.582596778869629, + "learning_rate": 5.0439999999999994e-08, + "loss": 0.2086, + "step": 2525 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 5.178341388702393, + "learning_rate": 5.094e-08, + "loss": 0.2946, + "step": 2550 + }, + { + "epoch": 1.2021475256769374, + "grad_norm": 11.727195739746094, + "learning_rate": 5.1439999999999996e-08, + "loss": 0.2178, + "step": 2575 + }, + { + "epoch": 1.2138188608776843, + "grad_norm": 5.104198932647705, + "learning_rate": 5.194e-08, + "loss": 0.3234, + "step": 2600 + }, + { + "epoch": 1.2254901960784315, + "grad_norm": 9.104410171508789, + "learning_rate": 5.244e-08, + "loss": 0.2093, + "step": 2625 + }, + { + "epoch": 1.2371615312791784, + "grad_norm": 6.0715765953063965, + "learning_rate": 5.2939999999999996e-08, + "loss": 0.3366, + "step": 2650 + }, + { + "epoch": 1.2488328664799253, + "grad_norm": 8.743270874023438, + "learning_rate": 5.344e-08, + "loss": 0.1902, + "step": 2675 + }, + { + "epoch": 1.2605042016806722, + "grad_norm": 5.818302154541016, + "learning_rate": 5.394e-08, + "loss": 0.3033, + "step": 2700 + }, + { + "epoch": 1.2721755368814192, + "grad_norm": 5.093564510345459, + "learning_rate": 5.444e-08, + "loss": 0.2089, + "step": 2725 + }, + { + "epoch": 1.283846872082166, + "grad_norm": 6.838255405426025, + "learning_rate": 5.494e-08, + "loss": 0.2888, + "step": 2750 + }, + { + "epoch": 1.2955182072829132, + "grad_norm": 10.466809272766113, + "learning_rate": 5.544e-08, + "loss": 0.2166, + "step": 2775 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 5.292140007019043, + "learning_rate": 5.5939999999999997e-08, + "loss": 0.292, + "step": 2800 + }, + { + "epoch": 1.318860877684407, + "grad_norm": 8.67912483215332, + "learning_rate": 5.6439999999999995e-08, + "loss": 0.2073, + "step": 2825 + }, + { + "epoch": 1.330532212885154, + "grad_norm": 4.972991943359375, + "learning_rate": 5.693999999999999e-08, + "loss": 0.2952, + "step": 2850 + }, + { + "epoch": 1.3422035480859011, + "grad_norm": 8.939681053161621, + "learning_rate": 5.744e-08, + "loss": 0.1821, + "step": 2875 + }, + { + "epoch": 1.353874883286648, + "grad_norm": 7.211392402648926, + "learning_rate": 5.7939999999999995e-08, + "loss": 0.2914, + "step": 2900 + }, + { + "epoch": 1.365546218487395, + "grad_norm": 8.267333984375, + "learning_rate": 5.844e-08, + "loss": 0.1839, + "step": 2925 + }, + { + "epoch": 1.377217553688142, + "grad_norm": 6.720695972442627, + "learning_rate": 5.894e-08, + "loss": 0.2678, + "step": 2950 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 8.372034072875977, + "learning_rate": 5.9439999999999995e-08, + "loss": 0.1999, + "step": 2975 + }, + { + "epoch": 1.4005602240896358, + "grad_norm": 6.330301284790039, + "learning_rate": 5.993999999999999e-08, + "loss": 0.3173, + "step": 3000 + }, + { + "epoch": 1.4122315592903827, + "grad_norm": 10.318882942199707, + "learning_rate": 6.044e-08, + "loss": 0.1953, + "step": 3025 + }, + { + "epoch": 1.4239028944911298, + "grad_norm": 7.442046165466309, + "learning_rate": 6.094e-08, + "loss": 0.3231, + "step": 3050 + }, + { + "epoch": 1.4355742296918768, + "grad_norm": 9.14301872253418, + "learning_rate": 6.144e-08, + "loss": 0.2168, + "step": 3075 + }, + { + "epoch": 1.4472455648926237, + "grad_norm": 6.955599784851074, + "learning_rate": 6.194e-08, + "loss": 0.265, + "step": 3100 + }, + { + "epoch": 1.4589169000933706, + "grad_norm": 7.568444728851318, + "learning_rate": 6.244e-08, + "loss": 0.182, + "step": 3125 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 4.784877300262451, + "learning_rate": 6.293999999999999e-08, + "loss": 0.2616, + "step": 3150 + }, + { + "epoch": 1.4822595704948647, + "grad_norm": 5.884426116943359, + "learning_rate": 6.343999999999999e-08, + "loss": 0.1782, + "step": 3175 + }, + { + "epoch": 1.4939309056956116, + "grad_norm": 8.85175609588623, + "learning_rate": 6.393999999999999e-08, + "loss": 0.2837, + "step": 3200 + }, + { + "epoch": 1.5056022408963585, + "grad_norm": 7.9142537117004395, + "learning_rate": 6.444e-08, + "loss": 0.1751, + "step": 3225 + }, + { + "epoch": 1.5172735760971054, + "grad_norm": 6.807056903839111, + "learning_rate": 6.494e-08, + "loss": 0.3072, + "step": 3250 + }, + { + "epoch": 1.5289449112978524, + "grad_norm": 8.901240348815918, + "learning_rate": 6.544e-08, + "loss": 0.1975, + "step": 3275 + }, + { + "epoch": 1.5406162464985993, + "grad_norm": 5.000201225280762, + "learning_rate": 6.594e-08, + "loss": 0.2892, + "step": 3300 + }, + { + "epoch": 1.5522875816993464, + "grad_norm": 11.009442329406738, + "learning_rate": 6.643999999999999e-08, + "loss": 0.205, + "step": 3325 + }, + { + "epoch": 1.5639589169000934, + "grad_norm": 4.820681095123291, + "learning_rate": 6.694e-08, + "loss": 0.2868, + "step": 3350 + }, + { + "epoch": 1.5756302521008403, + "grad_norm": 11.95584487915039, + "learning_rate": 6.744e-08, + "loss": 0.1905, + "step": 3375 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 5.530846118927002, + "learning_rate": 6.794e-08, + "loss": 0.2608, + "step": 3400 + }, + { + "epoch": 1.5989729225023344, + "grad_norm": 8.828543663024902, + "learning_rate": 6.844e-08, + "loss": 0.1916, + "step": 3425 + }, + { + "epoch": 1.6106442577030813, + "grad_norm": 5.600862503051758, + "learning_rate": 6.894e-08, + "loss": 0.2636, + "step": 3450 + }, + { + "epoch": 1.6223155929038282, + "grad_norm": 9.772380828857422, + "learning_rate": 6.944e-08, + "loss": 0.1835, + "step": 3475 + }, + { + "epoch": 1.6339869281045751, + "grad_norm": 4.258734703063965, + "learning_rate": 6.994e-08, + "loss": 0.2693, + "step": 3500 + }, + { + "epoch": 1.645658263305322, + "grad_norm": 6.106602668762207, + "learning_rate": 7.044e-08, + "loss": 0.1971, + "step": 3525 + }, + { + "epoch": 1.657329598506069, + "grad_norm": 3.7969162464141846, + "learning_rate": 7.094e-08, + "loss": 0.2919, + "step": 3550 + }, + { + "epoch": 1.669000933706816, + "grad_norm": 7.152183532714844, + "learning_rate": 7.144e-08, + "loss": 0.2027, + "step": 3575 + }, + { + "epoch": 1.680672268907563, + "grad_norm": 6.071133613586426, + "learning_rate": 7.194e-08, + "loss": 0.2699, + "step": 3600 + }, + { + "epoch": 1.69234360410831, + "grad_norm": 6.300527095794678, + "learning_rate": 7.244e-08, + "loss": 0.1766, + "step": 3625 + }, + { + "epoch": 1.7040149393090571, + "grad_norm": 5.592601776123047, + "learning_rate": 7.294e-08, + "loss": 0.2645, + "step": 3650 + }, + { + "epoch": 1.715686274509804, + "grad_norm": 14.278104782104492, + "learning_rate": 7.344e-08, + "loss": 0.1926, + "step": 3675 + }, + { + "epoch": 1.727357609710551, + "grad_norm": 6.237105369567871, + "learning_rate": 7.394e-08, + "loss": 0.2814, + "step": 3700 + }, + { + "epoch": 1.739028944911298, + "grad_norm": 10.357053756713867, + "learning_rate": 7.444e-08, + "loss": 0.2081, + "step": 3725 + }, + { + "epoch": 1.7507002801120448, + "grad_norm": 7.063169002532959, + "learning_rate": 7.494000000000001e-08, + "loss": 0.2718, + "step": 3750 + }, + { + "epoch": 1.7623716153127917, + "grad_norm": 7.2696638107299805, + "learning_rate": 7.543999999999999e-08, + "loss": 0.1849, + "step": 3775 + }, + { + "epoch": 1.7740429505135387, + "grad_norm": 3.825491428375244, + "learning_rate": 7.593999999999999e-08, + "loss": 0.271, + "step": 3800 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 7.341236114501953, + "learning_rate": 7.643999999999999e-08, + "loss": 0.166, + "step": 3825 + }, + { + "epoch": 1.7973856209150327, + "grad_norm": 5.081601619720459, + "learning_rate": 7.693999999999999e-08, + "loss": 0.2857, + "step": 3850 + }, + { + "epoch": 1.8090569561157797, + "grad_norm": 7.839240550994873, + "learning_rate": 7.744e-08, + "loss": 0.1839, + "step": 3875 + }, + { + "epoch": 1.8207282913165266, + "grad_norm": 4.5172014236450195, + "learning_rate": 7.794e-08, + "loss": 0.2515, + "step": 3900 + }, + { + "epoch": 1.8323996265172737, + "grad_norm": 9.478545188903809, + "learning_rate": 7.843999999999999e-08, + "loss": 0.1678, + "step": 3925 + }, + { + "epoch": 1.8440709617180207, + "grad_norm": 5.93352746963501, + "learning_rate": 7.893999999999999e-08, + "loss": 0.2674, + "step": 3950 + }, + { + "epoch": 1.8557422969187676, + "grad_norm": 9.502734184265137, + "learning_rate": 7.943999999999999e-08, + "loss": 0.1609, + "step": 3975 + }, + { + "epoch": 1.8674136321195145, + "grad_norm": 7.912998676300049, + "learning_rate": 7.994e-08, + "loss": 0.2345, + "step": 4000 + }, + { + "epoch": 1.8790849673202614, + "grad_norm": 5.549155235290527, + "learning_rate": 8.044e-08, + "loss": 0.1918, + "step": 4025 + }, + { + "epoch": 1.8907563025210083, + "grad_norm": 7.1379499435424805, + "learning_rate": 8.094e-08, + "loss": 0.2655, + "step": 4050 + }, + { + "epoch": 1.9024276377217553, + "grad_norm": 5.990372657775879, + "learning_rate": 8.144e-08, + "loss": 0.1538, + "step": 4075 + }, + { + "epoch": 1.9140989729225022, + "grad_norm": 5.755247592926025, + "learning_rate": 8.192000000000001e-08, + "loss": 0.2351, + "step": 4100 + }, + { + "epoch": 1.9257703081232493, + "grad_norm": 11.432059288024902, + "learning_rate": 8.241999999999999e-08, + "loss": 0.1734, + "step": 4125 + }, + { + "epoch": 1.9374416433239963, + "grad_norm": 4.935561656951904, + "learning_rate": 8.291999999999999e-08, + "loss": 0.2586, + "step": 4150 + }, + { + "epoch": 1.9491129785247432, + "grad_norm": 7.362981796264648, + "learning_rate": 8.341999999999999e-08, + "loss": 0.1681, + "step": 4175 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 5.120658874511719, + "learning_rate": 8.391999999999999e-08, + "loss": 0.2669, + "step": 4200 + }, + { + "epoch": 1.9724556489262373, + "grad_norm": 9.280594825744629, + "learning_rate": 8.442e-08, + "loss": 0.1717, + "step": 4225 + }, + { + "epoch": 1.9841269841269842, + "grad_norm": 7.310540199279785, + "learning_rate": 8.492e-08, + "loss": 0.2391, + "step": 4250 + }, + { + "epoch": 1.995798319327731, + "grad_norm": 7.3643927574157715, + "learning_rate": 8.541999999999999e-08, + "loss": 0.1764, + "step": 4275 + }, + { + "epoch": 2.007469654528478, + "grad_norm": 4.083337783813477, + "learning_rate": 8.59e-08, + "loss": 0.2192, + "step": 4300 + }, + { + "epoch": 2.019140989729225, + "grad_norm": 10.079933166503906, + "learning_rate": 8.64e-08, + "loss": 0.1708, + "step": 4325 + }, + { + "epoch": 2.030812324929972, + "grad_norm": 5.14344596862793, + "learning_rate": 8.69e-08, + "loss": 0.2486, + "step": 4350 + }, + { + "epoch": 2.042483660130719, + "grad_norm": 6.526447296142578, + "learning_rate": 8.74e-08, + "loss": 0.1678, + "step": 4375 + }, + { + "epoch": 2.0541549953314657, + "grad_norm": 5.671429634094238, + "learning_rate": 8.79e-08, + "loss": 0.2349, + "step": 4400 + }, + { + "epoch": 2.065826330532213, + "grad_norm": 9.383622169494629, + "learning_rate": 8.84e-08, + "loss": 0.1365, + "step": 4425 + }, + { + "epoch": 2.07749766573296, + "grad_norm": 6.865725040435791, + "learning_rate": 8.890000000000001e-08, + "loss": 0.2207, + "step": 4450 + }, + { + "epoch": 2.089169000933707, + "grad_norm": 10.070252418518066, + "learning_rate": 8.939999999999999e-08, + "loss": 0.1797, + "step": 4475 + }, + { + "epoch": 2.100840336134454, + "grad_norm": 4.737791061401367, + "learning_rate": 8.989999999999999e-08, + "loss": 0.2361, + "step": 4500 + }, + { + "epoch": 2.112511671335201, + "grad_norm": 9.253829002380371, + "learning_rate": 9.039999999999999e-08, + "loss": 0.184, + "step": 4525 + }, + { + "epoch": 2.1241830065359477, + "grad_norm": 5.701707363128662, + "learning_rate": 9.089999999999999e-08, + "loss": 0.233, + "step": 4550 + }, + { + "epoch": 2.1358543417366946, + "grad_norm": 7.527386665344238, + "learning_rate": 9.139999999999998e-08, + "loss": 0.1771, + "step": 4575 + }, + { + "epoch": 2.1475256769374416, + "grad_norm": 7.340992450714111, + "learning_rate": 9.19e-08, + "loss": 0.2505, + "step": 4600 + }, + { + "epoch": 2.1591970121381885, + "grad_norm": 11.267548561096191, + "learning_rate": 9.24e-08, + "loss": 0.1706, + "step": 4625 + }, + { + "epoch": 2.1708683473389354, + "grad_norm": 5.289811134338379, + "learning_rate": 9.289999999999999e-08, + "loss": 0.2326, + "step": 4650 + }, + { + "epoch": 2.1825396825396823, + "grad_norm": 6.074433326721191, + "learning_rate": 9.339999999999999e-08, + "loss": 0.1794, + "step": 4675 + }, + { + "epoch": 2.1942110177404297, + "grad_norm": 6.203845024108887, + "learning_rate": 9.389999999999999e-08, + "loss": 0.2259, + "step": 4700 + }, + { + "epoch": 2.2058823529411766, + "grad_norm": 9.799361228942871, + "learning_rate": 9.44e-08, + "loss": 0.1796, + "step": 4725 + }, + { + "epoch": 2.2175536881419236, + "grad_norm": 7.236292839050293, + "learning_rate": 9.49e-08, + "loss": 0.2338, + "step": 4750 + }, + { + "epoch": 2.2292250233426705, + "grad_norm": 10.37661075592041, + "learning_rate": 9.54e-08, + "loss": 0.1969, + "step": 4775 + }, + { + "epoch": 2.2408963585434174, + "grad_norm": 6.369841575622559, + "learning_rate": 9.589999999999999e-08, + "loss": 0.2103, + "step": 4800 + }, + { + "epoch": 2.2525676937441643, + "grad_norm": 9.137279510498047, + "learning_rate": 9.639999999999999e-08, + "loss": 0.1836, + "step": 4825 + }, + { + "epoch": 2.2642390289449112, + "grad_norm": 6.758956432342529, + "learning_rate": 9.69e-08, + "loss": 0.2462, + "step": 4850 + }, + { + "epoch": 2.275910364145658, + "grad_norm": 6.473018169403076, + "learning_rate": 9.74e-08, + "loss": 0.1802, + "step": 4875 + }, + { + "epoch": 2.287581699346405, + "grad_norm": 4.492936134338379, + "learning_rate": 9.79e-08, + "loss": 0.2323, + "step": 4900 + }, + { + "epoch": 2.299253034547152, + "grad_norm": 9.348398208618164, + "learning_rate": 9.84e-08, + "loss": 0.1794, + "step": 4925 + }, + { + "epoch": 2.310924369747899, + "grad_norm": 5.3305230140686035, + "learning_rate": 9.889999999999999e-08, + "loss": 0.2443, + "step": 4950 + }, + { + "epoch": 2.3225957049486463, + "grad_norm": 10.86744499206543, + "learning_rate": 9.94e-08, + "loss": 0.1844, + "step": 4975 + }, + { + "epoch": 2.3342670401493932, + "grad_norm": 6.479306697845459, + "learning_rate": 9.99e-08, + "loss": 0.2244, + "step": 5000 + }, + { + "epoch": 2.3342670401493932, + "eval_loss": 0.17277346551418304, + "eval_runtime": 6738.9666, + "eval_samples_per_second": 1.397, + "eval_steps_per_second": 0.175, + "eval_wer": 0.11098013886646213, + "step": 5000 + }, + { + "epoch": 2.34593837535014, + "grad_norm": 5.447085857391357, + "learning_rate": 1.004e-07, + "loss": 0.1718, + "step": 5025 + }, + { + "epoch": 2.357609710550887, + "grad_norm": 8.150873184204102, + "learning_rate": 1.009e-07, + "loss": 0.2243, + "step": 5050 + }, + { + "epoch": 2.369281045751634, + "grad_norm": 8.1106538772583, + "learning_rate": 1.014e-07, + "loss": 0.146, + "step": 5075 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 4.127166748046875, + "learning_rate": 1.019e-07, + "loss": 0.2267, + "step": 5100 + }, + { + "epoch": 2.392623716153128, + "grad_norm": 11.673868179321289, + "learning_rate": 1.024e-07, + "loss": 0.1827, + "step": 5125 + }, + { + "epoch": 2.404295051353875, + "grad_norm": 5.34147834777832, + "learning_rate": 1.029e-07, + "loss": 0.2271, + "step": 5150 + }, + { + "epoch": 2.4159663865546217, + "grad_norm": 8.061164855957031, + "learning_rate": 1.034e-07, + "loss": 0.1765, + "step": 5175 + }, + { + "epoch": 2.4276377217553686, + "grad_norm": 6.568578243255615, + "learning_rate": 1.039e-07, + "loss": 0.2249, + "step": 5200 + }, + { + "epoch": 2.439309056956116, + "grad_norm": 8.7069730758667, + "learning_rate": 1.0440000000000001e-07, + "loss": 0.1717, + "step": 5225 + }, + { + "epoch": 2.450980392156863, + "grad_norm": 5.4418792724609375, + "learning_rate": 1.0489999999999999e-07, + "loss": 0.2247, + "step": 5250 + }, + { + "epoch": 2.46265172735761, + "grad_norm": 9.333065032958984, + "learning_rate": 1.0539999999999999e-07, + "loss": 0.1851, + "step": 5275 + }, + { + "epoch": 2.4743230625583568, + "grad_norm": 6.602376461029053, + "learning_rate": 1.0589999999999999e-07, + "loss": 0.2658, + "step": 5300 + }, + { + "epoch": 2.4859943977591037, + "grad_norm": 11.450864791870117, + "learning_rate": 1.0639999999999999e-07, + "loss": 0.1743, + "step": 5325 + }, + { + "epoch": 2.4976657329598506, + "grad_norm": 5.90830135345459, + "learning_rate": 1.0689999999999998e-07, + "loss": 0.2272, + "step": 5350 + }, + { + "epoch": 2.5093370681605975, + "grad_norm": 6.921583652496338, + "learning_rate": 1.074e-07, + "loss": 0.1585, + "step": 5375 + }, + { + "epoch": 2.5210084033613445, + "grad_norm": 5.965441703796387, + "learning_rate": 1.079e-07, + "loss": 0.2117, + "step": 5400 + }, + { + "epoch": 2.5326797385620914, + "grad_norm": 8.437889099121094, + "learning_rate": 1.0839999999999999e-07, + "loss": 0.1903, + "step": 5425 + }, + { + "epoch": 2.5443510737628383, + "grad_norm": 5.796535491943359, + "learning_rate": 1.0889999999999999e-07, + "loss": 0.2151, + "step": 5450 + }, + { + "epoch": 2.5560224089635852, + "grad_norm": 9.895671844482422, + "learning_rate": 1.0939999999999999e-07, + "loss": 0.1778, + "step": 5475 + }, + { + "epoch": 2.567693744164332, + "grad_norm": 10.211431503295898, + "learning_rate": 1.099e-07, + "loss": 0.2166, + "step": 5500 + }, + { + "epoch": 2.5793650793650795, + "grad_norm": 8.422016143798828, + "learning_rate": 1.104e-07, + "loss": 0.1599, + "step": 5525 + }, + { + "epoch": 2.5910364145658265, + "grad_norm": 6.82072639465332, + "learning_rate": 1.109e-07, + "loss": 0.2406, + "step": 5550 + }, + { + "epoch": 2.6027077497665734, + "grad_norm": 7.977824687957764, + "learning_rate": 1.1139999999999999e-07, + "loss": 0.174, + "step": 5575 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 4.871920108795166, + "learning_rate": 1.1189999999999999e-07, + "loss": 0.2077, + "step": 5600 + }, + { + "epoch": 2.6260504201680672, + "grad_norm": 14.31760025024414, + "learning_rate": 1.124e-07, + "loss": 0.1581, + "step": 5625 + }, + { + "epoch": 2.637721755368814, + "grad_norm": 4.476131916046143, + "learning_rate": 1.129e-07, + "loss": 0.2158, + "step": 5650 + }, + { + "epoch": 2.649393090569561, + "grad_norm": 6.954850673675537, + "learning_rate": 1.134e-07, + "loss": 0.1689, + "step": 5675 + }, + { + "epoch": 2.661064425770308, + "grad_norm": 5.502589702606201, + "learning_rate": 1.139e-07, + "loss": 0.2082, + "step": 5700 + }, + { + "epoch": 2.6727357609710554, + "grad_norm": 13.118797302246094, + "learning_rate": 1.1439999999999999e-07, + "loss": 0.1646, + "step": 5725 + }, + { + "epoch": 2.6844070961718023, + "grad_norm": 3.66182541847229, + "learning_rate": 1.149e-07, + "loss": 0.2201, + "step": 5750 + }, + { + "epoch": 2.696078431372549, + "grad_norm": 9.46583366394043, + "learning_rate": 1.154e-07, + "loss": 0.158, + "step": 5775 + }, + { + "epoch": 2.707749766573296, + "grad_norm": 6.853757381439209, + "learning_rate": 1.159e-07, + "loss": 0.2417, + "step": 5800 + }, + { + "epoch": 2.719421101774043, + "grad_norm": 8.791181564331055, + "learning_rate": 1.164e-07, + "loss": 0.163, + "step": 5825 + }, + { + "epoch": 2.73109243697479, + "grad_norm": 6.461370944976807, + "learning_rate": 1.169e-07, + "loss": 0.2103, + "step": 5850 + }, + { + "epoch": 2.742763772175537, + "grad_norm": 9.98912525177002, + "learning_rate": 1.1739999999999999e-07, + "loss": 0.1519, + "step": 5875 + }, + { + "epoch": 2.754435107376284, + "grad_norm": 4.975451946258545, + "learning_rate": 1.179e-07, + "loss": 0.2517, + "step": 5900 + }, + { + "epoch": 2.7661064425770308, + "grad_norm": 8.629615783691406, + "learning_rate": 1.184e-07, + "loss": 0.1452, + "step": 5925 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 6.218091011047363, + "learning_rate": 1.189e-07, + "loss": 0.2481, + "step": 5950 + }, + { + "epoch": 2.7894491129785246, + "grad_norm": 7.882603168487549, + "learning_rate": 1.194e-07, + "loss": 0.175, + "step": 5975 + }, + { + "epoch": 2.8011204481792715, + "grad_norm": 5.259121417999268, + "learning_rate": 1.199e-07, + "loss": 0.1935, + "step": 6000 + }, + { + "epoch": 2.8127917833800185, + "grad_norm": 9.59416389465332, + "learning_rate": 1.204e-07, + "loss": 0.1517, + "step": 6025 + }, + { + "epoch": 2.8244631185807654, + "grad_norm": 5.815752983093262, + "learning_rate": 1.2089999999999998e-07, + "loss": 0.2308, + "step": 6050 + }, + { + "epoch": 2.8361344537815127, + "grad_norm": 16.040206909179688, + "learning_rate": 1.214e-07, + "loss": 0.1681, + "step": 6075 + }, + { + "epoch": 2.8478057889822597, + "grad_norm": 6.292205333709717, + "learning_rate": 1.219e-07, + "loss": 0.2188, + "step": 6100 + }, + { + "epoch": 2.8594771241830066, + "grad_norm": 8.221199035644531, + "learning_rate": 1.2239999999999998e-07, + "loss": 0.1637, + "step": 6125 + }, + { + "epoch": 2.8711484593837535, + "grad_norm": 5.419161319732666, + "learning_rate": 1.229e-07, + "loss": 0.2308, + "step": 6150 + }, + { + "epoch": 2.8828197945845004, + "grad_norm": 7.9300665855407715, + "learning_rate": 1.2339999999999998e-07, + "loss": 0.1609, + "step": 6175 + }, + { + "epoch": 2.8944911297852474, + "grad_norm": 5.752758026123047, + "learning_rate": 1.239e-07, + "loss": 0.2159, + "step": 6200 + }, + { + "epoch": 2.9061624649859943, + "grad_norm": 8.784625053405762, + "learning_rate": 1.244e-07, + "loss": 0.1663, + "step": 6225 + }, + { + "epoch": 2.917833800186741, + "grad_norm": 6.791645526885986, + "learning_rate": 1.249e-07, + "loss": 0.2163, + "step": 6250 + }, + { + "epoch": 2.9295051353874886, + "grad_norm": 6.143098831176758, + "learning_rate": 1.254e-07, + "loss": 0.1623, + "step": 6275 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 4.767801284790039, + "learning_rate": 1.259e-07, + "loss": 0.2019, + "step": 6300 + }, + { + "epoch": 2.9528478057889824, + "grad_norm": 9.43720531463623, + "learning_rate": 1.264e-07, + "loss": 0.1709, + "step": 6325 + }, + { + "epoch": 2.9645191409897294, + "grad_norm": 5.25966215133667, + "learning_rate": 1.269e-07, + "loss": 0.2319, + "step": 6350 + }, + { + "epoch": 2.9761904761904763, + "grad_norm": 13.078607559204102, + "learning_rate": 1.2740000000000002e-07, + "loss": 0.1754, + "step": 6375 + }, + { + "epoch": 2.987861811391223, + "grad_norm": 5.5642991065979, + "learning_rate": 1.279e-07, + "loss": 0.2153, + "step": 6400 + }, + { + "epoch": 2.99953314659197, + "grad_norm": 11.523698806762695, + "learning_rate": 1.2839999999999999e-07, + "loss": 0.2086, + "step": 6425 + }, + { + "epoch": 3.011204481792717, + "grad_norm": 5.675624370574951, + "learning_rate": 1.2888e-07, + "loss": 0.1593, + "step": 6450 + }, + { + "epoch": 3.022875816993464, + "grad_norm": 17.480037689208984, + "learning_rate": 1.2937999999999998e-07, + "loss": 0.1673, + "step": 6475 + }, + { + "epoch": 3.034547152194211, + "grad_norm": 9.549832344055176, + "learning_rate": 1.2988e-07, + "loss": 0.1938, + "step": 6500 + }, + { + "epoch": 3.046218487394958, + "grad_norm": 12.89521598815918, + "learning_rate": 1.3037999999999998e-07, + "loss": 0.186, + "step": 6525 + }, + { + "epoch": 3.0578898225957047, + "grad_norm": 7.42260217666626, + "learning_rate": 1.3088e-07, + "loss": 0.2042, + "step": 6550 + }, + { + "epoch": 3.069561157796452, + "grad_norm": 13.60092544555664, + "learning_rate": 1.3138e-07, + "loss": 0.1988, + "step": 6575 + }, + { + "epoch": 3.081232492997199, + "grad_norm": 5.782377243041992, + "learning_rate": 1.3188e-07, + "loss": 0.1736, + "step": 6600 + }, + { + "epoch": 3.092903828197946, + "grad_norm": 11.844609260559082, + "learning_rate": 1.3238e-07, + "loss": 0.1904, + "step": 6625 + }, + { + "epoch": 3.104575163398693, + "grad_norm": 6.240257263183594, + "learning_rate": 1.3287999999999998e-07, + "loss": 0.1605, + "step": 6650 + }, + { + "epoch": 3.11624649859944, + "grad_norm": 12.566492080688477, + "learning_rate": 1.3338e-07, + "loss": 0.1957, + "step": 6675 + }, + { + "epoch": 3.1279178338001867, + "grad_norm": 8.285445213317871, + "learning_rate": 1.3388e-07, + "loss": 0.1801, + "step": 6700 + }, + { + "epoch": 3.1395891690009337, + "grad_norm": 12.288935661315918, + "learning_rate": 1.3438e-07, + "loss": 0.1982, + "step": 6725 + }, + { + "epoch": 3.1512605042016806, + "grad_norm": 7.052362442016602, + "learning_rate": 1.3488e-07, + "loss": 0.1619, + "step": 6750 + }, + { + "epoch": 3.1629318394024275, + "grad_norm": 18.458065032958984, + "learning_rate": 1.3537999999999999e-07, + "loss": 0.1855, + "step": 6775 + }, + { + "epoch": 3.1746031746031744, + "grad_norm": 15.821798324584961, + "learning_rate": 1.3588e-07, + "loss": 0.1476, + "step": 6800 + }, + { + "epoch": 3.186274509803922, + "grad_norm": 11.816914558410645, + "learning_rate": 1.3638e-07, + "loss": 0.1944, + "step": 6825 + }, + { + "epoch": 3.1979458450046687, + "grad_norm": 6.645755767822266, + "learning_rate": 1.3688e-07, + "loss": 0.1678, + "step": 6850 + }, + { + "epoch": 3.2096171802054156, + "grad_norm": 16.721040725708008, + "learning_rate": 1.3738e-07, + "loss": 0.1706, + "step": 6875 + }, + { + "epoch": 3.2212885154061626, + "grad_norm": 8.140375137329102, + "learning_rate": 1.3788e-07, + "loss": 0.1695, + "step": 6900 + }, + { + "epoch": 3.2329598506069095, + "grad_norm": 12.450023651123047, + "learning_rate": 1.3838e-07, + "loss": 0.1797, + "step": 6925 + }, + { + "epoch": 3.2446311858076564, + "grad_norm": 6.419872760772705, + "learning_rate": 1.3888e-07, + "loss": 0.1665, + "step": 6950 + }, + { + "epoch": 3.2563025210084033, + "grad_norm": 10.356698036193848, + "learning_rate": 1.3938e-07, + "loss": 0.1614, + "step": 6975 + }, + { + "epoch": 3.2679738562091503, + "grad_norm": 8.553840637207031, + "learning_rate": 1.3988e-07, + "loss": 0.1616, + "step": 7000 + }, + { + "epoch": 3.279645191409897, + "grad_norm": 11.927959442138672, + "learning_rate": 1.4038e-07, + "loss": 0.1686, + "step": 7025 + }, + { + "epoch": 3.291316526610644, + "grad_norm": 6.493635654449463, + "learning_rate": 1.4088e-07, + "loss": 0.1804, + "step": 7050 + }, + { + "epoch": 3.302987861811391, + "grad_norm": 14.237950325012207, + "learning_rate": 1.4137999999999999e-07, + "loss": 0.1607, + "step": 7075 + }, + { + "epoch": 3.314659197012138, + "grad_norm": 7.410088539123535, + "learning_rate": 1.4188e-07, + "loss": 0.1458, + "step": 7100 + }, + { + "epoch": 3.3263305322128853, + "grad_norm": 10.997467041015625, + "learning_rate": 1.4238e-07, + "loss": 0.1676, + "step": 7125 + }, + { + "epoch": 3.3380018674136323, + "grad_norm": 6.001441955566406, + "learning_rate": 1.4288e-07, + "loss": 0.1655, + "step": 7150 + }, + { + "epoch": 3.349673202614379, + "grad_norm": 14.08969497680664, + "learning_rate": 1.4338e-07, + "loss": 0.1959, + "step": 7175 + }, + { + "epoch": 3.361344537815126, + "grad_norm": 5.801328182220459, + "learning_rate": 1.4388e-07, + "loss": 0.1686, + "step": 7200 + }, + { + "epoch": 3.373015873015873, + "grad_norm": 13.626670837402344, + "learning_rate": 1.4438e-07, + "loss": 0.1986, + "step": 7225 + }, + { + "epoch": 3.38468720821662, + "grad_norm": 6.545166492462158, + "learning_rate": 1.4488e-07, + "loss": 0.1852, + "step": 7250 + }, + { + "epoch": 3.396358543417367, + "grad_norm": 13.894329071044922, + "learning_rate": 1.4538e-07, + "loss": 0.1642, + "step": 7275 + }, + { + "epoch": 3.408029878618114, + "grad_norm": 10.140618324279785, + "learning_rate": 1.4588e-07, + "loss": 0.177, + "step": 7300 + }, + { + "epoch": 3.4197012138188607, + "grad_norm": 18.14762306213379, + "learning_rate": 1.4638e-07, + "loss": 0.1856, + "step": 7325 + }, + { + "epoch": 3.431372549019608, + "grad_norm": 5.81195592880249, + "learning_rate": 1.4688e-07, + "loss": 0.1701, + "step": 7350 + }, + { + "epoch": 3.443043884220355, + "grad_norm": 12.958548545837402, + "learning_rate": 1.4738000000000001e-07, + "loss": 0.1755, + "step": 7375 + }, + { + "epoch": 3.454715219421102, + "grad_norm": 8.025079727172852, + "learning_rate": 1.4788e-07, + "loss": 0.1801, + "step": 7400 + }, + { + "epoch": 3.466386554621849, + "grad_norm": 19.22530746459961, + "learning_rate": 1.4838e-07, + "loss": 0.1776, + "step": 7425 + }, + { + "epoch": 3.478057889822596, + "grad_norm": 7.672618865966797, + "learning_rate": 1.4888e-07, + "loss": 0.1915, + "step": 7450 + }, + { + "epoch": 3.4897292250233427, + "grad_norm": 12.312602043151855, + "learning_rate": 1.4938e-07, + "loss": 0.1888, + "step": 7475 + }, + { + "epoch": 3.5014005602240896, + "grad_norm": 7.824102878570557, + "learning_rate": 1.4988000000000002e-07, + "loss": 0.1725, + "step": 7500 + }, + { + "epoch": 3.5130718954248366, + "grad_norm": 11.86865234375, + "learning_rate": 1.5038e-07, + "loss": 0.1996, + "step": 7525 + }, + { + "epoch": 3.5247432306255835, + "grad_norm": 6.472956657409668, + "learning_rate": 1.5087999999999999e-07, + "loss": 0.1807, + "step": 7550 + }, + { + "epoch": 3.5364145658263304, + "grad_norm": 14.855595588684082, + "learning_rate": 1.5137999999999997e-07, + "loss": 0.1665, + "step": 7575 + }, + { + "epoch": 3.5480859010270773, + "grad_norm": 5.422650337219238, + "learning_rate": 1.5187999999999998e-07, + "loss": 0.169, + "step": 7600 + }, + { + "epoch": 3.5597572362278243, + "grad_norm": 13.586644172668457, + "learning_rate": 1.5238e-07, + "loss": 0.1886, + "step": 7625 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 7.154773712158203, + "learning_rate": 1.5287999999999998e-07, + "loss": 0.1772, + "step": 7650 + }, + { + "epoch": 3.5830999066293185, + "grad_norm": 15.92589282989502, + "learning_rate": 1.5338e-07, + "loss": 0.1603, + "step": 7675 + }, + { + "epoch": 3.5947712418300655, + "grad_norm": 4.725268363952637, + "learning_rate": 1.5387999999999997e-07, + "loss": 0.1579, + "step": 7700 + }, + { + "epoch": 3.6064425770308124, + "grad_norm": 10.37312126159668, + "learning_rate": 1.5437999999999998e-07, + "loss": 0.1556, + "step": 7725 + }, + { + "epoch": 3.6181139122315593, + "grad_norm": 3.6106224060058594, + "learning_rate": 1.5488e-07, + "loss": 0.1753, + "step": 7750 + }, + { + "epoch": 3.6297852474323062, + "grad_norm": 13.736579895019531, + "learning_rate": 1.5537999999999998e-07, + "loss": 0.1932, + "step": 7775 + }, + { + "epoch": 3.641456582633053, + "grad_norm": 7.333006381988525, + "learning_rate": 1.5588e-07, + "loss": 0.1688, + "step": 7800 + }, + { + "epoch": 3.6531279178338, + "grad_norm": 16.784841537475586, + "learning_rate": 1.5637999999999997e-07, + "loss": 0.1999, + "step": 7825 + }, + { + "epoch": 3.664799253034547, + "grad_norm": 9.547866821289062, + "learning_rate": 1.5687999999999999e-07, + "loss": 0.1852, + "step": 7850 + }, + { + "epoch": 3.6764705882352944, + "grad_norm": 14.133809089660645, + "learning_rate": 1.5738e-07, + "loss": 0.1554, + "step": 7875 + }, + { + "epoch": 3.6881419234360413, + "grad_norm": 7.968010425567627, + "learning_rate": 1.5787999999999998e-07, + "loss": 0.1649, + "step": 7900 + }, + { + "epoch": 3.6998132586367882, + "grad_norm": 12.247528076171875, + "learning_rate": 1.5838e-07, + "loss": 0.1975, + "step": 7925 + }, + { + "epoch": 3.711484593837535, + "grad_norm": 6.5526323318481445, + "learning_rate": 1.5887999999999998e-07, + "loss": 0.1808, + "step": 7950 + }, + { + "epoch": 3.723155929038282, + "grad_norm": 11.869317054748535, + "learning_rate": 1.5938e-07, + "loss": 0.1796, + "step": 7975 + }, + { + "epoch": 3.734827264239029, + "grad_norm": 7.336709499359131, + "learning_rate": 1.5988e-07, + "loss": 0.179, + "step": 8000 + }, + { + "epoch": 3.746498599439776, + "grad_norm": 21.456043243408203, + "learning_rate": 1.6037999999999998e-07, + "loss": 0.1854, + "step": 8025 + }, + { + "epoch": 3.758169934640523, + "grad_norm": 5.577650547027588, + "learning_rate": 1.6088e-07, + "loss": 0.1771, + "step": 8050 + }, + { + "epoch": 3.7698412698412698, + "grad_norm": 18.03679084777832, + "learning_rate": 1.6137999999999998e-07, + "loss": 0.1849, + "step": 8075 + }, + { + "epoch": 3.7815126050420167, + "grad_norm": 6.453721046447754, + "learning_rate": 1.6188e-07, + "loss": 0.1715, + "step": 8100 + }, + { + "epoch": 3.7931839402427636, + "grad_norm": 11.65691089630127, + "learning_rate": 1.6238e-07, + "loss": 0.1721, + "step": 8125 + }, + { + "epoch": 3.8048552754435105, + "grad_norm": 5.9379963874816895, + "learning_rate": 1.6288e-07, + "loss": 0.1691, + "step": 8150 + }, + { + "epoch": 3.8165266106442575, + "grad_norm": 16.275161743164062, + "learning_rate": 1.6338e-07, + "loss": 0.187, + "step": 8175 + }, + { + "epoch": 3.828197945845005, + "grad_norm": 4.220703125, + "learning_rate": 1.6387999999999998e-07, + "loss": 0.1973, + "step": 8200 + }, + { + "epoch": 3.8398692810457518, + "grad_norm": 13.512842178344727, + "learning_rate": 1.6438e-07, + "loss": 0.2035, + "step": 8225 + }, + { + "epoch": 3.8515406162464987, + "grad_norm": 4.128376007080078, + "learning_rate": 1.6487999999999998e-07, + "loss": 0.1761, + "step": 8250 + }, + { + "epoch": 3.8632119514472456, + "grad_norm": 15.425586700439453, + "learning_rate": 1.6538e-07, + "loss": 0.176, + "step": 8275 + }, + { + "epoch": 3.8748832866479925, + "grad_norm": 6.103633403778076, + "learning_rate": 1.6588e-07, + "loss": 0.1643, + "step": 8300 + }, + { + "epoch": 3.8865546218487395, + "grad_norm": 15.449716567993164, + "learning_rate": 1.6637999999999999e-07, + "loss": 0.1892, + "step": 8325 + }, + { + "epoch": 3.8982259570494864, + "grad_norm": 5.736420154571533, + "learning_rate": 1.6688e-07, + "loss": 0.1849, + "step": 8350 + }, + { + "epoch": 3.9098972922502333, + "grad_norm": 8.855688095092773, + "learning_rate": 1.6737999999999998e-07, + "loss": 0.178, + "step": 8375 + }, + { + "epoch": 3.9215686274509802, + "grad_norm": 6.678255558013916, + "learning_rate": 1.6788e-07, + "loss": 0.1574, + "step": 8400 + }, + { + "epoch": 3.9332399626517276, + "grad_norm": 15.251968383789062, + "learning_rate": 1.6838e-07, + "loss": 0.186, + "step": 8425 + }, + { + "epoch": 3.9449112978524745, + "grad_norm": 4.868924140930176, + "learning_rate": 1.6888e-07, + "loss": 0.1476, + "step": 8450 + }, + { + "epoch": 3.9565826330532214, + "grad_norm": 13.231505393981934, + "learning_rate": 1.6938e-07, + "loss": 0.1432, + "step": 8475 + }, + { + "epoch": 3.9682539682539684, + "grad_norm": 6.0398478507995605, + "learning_rate": 1.6987999999999998e-07, + "loss": 0.1835, + "step": 8500 + }, + { + "epoch": 3.9799253034547153, + "grad_norm": 20.359071731567383, + "learning_rate": 1.7038e-07, + "loss": 0.187, + "step": 8525 + }, + { + "epoch": 3.991596638655462, + "grad_norm": 5.61522912979126, + "learning_rate": 1.7088e-07, + "loss": 0.1767, + "step": 8550 + }, + { + "epoch": 4.003267973856209, + "grad_norm": 5.084539413452148, + "learning_rate": 1.7138e-07, + "loss": 0.1768, + "step": 8575 + }, + { + "epoch": 4.014939309056956, + "grad_norm": 8.967703819274902, + "learning_rate": 1.7188e-07, + "loss": 0.1259, + "step": 8600 + }, + { + "epoch": 4.026610644257703, + "grad_norm": 5.355931758880615, + "learning_rate": 1.7236000000000002e-07, + "loss": 0.1866, + "step": 8625 + }, + { + "epoch": 4.03828197945845, + "grad_norm": 8.791220664978027, + "learning_rate": 1.7286e-07, + "loss": 0.1286, + "step": 8650 + }, + { + "epoch": 4.049953314659197, + "grad_norm": 6.436952590942383, + "learning_rate": 1.7335999999999999e-07, + "loss": 0.1782, + "step": 8675 + }, + { + "epoch": 4.061624649859944, + "grad_norm": 7.118254661560059, + "learning_rate": 1.7385999999999997e-07, + "loss": 0.1487, + "step": 8700 + }, + { + "epoch": 4.073295985060691, + "grad_norm": 4.484027862548828, + "learning_rate": 1.7435999999999998e-07, + "loss": 0.1915, + "step": 8725 + }, + { + "epoch": 4.084967320261438, + "grad_norm": 6.743505477905273, + "learning_rate": 1.7486e-07, + "loss": 0.1209, + "step": 8750 + }, + { + "epoch": 4.0966386554621845, + "grad_norm": 4.635137557983398, + "learning_rate": 1.7535999999999998e-07, + "loss": 0.1897, + "step": 8775 + }, + { + "epoch": 4.1083099906629315, + "grad_norm": 4.966923713684082, + "learning_rate": 1.7586e-07, + "loss": 0.1394, + "step": 8800 + }, + { + "epoch": 4.119981325863678, + "grad_norm": 4.779516696929932, + "learning_rate": 1.7635999999999997e-07, + "loss": 0.1839, + "step": 8825 + }, + { + "epoch": 4.131652661064426, + "grad_norm": 8.51559066772461, + "learning_rate": 1.7685999999999998e-07, + "loss": 0.1169, + "step": 8850 + }, + { + "epoch": 4.143323996265173, + "grad_norm": 4.456249237060547, + "learning_rate": 1.7736e-07, + "loss": 0.1903, + "step": 8875 + }, + { + "epoch": 4.15499533146592, + "grad_norm": 5.5919671058654785, + "learning_rate": 1.7785999999999998e-07, + "loss": 0.1338, + "step": 8900 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 5.427141189575195, + "learning_rate": 1.7836e-07, + "loss": 0.1815, + "step": 8925 + }, + { + "epoch": 4.178338001867414, + "grad_norm": 5.67875337600708, + "learning_rate": 1.7885999999999998e-07, + "loss": 0.1258, + "step": 8950 + }, + { + "epoch": 4.190009337068161, + "grad_norm": 5.330212116241455, + "learning_rate": 1.7935999999999999e-07, + "loss": 0.1972, + "step": 8975 + }, + { + "epoch": 4.201680672268908, + "grad_norm": 7.678745269775391, + "learning_rate": 1.7985999999999997e-07, + "loss": 0.1486, + "step": 9000 + }, + { + "epoch": 4.213352007469655, + "grad_norm": 6.018349647521973, + "learning_rate": 1.8035999999999998e-07, + "loss": 0.1717, + "step": 9025 + }, + { + "epoch": 4.225023342670402, + "grad_norm": 6.979328155517578, + "learning_rate": 1.8086e-07, + "loss": 0.1444, + "step": 9050 + }, + { + "epoch": 4.2366946778711485, + "grad_norm": 4.748338222503662, + "learning_rate": 1.8135999999999998e-07, + "loss": 0.1776, + "step": 9075 + }, + { + "epoch": 4.248366013071895, + "grad_norm": 8.463362693786621, + "learning_rate": 1.8186e-07, + "loss": 0.1377, + "step": 9100 + }, + { + "epoch": 4.260037348272642, + "grad_norm": 4.122219085693359, + "learning_rate": 1.8235999999999997e-07, + "loss": 0.1752, + "step": 9125 + }, + { + "epoch": 4.271708683473389, + "grad_norm": 5.625559329986572, + "learning_rate": 1.8285999999999998e-07, + "loss": 0.1301, + "step": 9150 + }, + { + "epoch": 4.283380018674136, + "grad_norm": 5.3939385414123535, + "learning_rate": 1.8336e-07, + "loss": 0.193, + "step": 9175 + }, + { + "epoch": 4.295051353874883, + "grad_norm": 6.4233551025390625, + "learning_rate": 1.8385999999999998e-07, + "loss": 0.1424, + "step": 9200 + }, + { + "epoch": 4.30672268907563, + "grad_norm": 6.088770866394043, + "learning_rate": 1.8436e-07, + "loss": 0.1863, + "step": 9225 + }, + { + "epoch": 4.318394024276377, + "grad_norm": 8.557315826416016, + "learning_rate": 1.8485999999999998e-07, + "loss": 0.1227, + "step": 9250 + }, + { + "epoch": 4.330065359477124, + "grad_norm": 5.410427570343018, + "learning_rate": 1.8536e-07, + "loss": 0.1942, + "step": 9275 + }, + { + "epoch": 4.341736694677871, + "grad_norm": 4.211329460144043, + "learning_rate": 1.8586e-07, + "loss": 0.1457, + "step": 9300 + }, + { + "epoch": 4.353408029878618, + "grad_norm": 4.537903308868408, + "learning_rate": 1.8635999999999998e-07, + "loss": 0.18, + "step": 9325 + }, + { + "epoch": 4.365079365079365, + "grad_norm": 7.43745231628418, + "learning_rate": 1.8686e-07, + "loss": 0.1405, + "step": 9350 + }, + { + "epoch": 4.3767507002801125, + "grad_norm": 4.6163763999938965, + "learning_rate": 1.8735999999999998e-07, + "loss": 0.184, + "step": 9375 + }, + { + "epoch": 4.388422035480859, + "grad_norm": 4.933877944946289, + "learning_rate": 1.8786e-07, + "loss": 0.1333, + "step": 9400 + }, + { + "epoch": 4.400093370681606, + "grad_norm": 6.013834476470947, + "learning_rate": 1.8836e-07, + "loss": 0.1765, + "step": 9425 + }, + { + "epoch": 4.411764705882353, + "grad_norm": 4.945307731628418, + "learning_rate": 1.8885999999999999e-07, + "loss": 0.1286, + "step": 9450 + }, + { + "epoch": 4.4234360410831, + "grad_norm": 3.952646017074585, + "learning_rate": 1.8936e-07, + "loss": 0.1942, + "step": 9475 + }, + { + "epoch": 4.435107376283847, + "grad_norm": 8.337225914001465, + "learning_rate": 1.8985999999999998e-07, + "loss": 0.1381, + "step": 9500 + }, + { + "epoch": 4.446778711484594, + "grad_norm": 6.671125888824463, + "learning_rate": 1.9036e-07, + "loss": 0.1985, + "step": 9525 + }, + { + "epoch": 4.458450046685341, + "grad_norm": 6.973220348358154, + "learning_rate": 1.9086e-07, + "loss": 0.1275, + "step": 9550 + }, + { + "epoch": 4.470121381886088, + "grad_norm": 5.624568939208984, + "learning_rate": 1.9136e-07, + "loss": 0.1704, + "step": 9575 + }, + { + "epoch": 4.481792717086835, + "grad_norm": 7.6258745193481445, + "learning_rate": 1.9186e-07, + "loss": 0.1518, + "step": 9600 + }, + { + "epoch": 4.493464052287582, + "grad_norm": 4.0021185874938965, + "learning_rate": 1.9235999999999998e-07, + "loss": 0.1953, + "step": 9625 + }, + { + "epoch": 4.505135387488329, + "grad_norm": 6.774437427520752, + "learning_rate": 1.9286e-07, + "loss": 0.1527, + "step": 9650 + }, + { + "epoch": 4.516806722689076, + "grad_norm": 5.060838222503662, + "learning_rate": 1.9336e-07, + "loss": 0.2042, + "step": 9675 + }, + { + "epoch": 4.5284780578898225, + "grad_norm": 5.490878582000732, + "learning_rate": 1.9386e-07, + "loss": 0.1299, + "step": 9700 + }, + { + "epoch": 4.540149393090569, + "grad_norm": 5.598012447357178, + "learning_rate": 1.9436e-07, + "loss": 0.1717, + "step": 9725 + }, + { + "epoch": 4.551820728291316, + "grad_norm": 5.59892463684082, + "learning_rate": 1.9485999999999999e-07, + "loss": 0.1244, + "step": 9750 + }, + { + "epoch": 4.563492063492063, + "grad_norm": 4.751144886016846, + "learning_rate": 1.9536e-07, + "loss": 0.161, + "step": 9775 + }, + { + "epoch": 4.57516339869281, + "grad_norm": 6.7092671394348145, + "learning_rate": 1.9586e-07, + "loss": 0.1416, + "step": 9800 + }, + { + "epoch": 4.586834733893557, + "grad_norm": 4.288263320922852, + "learning_rate": 1.9636e-07, + "loss": 0.171, + "step": 9825 + }, + { + "epoch": 4.598506069094304, + "grad_norm": 8.770625114440918, + "learning_rate": 1.9686e-07, + "loss": 0.1334, + "step": 9850 + }, + { + "epoch": 4.610177404295051, + "grad_norm": 5.096324443817139, + "learning_rate": 1.9736e-07, + "loss": 0.1988, + "step": 9875 + }, + { + "epoch": 4.621848739495798, + "grad_norm": 4.740445613861084, + "learning_rate": 1.9786e-07, + "loss": 0.1476, + "step": 9900 + }, + { + "epoch": 4.633520074696545, + "grad_norm": 4.8285956382751465, + "learning_rate": 1.9836e-07, + "loss": 0.1912, + "step": 9925 + }, + { + "epoch": 4.645191409897293, + "grad_norm": 4.7548346519470215, + "learning_rate": 1.9886e-07, + "loss": 0.1305, + "step": 9950 + }, + { + "epoch": 4.6568627450980395, + "grad_norm": 4.447470188140869, + "learning_rate": 1.9936e-07, + "loss": 0.2013, + "step": 9975 + }, + { + "epoch": 4.6685340802987865, + "grad_norm": 6.167608261108398, + "learning_rate": 1.9986e-07, + "loss": 0.1471, + "step": 10000 + }, + { + "epoch": 4.6685340802987865, + "eval_loss": 0.15147170424461365, + "eval_runtime": 6575.0605, + "eval_samples_per_second": 1.432, + "eval_steps_per_second": 0.179, + "eval_wer": 0.09961246568706604, + "step": 10000 + }, + { + "epoch": 4.680205415499533, + "grad_norm": 3.9752037525177, + "learning_rate": 2.0036e-07, + "loss": 0.1705, + "step": 10025 + }, + { + "epoch": 4.69187675070028, + "grad_norm": 9.894227981567383, + "learning_rate": 2.0086e-07, + "loss": 0.1178, + "step": 10050 + }, + { + "epoch": 4.703548085901027, + "grad_norm": 5.56553840637207, + "learning_rate": 2.0136e-07, + "loss": 0.1849, + "step": 10075 + }, + { + "epoch": 4.715219421101774, + "grad_norm": 8.528691291809082, + "learning_rate": 2.0186e-07, + "loss": 0.1402, + "step": 10100 + }, + { + "epoch": 4.726890756302521, + "grad_norm": 5.351251125335693, + "learning_rate": 2.0236e-07, + "loss": 0.1812, + "step": 10125 + }, + { + "epoch": 4.738562091503268, + "grad_norm": 6.408919334411621, + "learning_rate": 2.0286e-07, + "loss": 0.132, + "step": 10150 + }, + { + "epoch": 4.750233426704015, + "grad_norm": 4.955003261566162, + "learning_rate": 2.0336000000000002e-07, + "loss": 0.1827, + "step": 10175 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 9.441489219665527, + "learning_rate": 2.0386e-07, + "loss": 0.1711, + "step": 10200 + }, + { + "epoch": 4.773576097105509, + "grad_norm": 4.768829822540283, + "learning_rate": 2.0436e-07, + "loss": 0.1839, + "step": 10225 + }, + { + "epoch": 4.785247432306256, + "grad_norm": 8.283427238464355, + "learning_rate": 2.0485999999999997e-07, + "loss": 0.1308, + "step": 10250 + }, + { + "epoch": 4.796918767507003, + "grad_norm": 4.502756118774414, + "learning_rate": 2.0535999999999998e-07, + "loss": 0.1939, + "step": 10275 + }, + { + "epoch": 4.80859010270775, + "grad_norm": 6.445580959320068, + "learning_rate": 2.0585999999999997e-07, + "loss": 0.1366, + "step": 10300 + }, + { + "epoch": 4.8202614379084965, + "grad_norm": 5.302786350250244, + "learning_rate": 2.0635999999999998e-07, + "loss": 0.1733, + "step": 10325 + }, + { + "epoch": 4.831932773109243, + "grad_norm": 7.272347927093506, + "learning_rate": 2.0686e-07, + "loss": 0.1203, + "step": 10350 + }, + { + "epoch": 4.84360410830999, + "grad_norm": 4.1720170974731445, + "learning_rate": 2.0735999999999997e-07, + "loss": 0.1725, + "step": 10375 + }, + { + "epoch": 4.855275443510737, + "grad_norm": 4.301048755645752, + "learning_rate": 2.0785999999999998e-07, + "loss": 0.1177, + "step": 10400 + }, + { + "epoch": 4.866946778711485, + "grad_norm": 4.472489356994629, + "learning_rate": 2.0835999999999997e-07, + "loss": 0.1855, + "step": 10425 + }, + { + "epoch": 4.878618113912232, + "grad_norm": 7.996962070465088, + "learning_rate": 2.0885999999999998e-07, + "loss": 0.1327, + "step": 10450 + }, + { + "epoch": 4.890289449112979, + "grad_norm": 6.440398693084717, + "learning_rate": 2.0936e-07, + "loss": 0.1955, + "step": 10475 + }, + { + "epoch": 4.901960784313726, + "grad_norm": 7.208395481109619, + "learning_rate": 2.0985999999999997e-07, + "loss": 0.1387, + "step": 10500 + }, + { + "epoch": 4.913632119514473, + "grad_norm": 5.004977703094482, + "learning_rate": 2.1035999999999999e-07, + "loss": 0.1633, + "step": 10525 + }, + { + "epoch": 4.92530345471522, + "grad_norm": 3.838132381439209, + "learning_rate": 2.1085999999999997e-07, + "loss": 0.1151, + "step": 10550 + }, + { + "epoch": 4.936974789915967, + "grad_norm": 6.472508430480957, + "learning_rate": 2.1135999999999998e-07, + "loss": 0.1954, + "step": 10575 + }, + { + "epoch": 4.9486461251167135, + "grad_norm": 5.543705940246582, + "learning_rate": 2.1186e-07, + "loss": 0.1317, + "step": 10600 + }, + { + "epoch": 4.9603174603174605, + "grad_norm": 6.308438301086426, + "learning_rate": 2.1235999999999998e-07, + "loss": 0.1586, + "step": 10625 + }, + { + "epoch": 4.971988795518207, + "grad_norm": 7.787223815917969, + "learning_rate": 2.1286e-07, + "loss": 0.1266, + "step": 10650 + }, + { + "epoch": 4.983660130718954, + "grad_norm": 4.786161422729492, + "learning_rate": 2.1335999999999997e-07, + "loss": 0.2183, + "step": 10675 + }, + { + "epoch": 4.995331465919701, + "grad_norm": 5.482990264892578, + "learning_rate": 2.1385999999999998e-07, + "loss": 0.13, + "step": 10700 + }, + { + "epoch": 5.007002801120448, + "grad_norm": 4.17052698135376, + "learning_rate": 2.1434e-07, + "loss": 0.168, + "step": 10725 + }, + { + "epoch": 5.018674136321195, + "grad_norm": 7.545019149780273, + "learning_rate": 2.1483999999999998e-07, + "loss": 0.1227, + "step": 10750 + }, + { + "epoch": 5.030345471521942, + "grad_norm": 6.398622512817383, + "learning_rate": 2.1534e-07, + "loss": 0.1802, + "step": 10775 + }, + { + "epoch": 5.042016806722689, + "grad_norm": 6.926197052001953, + "learning_rate": 2.1584e-07, + "loss": 0.1188, + "step": 10800 + }, + { + "epoch": 5.053688141923436, + "grad_norm": 5.543834686279297, + "learning_rate": 2.1634e-07, + "loss": 0.1689, + "step": 10825 + }, + { + "epoch": 5.065359477124183, + "grad_norm": 5.125446796417236, + "learning_rate": 2.1684e-07, + "loss": 0.1124, + "step": 10850 + }, + { + "epoch": 5.07703081232493, + "grad_norm": 4.485465049743652, + "learning_rate": 2.1733999999999999e-07, + "loss": 0.1604, + "step": 10875 + }, + { + "epoch": 5.088702147525677, + "grad_norm": 12.635501861572266, + "learning_rate": 2.1784e-07, + "loss": 0.1181, + "step": 10900 + }, + { + "epoch": 5.1003734827264235, + "grad_norm": 6.018717288970947, + "learning_rate": 2.1834e-07, + "loss": 0.1523, + "step": 10925 + }, + { + "epoch": 5.1120448179271705, + "grad_norm": 8.683155059814453, + "learning_rate": 2.1884e-07, + "loss": 0.1214, + "step": 10950 + }, + { + "epoch": 5.123716153127917, + "grad_norm": 4.261901378631592, + "learning_rate": 2.1934e-07, + "loss": 0.1792, + "step": 10975 + }, + { + "epoch": 5.135387488328665, + "grad_norm": 7.0739264488220215, + "learning_rate": 2.1984e-07, + "loss": 0.1161, + "step": 11000 + }, + { + "epoch": 5.147058823529412, + "grad_norm": 6.2149529457092285, + "learning_rate": 2.2034e-07, + "loss": 0.1545, + "step": 11025 + }, + { + "epoch": 5.158730158730159, + "grad_norm": 9.735761642456055, + "learning_rate": 2.2084e-07, + "loss": 0.123, + "step": 11050 + }, + { + "epoch": 5.170401493930906, + "grad_norm": 2.7549943923950195, + "learning_rate": 2.2134e-07, + "loss": 0.1651, + "step": 11075 + }, + { + "epoch": 5.182072829131653, + "grad_norm": 13.182941436767578, + "learning_rate": 2.2184e-07, + "loss": 0.1333, + "step": 11100 + }, + { + "epoch": 5.1937441643324, + "grad_norm": 5.390936851501465, + "learning_rate": 2.2234e-07, + "loss": 0.1825, + "step": 11125 + }, + { + "epoch": 5.205415499533147, + "grad_norm": 5.555058479309082, + "learning_rate": 2.2284e-07, + "loss": 0.109, + "step": 11150 + }, + { + "epoch": 5.217086834733894, + "grad_norm": 3.377044916152954, + "learning_rate": 2.2334000000000001e-07, + "loss": 0.1604, + "step": 11175 + }, + { + "epoch": 5.228758169934641, + "grad_norm": 5.754917621612549, + "learning_rate": 2.2384e-07, + "loss": 0.1167, + "step": 11200 + }, + { + "epoch": 5.2404295051353875, + "grad_norm": 5.694931507110596, + "learning_rate": 2.2434e-07, + "loss": 0.1642, + "step": 11225 + }, + { + "epoch": 5.2521008403361344, + "grad_norm": 8.598726272583008, + "learning_rate": 2.2484e-07, + "loss": 0.1272, + "step": 11250 + }, + { + "epoch": 5.263772175536881, + "grad_norm": 5.686309814453125, + "learning_rate": 2.2534e-07, + "loss": 0.182, + "step": 11275 + }, + { + "epoch": 5.275443510737628, + "grad_norm": 7.420335292816162, + "learning_rate": 2.2584000000000002e-07, + "loss": 0.1181, + "step": 11300 + }, + { + "epoch": 5.287114845938375, + "grad_norm": 6.151350498199463, + "learning_rate": 2.2634e-07, + "loss": 0.1614, + "step": 11325 + }, + { + "epoch": 5.298786181139122, + "grad_norm": 7.9199957847595215, + "learning_rate": 2.2684e-07, + "loss": 0.13, + "step": 11350 + }, + { + "epoch": 5.310457516339869, + "grad_norm": 5.582814693450928, + "learning_rate": 2.2733999999999997e-07, + "loss": 0.1994, + "step": 11375 + }, + { + "epoch": 5.322128851540616, + "grad_norm": 8.254546165466309, + "learning_rate": 2.2783999999999998e-07, + "loss": 0.1087, + "step": 11400 + }, + { + "epoch": 5.333800186741363, + "grad_norm": 3.0663414001464844, + "learning_rate": 2.2833999999999997e-07, + "loss": 0.1602, + "step": 11425 + }, + { + "epoch": 5.34547152194211, + "grad_norm": 4.976311683654785, + "learning_rate": 2.2883999999999998e-07, + "loss": 0.1125, + "step": 11450 + }, + { + "epoch": 5.357142857142857, + "grad_norm": 5.26088285446167, + "learning_rate": 2.2934e-07, + "loss": 0.1631, + "step": 11475 + }, + { + "epoch": 5.368814192343605, + "grad_norm": 8.958911895751953, + "learning_rate": 2.2983999999999997e-07, + "loss": 0.1129, + "step": 11500 + }, + { + "epoch": 5.3804855275443515, + "grad_norm": 3.8142902851104736, + "learning_rate": 2.3033999999999998e-07, + "loss": 0.2008, + "step": 11525 + }, + { + "epoch": 5.392156862745098, + "grad_norm": 7.608828067779541, + "learning_rate": 2.3083999999999997e-07, + "loss": 0.1166, + "step": 11550 + }, + { + "epoch": 5.403828197945845, + "grad_norm": 4.0368475914001465, + "learning_rate": 2.3133999999999998e-07, + "loss": 0.154, + "step": 11575 + }, + { + "epoch": 5.415499533146592, + "grad_norm": 10.697487831115723, + "learning_rate": 2.3184e-07, + "loss": 0.1306, + "step": 11600 + }, + { + "epoch": 5.427170868347339, + "grad_norm": 7.596348762512207, + "learning_rate": 2.3233999999999997e-07, + "loss": 0.1723, + "step": 11625 + }, + { + "epoch": 5.438842203548086, + "grad_norm": 9.744882583618164, + "learning_rate": 2.3283999999999999e-07, + "loss": 0.129, + "step": 11650 + }, + { + "epoch": 5.450513538748833, + "grad_norm": 6.707164287567139, + "learning_rate": 2.3333999999999997e-07, + "loss": 0.1933, + "step": 11675 + }, + { + "epoch": 5.46218487394958, + "grad_norm": 5.012074947357178, + "learning_rate": 2.3383999999999998e-07, + "loss": 0.1223, + "step": 11700 + }, + { + "epoch": 5.473856209150327, + "grad_norm": 4.564844608306885, + "learning_rate": 2.3434e-07, + "loss": 0.1619, + "step": 11725 + }, + { + "epoch": 5.485527544351074, + "grad_norm": 6.225306034088135, + "learning_rate": 2.3483999999999998e-07, + "loss": 0.1143, + "step": 11750 + }, + { + "epoch": 5.497198879551821, + "grad_norm": 5.616468906402588, + "learning_rate": 2.3534e-07, + "loss": 0.1641, + "step": 11775 + }, + { + "epoch": 5.508870214752568, + "grad_norm": 5.898648738861084, + "learning_rate": 2.3583999999999997e-07, + "loss": 0.128, + "step": 11800 + }, + { + "epoch": 5.520541549953315, + "grad_norm": 5.743541717529297, + "learning_rate": 2.3633999999999998e-07, + "loss": 0.1671, + "step": 11825 + }, + { + "epoch": 5.5322128851540615, + "grad_norm": 9.724596977233887, + "learning_rate": 2.3684e-07, + "loss": 0.1263, + "step": 11850 + }, + { + "epoch": 5.543884220354808, + "grad_norm": 5.801641464233398, + "learning_rate": 2.3733999999999998e-07, + "loss": 0.1555, + "step": 11875 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 6.424407958984375, + "learning_rate": 2.3784e-07, + "loss": 0.1172, + "step": 11900 + }, + { + "epoch": 5.567226890756302, + "grad_norm": 4.034692764282227, + "learning_rate": 2.3833999999999998e-07, + "loss": 0.1539, + "step": 11925 + }, + { + "epoch": 5.578898225957049, + "grad_norm": 7.315247058868408, + "learning_rate": 2.3884e-07, + "loss": 0.1376, + "step": 11950 + }, + { + "epoch": 5.590569561157796, + "grad_norm": 4.622725963592529, + "learning_rate": 2.3933999999999997e-07, + "loss": 0.1555, + "step": 11975 + }, + { + "epoch": 5.602240896358543, + "grad_norm": 7.289337635040283, + "learning_rate": 2.3984e-07, + "loss": 0.1465, + "step": 12000 + }, + { + "epoch": 5.61391223155929, + "grad_norm": 5.700815677642822, + "learning_rate": 2.4034e-07, + "loss": 0.1753, + "step": 12025 + }, + { + "epoch": 5.625583566760037, + "grad_norm": 10.472694396972656, + "learning_rate": 2.4084e-07, + "loss": 0.1078, + "step": 12050 + }, + { + "epoch": 5.637254901960784, + "grad_norm": 6.967726707458496, + "learning_rate": 2.4133999999999996e-07, + "loss": 0.1628, + "step": 12075 + }, + { + "epoch": 5.648926237161532, + "grad_norm": 5.44551944732666, + "learning_rate": 2.4184e-07, + "loss": 0.1265, + "step": 12100 + }, + { + "epoch": 5.660597572362279, + "grad_norm": 3.403899669647217, + "learning_rate": 2.4234e-07, + "loss": 0.1646, + "step": 12125 + }, + { + "epoch": 5.6722689075630255, + "grad_norm": 6.885541915893555, + "learning_rate": 2.4283999999999997e-07, + "loss": 0.1376, + "step": 12150 + }, + { + "epoch": 5.683940242763772, + "grad_norm": 5.3647050857543945, + "learning_rate": 2.4334e-07, + "loss": 0.1683, + "step": 12175 + }, + { + "epoch": 5.695611577964519, + "grad_norm": 9.983818054199219, + "learning_rate": 2.4384e-07, + "loss": 0.1342, + "step": 12200 + }, + { + "epoch": 5.707282913165266, + "grad_norm": 4.701688766479492, + "learning_rate": 2.4434e-07, + "loss": 0.1687, + "step": 12225 + }, + { + "epoch": 5.718954248366013, + "grad_norm": 4.64987850189209, + "learning_rate": 2.4484e-07, + "loss": 0.1083, + "step": 12250 + }, + { + "epoch": 5.73062558356676, + "grad_norm": 6.0408935546875, + "learning_rate": 2.4534e-07, + "loss": 0.1661, + "step": 12275 + }, + { + "epoch": 5.742296918767507, + "grad_norm": 13.088526725769043, + "learning_rate": 2.4584e-07, + "loss": 0.1211, + "step": 12300 + }, + { + "epoch": 5.753968253968254, + "grad_norm": 4.763770580291748, + "learning_rate": 2.4633999999999997e-07, + "loss": 0.1531, + "step": 12325 + }, + { + "epoch": 5.765639589169001, + "grad_norm": 7.281481742858887, + "learning_rate": 2.4684e-07, + "loss": 0.1197, + "step": 12350 + }, + { + "epoch": 5.777310924369748, + "grad_norm": 3.6176838874816895, + "learning_rate": 2.4734e-07, + "loss": 0.1586, + "step": 12375 + }, + { + "epoch": 5.788982259570495, + "grad_norm": 9.852710723876953, + "learning_rate": 2.4784e-07, + "loss": 0.1155, + "step": 12400 + }, + { + "epoch": 5.800653594771242, + "grad_norm": 7.409560680389404, + "learning_rate": 2.4834e-07, + "loss": 0.155, + "step": 12425 + }, + { + "epoch": 5.812324929971989, + "grad_norm": 5.356072425842285, + "learning_rate": 2.4884e-07, + "loss": 0.1158, + "step": 12450 + }, + { + "epoch": 5.8239962651727355, + "grad_norm": 5.186484336853027, + "learning_rate": 2.4934e-07, + "loss": 0.1471, + "step": 12475 + }, + { + "epoch": 5.835667600373482, + "grad_norm": 7.531067848205566, + "learning_rate": 2.4984e-07, + "loss": 0.1174, + "step": 12500 + }, + { + "epoch": 5.847338935574229, + "grad_norm": 5.400341987609863, + "learning_rate": 2.5034e-07, + "loss": 0.1815, + "step": 12525 + }, + { + "epoch": 5.859010270774976, + "grad_norm": 7.280223369598389, + "learning_rate": 2.5084e-07, + "loss": 0.1251, + "step": 12550 + }, + { + "epoch": 5.870681605975724, + "grad_norm": 5.493415832519531, + "learning_rate": 2.5133999999999997e-07, + "loss": 0.1661, + "step": 12575 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 10.021145820617676, + "learning_rate": 2.5184e-07, + "loss": 0.1275, + "step": 12600 + }, + { + "epoch": 5.894024276377218, + "grad_norm": 6.028408050537109, + "learning_rate": 2.5234e-07, + "loss": 0.1629, + "step": 12625 + }, + { + "epoch": 5.905695611577965, + "grad_norm": 4.85552453994751, + "learning_rate": 2.5284e-07, + "loss": 0.1204, + "step": 12650 + }, + { + "epoch": 5.917366946778712, + "grad_norm": 7.91325569152832, + "learning_rate": 2.5334e-07, + "loss": 0.174, + "step": 12675 + }, + { + "epoch": 5.929038281979459, + "grad_norm": 9.452722549438477, + "learning_rate": 2.5384e-07, + "loss": 0.1164, + "step": 12700 + }, + { + "epoch": 5.940709617180206, + "grad_norm": 5.12371826171875, + "learning_rate": 2.5434e-07, + "loss": 0.1568, + "step": 12725 + }, + { + "epoch": 5.9523809523809526, + "grad_norm": 4.421220779418945, + "learning_rate": 2.5484e-07, + "loss": 0.1141, + "step": 12750 + }, + { + "epoch": 5.9640522875816995, + "grad_norm": 2.8665106296539307, + "learning_rate": 2.5534e-07, + "loss": 0.1596, + "step": 12775 + }, + { + "epoch": 5.975723622782446, + "grad_norm": 7.798137187957764, + "learning_rate": 2.5584e-07, + "loss": 0.1087, + "step": 12800 + }, + { + "epoch": 5.987394957983193, + "grad_norm": 7.315576076507568, + "learning_rate": 2.5634e-07, + "loss": 0.1696, + "step": 12825 + }, + { + "epoch": 5.99906629318394, + "grad_norm": 7.312651634216309, + "learning_rate": 2.5684e-07, + "loss": 0.1194, + "step": 12850 + }, + { + "epoch": 6.010737628384687, + "grad_norm": 4.936952590942383, + "learning_rate": 2.5732e-07, + "loss": 0.1514, + "step": 12875 + }, + { + "epoch": 6.022408963585434, + "grad_norm": 10.799747467041016, + "learning_rate": 2.5781999999999996e-07, + "loss": 0.1231, + "step": 12900 + }, + { + "epoch": 6.034080298786181, + "grad_norm": 4.585947036743164, + "learning_rate": 2.5832e-07, + "loss": 0.1211, + "step": 12925 + }, + { + "epoch": 6.045751633986928, + "grad_norm": 13.68216609954834, + "learning_rate": 2.5882e-07, + "loss": 0.139, + "step": 12950 + }, + { + "epoch": 6.057422969187675, + "grad_norm": 5.997958660125732, + "learning_rate": 2.5931999999999997e-07, + "loss": 0.1375, + "step": 12975 + }, + { + "epoch": 6.069094304388422, + "grad_norm": 11.332950592041016, + "learning_rate": 2.5982e-07, + "loss": 0.1285, + "step": 13000 + }, + { + "epoch": 6.080765639589169, + "grad_norm": 3.158031702041626, + "learning_rate": 2.6032e-07, + "loss": 0.1383, + "step": 13025 + }, + { + "epoch": 6.092436974789916, + "grad_norm": 13.571795463562012, + "learning_rate": 2.6082e-07, + "loss": 0.1301, + "step": 13050 + }, + { + "epoch": 6.104108309990663, + "grad_norm": 6.358757972717285, + "learning_rate": 2.6131999999999996e-07, + "loss": 0.1355, + "step": 13075 + }, + { + "epoch": 6.1157796451914095, + "grad_norm": 12.662508964538574, + "learning_rate": 2.6182e-07, + "loss": 0.1245, + "step": 13100 + }, + { + "epoch": 6.127450980392156, + "grad_norm": 4.365048885345459, + "learning_rate": 2.6232e-07, + "loss": 0.1395, + "step": 13125 + }, + { + "epoch": 6.139122315592904, + "grad_norm": 8.536576271057129, + "learning_rate": 2.6281999999999997e-07, + "loss": 0.1303, + "step": 13150 + }, + { + "epoch": 6.150793650793651, + "grad_norm": 2.988816738128662, + "learning_rate": 2.6332e-07, + "loss": 0.1242, + "step": 13175 + }, + { + "epoch": 6.162464985994398, + "grad_norm": 8.541171073913574, + "learning_rate": 2.6382e-07, + "loss": 0.1184, + "step": 13200 + }, + { + "epoch": 6.174136321195145, + "grad_norm": 5.187004566192627, + "learning_rate": 2.6432e-07, + "loss": 0.1415, + "step": 13225 + }, + { + "epoch": 6.185807656395892, + "grad_norm": 9.733490943908691, + "learning_rate": 2.6482e-07, + "loss": 0.11, + "step": 13250 + }, + { + "epoch": 6.197478991596639, + "grad_norm": 3.2871172428131104, + "learning_rate": 2.6532e-07, + "loss": 0.1563, + "step": 13275 + }, + { + "epoch": 6.209150326797386, + "grad_norm": 20.811479568481445, + "learning_rate": 2.6582e-07, + "loss": 0.1341, + "step": 13300 + }, + { + "epoch": 6.220821661998133, + "grad_norm": 5.399178504943848, + "learning_rate": 2.6631999999999997e-07, + "loss": 0.1308, + "step": 13325 + }, + { + "epoch": 6.23249299719888, + "grad_norm": 10.317353248596191, + "learning_rate": 2.6682e-07, + "loss": 0.1268, + "step": 13350 + }, + { + "epoch": 6.2441643323996265, + "grad_norm": 7.681791305541992, + "learning_rate": 2.6732e-07, + "loss": 0.1449, + "step": 13375 + }, + { + "epoch": 6.2558356676003735, + "grad_norm": 12.44479751586914, + "learning_rate": 2.6781999999999997e-07, + "loss": 0.1228, + "step": 13400 + }, + { + "epoch": 6.26750700280112, + "grad_norm": 5.903497695922852, + "learning_rate": 2.6832e-07, + "loss": 0.1262, + "step": 13425 + }, + { + "epoch": 6.279178338001867, + "grad_norm": 17.685346603393555, + "learning_rate": 2.6882e-07, + "loss": 0.1288, + "step": 13450 + }, + { + "epoch": 6.290849673202614, + "grad_norm": 3.951446533203125, + "learning_rate": 2.6932e-07, + "loss": 0.1213, + "step": 13475 + }, + { + "epoch": 6.302521008403361, + "grad_norm": 8.137782096862793, + "learning_rate": 2.6982e-07, + "loss": 0.1228, + "step": 13500 + }, + { + "epoch": 6.314192343604108, + "grad_norm": 8.63837718963623, + "learning_rate": 2.7032e-07, + "loss": 0.1414, + "step": 13525 + }, + { + "epoch": 6.325863678804855, + "grad_norm": 9.500225067138672, + "learning_rate": 2.7082e-07, + "loss": 0.1141, + "step": 13550 + }, + { + "epoch": 6.337535014005602, + "grad_norm": 5.4421844482421875, + "learning_rate": 2.7131999999999997e-07, + "loss": 0.1213, + "step": 13575 + }, + { + "epoch": 6.349206349206349, + "grad_norm": 7.188438892364502, + "learning_rate": 2.7182e-07, + "loss": 0.1235, + "step": 13600 + }, + { + "epoch": 6.360877684407096, + "grad_norm": 5.949901103973389, + "learning_rate": 2.7232e-07, + "loss": 0.1372, + "step": 13625 + }, + { + "epoch": 6.372549019607844, + "grad_norm": 11.207901000976562, + "learning_rate": 2.7282e-07, + "loss": 0.1236, + "step": 13650 + }, + { + "epoch": 6.3842203548085905, + "grad_norm": 6.0445122718811035, + "learning_rate": 2.7332e-07, + "loss": 0.1323, + "step": 13675 + }, + { + "epoch": 6.395891690009337, + "grad_norm": 11.870309829711914, + "learning_rate": 2.7382e-07, + "loss": 0.1236, + "step": 13700 + }, + { + "epoch": 6.407563025210084, + "grad_norm": 7.774009704589844, + "learning_rate": 2.7432e-07, + "loss": 0.1373, + "step": 13725 + }, + { + "epoch": 6.419234360410831, + "grad_norm": 6.658696174621582, + "learning_rate": 2.7482e-07, + "loss": 0.1348, + "step": 13750 + }, + { + "epoch": 6.430905695611578, + "grad_norm": 5.360461711883545, + "learning_rate": 2.7532e-07, + "loss": 0.1523, + "step": 13775 + }, + { + "epoch": 6.442577030812325, + "grad_norm": 11.454927444458008, + "learning_rate": 2.7582e-07, + "loss": 0.1448, + "step": 13800 + }, + { + "epoch": 6.454248366013072, + "grad_norm": 3.2537364959716797, + "learning_rate": 2.7632e-07, + "loss": 0.1094, + "step": 13825 + }, + { + "epoch": 6.465919701213819, + "grad_norm": 8.776263236999512, + "learning_rate": 2.7682e-07, + "loss": 0.1164, + "step": 13850 + }, + { + "epoch": 6.477591036414566, + "grad_norm": 6.700248718261719, + "learning_rate": 2.7732e-07, + "loss": 0.1287, + "step": 13875 + }, + { + "epoch": 6.489262371615313, + "grad_norm": 9.243896484375, + "learning_rate": 2.7782e-07, + "loss": 0.1346, + "step": 13900 + }, + { + "epoch": 6.50093370681606, + "grad_norm": 8.710789680480957, + "learning_rate": 2.7832e-07, + "loss": 0.1463, + "step": 13925 + }, + { + "epoch": 6.512605042016807, + "grad_norm": 10.246273040771484, + "learning_rate": 2.7882e-07, + "loss": 0.1217, + "step": 13950 + }, + { + "epoch": 6.524276377217554, + "grad_norm": 4.089282512664795, + "learning_rate": 2.7932e-07, + "loss": 0.1368, + "step": 13975 + }, + { + "epoch": 6.5359477124183005, + "grad_norm": 14.169453620910645, + "learning_rate": 2.7982000000000003e-07, + "loss": 0.1105, + "step": 14000 + }, + { + "epoch": 6.5476190476190474, + "grad_norm": 6.04651403427124, + "learning_rate": 2.8032e-07, + "loss": 0.1318, + "step": 14025 + }, + { + "epoch": 6.559290382819794, + "grad_norm": 9.370837211608887, + "learning_rate": 2.8082e-07, + "loss": 0.1282, + "step": 14050 + }, + { + "epoch": 6.570961718020541, + "grad_norm": 4.370868682861328, + "learning_rate": 2.8132e-07, + "loss": 0.144, + "step": 14075 + }, + { + "epoch": 6.582633053221288, + "grad_norm": 9.317498207092285, + "learning_rate": 2.8182e-07, + "loss": 0.1233, + "step": 14100 + }, + { + "epoch": 6.594304388422035, + "grad_norm": 2.9422969818115234, + "learning_rate": 2.8232e-07, + "loss": 0.1428, + "step": 14125 + }, + { + "epoch": 6.605975723622782, + "grad_norm": 12.039034843444824, + "learning_rate": 2.8282e-07, + "loss": 0.1146, + "step": 14150 + }, + { + "epoch": 6.617647058823529, + "grad_norm": 4.379167556762695, + "learning_rate": 2.8332e-07, + "loss": 0.131, + "step": 14175 + }, + { + "epoch": 6.629318394024276, + "grad_norm": 9.709012031555176, + "learning_rate": 2.8382e-07, + "loss": 0.1159, + "step": 14200 + }, + { + "epoch": 6.640989729225024, + "grad_norm": 8.104528427124023, + "learning_rate": 2.8432e-07, + "loss": 0.137, + "step": 14225 + }, + { + "epoch": 6.652661064425771, + "grad_norm": 12.878413200378418, + "learning_rate": 2.8482e-07, + "loss": 0.1048, + "step": 14250 + }, + { + "epoch": 6.664332399626518, + "grad_norm": 3.268336057662964, + "learning_rate": 2.8532e-07, + "loss": 0.1471, + "step": 14275 + }, + { + "epoch": 6.6760037348272645, + "grad_norm": 11.308536529541016, + "learning_rate": 2.8582e-07, + "loss": 0.123, + "step": 14300 + }, + { + "epoch": 6.687675070028011, + "grad_norm": 5.743576526641846, + "learning_rate": 2.8632e-07, + "loss": 0.1277, + "step": 14325 + }, + { + "epoch": 6.699346405228758, + "grad_norm": 6.817793369293213, + "learning_rate": 2.8682e-07, + "loss": 0.1313, + "step": 14350 + }, + { + "epoch": 6.711017740429505, + "grad_norm": 3.572624921798706, + "learning_rate": 2.8732e-07, + "loss": 0.121, + "step": 14375 + }, + { + "epoch": 6.722689075630252, + "grad_norm": 8.181254386901855, + "learning_rate": 2.8782e-07, + "loss": 0.1158, + "step": 14400 + }, + { + "epoch": 6.734360410830999, + "grad_norm": 4.736342906951904, + "learning_rate": 2.8832000000000003e-07, + "loss": 0.15, + "step": 14425 + }, + { + "epoch": 6.746031746031746, + "grad_norm": 18.210702896118164, + "learning_rate": 2.8882e-07, + "loss": 0.1373, + "step": 14450 + }, + { + "epoch": 6.757703081232493, + "grad_norm": 5.613450050354004, + "learning_rate": 2.8932e-07, + "loss": 0.148, + "step": 14475 + }, + { + "epoch": 6.76937441643324, + "grad_norm": 11.278425216674805, + "learning_rate": 2.8982e-07, + "loss": 0.1255, + "step": 14500 + }, + { + "epoch": 6.781045751633987, + "grad_norm": 3.2928617000579834, + "learning_rate": 2.9032e-07, + "loss": 0.1288, + "step": 14525 + }, + { + "epoch": 6.792717086834734, + "grad_norm": 12.555643081665039, + "learning_rate": 2.9082e-07, + "loss": 0.107, + "step": 14550 + }, + { + "epoch": 6.804388422035481, + "grad_norm": 4.838390350341797, + "learning_rate": 2.9132e-07, + "loss": 0.1224, + "step": 14575 + }, + { + "epoch": 6.816059757236228, + "grad_norm": 11.363154411315918, + "learning_rate": 2.9182000000000003e-07, + "loss": 0.1344, + "step": 14600 + }, + { + "epoch": 6.8277310924369745, + "grad_norm": 2.745389223098755, + "learning_rate": 2.9232e-07, + "loss": 0.1456, + "step": 14625 + }, + { + "epoch": 6.839402427637721, + "grad_norm": 11.974946975708008, + "learning_rate": 2.9282e-07, + "loss": 0.1163, + "step": 14650 + }, + { + "epoch": 6.851073762838468, + "grad_norm": 3.53490948677063, + "learning_rate": 2.9332000000000004e-07, + "loss": 0.1255, + "step": 14675 + }, + { + "epoch": 6.862745098039216, + "grad_norm": 8.966546058654785, + "learning_rate": 2.9382e-07, + "loss": 0.1198, + "step": 14700 + }, + { + "epoch": 6.874416433239963, + "grad_norm": 5.7963480949401855, + "learning_rate": 2.9432e-07, + "loss": 0.1321, + "step": 14725 + }, + { + "epoch": 6.88608776844071, + "grad_norm": 13.834965705871582, + "learning_rate": 2.9482e-07, + "loss": 0.1356, + "step": 14750 + }, + { + "epoch": 6.897759103641457, + "grad_norm": 4.368019104003906, + "learning_rate": 2.9532000000000003e-07, + "loss": 0.1247, + "step": 14775 + }, + { + "epoch": 6.909430438842204, + "grad_norm": 6.511091232299805, + "learning_rate": 2.9582e-07, + "loss": 0.1207, + "step": 14800 + }, + { + "epoch": 6.921101774042951, + "grad_norm": 4.450834274291992, + "learning_rate": 2.9631999999999994e-07, + "loss": 0.1333, + "step": 14825 + }, + { + "epoch": 6.932773109243698, + "grad_norm": 7.795094013214111, + "learning_rate": 2.9682e-07, + "loss": 0.1144, + "step": 14850 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 6.080096244812012, + "learning_rate": 2.9731999999999997e-07, + "loss": 0.1518, + "step": 14875 + }, + { + "epoch": 6.956115779645192, + "grad_norm": 7.597021102905273, + "learning_rate": 2.9781999999999995e-07, + "loss": 0.1385, + "step": 14900 + }, + { + "epoch": 6.9677871148459385, + "grad_norm": 4.243095397949219, + "learning_rate": 2.9831999999999993e-07, + "loss": 0.1303, + "step": 14925 + }, + { + "epoch": 6.979458450046685, + "grad_norm": 10.551504135131836, + "learning_rate": 2.9881999999999997e-07, + "loss": 0.1364, + "step": 14950 + }, + { + "epoch": 6.991129785247432, + "grad_norm": 4.460564136505127, + "learning_rate": 2.9931999999999996e-07, + "loss": 0.1185, + "step": 14975 + }, + { + "epoch": 7.002801120448179, + "grad_norm": 5.397023677825928, + "learning_rate": 2.9981999999999994e-07, + "loss": 0.149, + "step": 15000 + }, + { + "epoch": 7.002801120448179, + "eval_loss": 0.14280347526073456, + "eval_runtime": 6589.115, + "eval_samples_per_second": 1.429, + "eval_steps_per_second": 0.179, + "eval_wer": 0.09492975940578072, + "step": 15000 + }, + { + "epoch": 7.014472455648926, + "grad_norm": 5.963614463806152, + "learning_rate": 6.006e-07, + "loss": 0.1013, + "step": 15025 + }, + { + "epoch": 7.026143790849673, + "grad_norm": 3.1698148250579834, + "learning_rate": 6.015599999999999e-07, + "loss": 0.1194, + "step": 15050 + }, + { + "epoch": 7.03781512605042, + "grad_norm": 6.9241180419921875, + "learning_rate": 6.025599999999999e-07, + "loss": 0.0989, + "step": 15075 + }, + { + "epoch": 7.049486461251167, + "grad_norm": 8.083003044128418, + "learning_rate": 6.0356e-07, + "loss": 0.1198, + "step": 15100 + }, + { + "epoch": 7.061157796451914, + "grad_norm": 9.302962303161621, + "learning_rate": 6.0456e-07, + "loss": 0.0863, + "step": 15125 + }, + { + "epoch": 7.072829131652661, + "grad_norm": 3.6642816066741943, + "learning_rate": 6.055599999999999e-07, + "loss": 0.1192, + "step": 15150 + }, + { + "epoch": 7.084500466853408, + "grad_norm": 5.131696701049805, + "learning_rate": 6.0656e-07, + "loss": 0.0944, + "step": 15175 + }, + { + "epoch": 7.096171802054155, + "grad_norm": 5.429873466491699, + "learning_rate": 6.0756e-07, + "loss": 0.1432, + "step": 15200 + }, + { + "epoch": 7.107843137254902, + "grad_norm": 2.794274091720581, + "learning_rate": 6.085599999999999e-07, + "loss": 0.0996, + "step": 15225 + }, + { + "epoch": 7.1195144724556485, + "grad_norm": 4.206586837768555, + "learning_rate": 6.0956e-07, + "loss": 0.1557, + "step": 15250 + }, + { + "epoch": 7.131185807656396, + "grad_norm": 4.8860087394714355, + "learning_rate": 6.1056e-07, + "loss": 0.0954, + "step": 15275 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 5.189944744110107, + "learning_rate": 6.1156e-07, + "loss": 0.1637, + "step": 15300 + }, + { + "epoch": 7.15452847805789, + "grad_norm": 6.401843070983887, + "learning_rate": 6.125599999999999e-07, + "loss": 0.1137, + "step": 15325 + }, + { + "epoch": 7.166199813258637, + "grad_norm": 3.2334303855895996, + "learning_rate": 6.1356e-07, + "loss": 0.1411, + "step": 15350 + }, + { + "epoch": 7.177871148459384, + "grad_norm": 5.686134338378906, + "learning_rate": 6.1456e-07, + "loss": 0.0959, + "step": 15375 + }, + { + "epoch": 7.189542483660131, + "grad_norm": 5.280776023864746, + "learning_rate": 6.155599999999999e-07, + "loss": 0.1254, + "step": 15400 + }, + { + "epoch": 7.201213818860878, + "grad_norm": 10.093783378601074, + "learning_rate": 6.1656e-07, + "loss": 0.1124, + "step": 15425 + }, + { + "epoch": 7.212885154061625, + "grad_norm": 4.502685546875, + "learning_rate": 6.1756e-07, + "loss": 0.1333, + "step": 15450 + }, + { + "epoch": 7.224556489262372, + "grad_norm": 6.842624664306641, + "learning_rate": 6.1856e-07, + "loss": 0.1003, + "step": 15475 + }, + { + "epoch": 7.236227824463119, + "grad_norm": 4.324547290802002, + "learning_rate": 6.1956e-07, + "loss": 0.1222, + "step": 15500 + }, + { + "epoch": 7.2478991596638656, + "grad_norm": 5.093228340148926, + "learning_rate": 6.2056e-07, + "loss": 0.1144, + "step": 15525 + }, + { + "epoch": 7.2595704948646125, + "grad_norm": 4.086531639099121, + "learning_rate": 6.2156e-07, + "loss": 0.1592, + "step": 15550 + }, + { + "epoch": 7.271241830065359, + "grad_norm": 5.517257213592529, + "learning_rate": 6.225599999999999e-07, + "loss": 0.1114, + "step": 15575 + }, + { + "epoch": 7.282913165266106, + "grad_norm": 3.0472617149353027, + "learning_rate": 6.2356e-07, + "loss": 0.1257, + "step": 15600 + }, + { + "epoch": 7.294584500466853, + "grad_norm": 5.590120315551758, + "learning_rate": 6.2456e-07, + "loss": 0.1051, + "step": 15625 + }, + { + "epoch": 7.3062558356676, + "grad_norm": 4.192562103271484, + "learning_rate": 6.255599999999999e-07, + "loss": 0.1385, + "step": 15650 + }, + { + "epoch": 7.317927170868347, + "grad_norm": 6.378529071807861, + "learning_rate": 6.2656e-07, + "loss": 0.098, + "step": 15675 + }, + { + "epoch": 7.329598506069094, + "grad_norm": 3.6979291439056396, + "learning_rate": 6.2756e-07, + "loss": 0.1496, + "step": 15700 + }, + { + "epoch": 7.341269841269841, + "grad_norm": 5.2295122146606445, + "learning_rate": 6.2856e-07, + "loss": 0.0779, + "step": 15725 + }, + { + "epoch": 7.352941176470588, + "grad_norm": 3.9444265365600586, + "learning_rate": 6.295599999999999e-07, + "loss": 0.1394, + "step": 15750 + }, + { + "epoch": 7.364612511671335, + "grad_norm": 5.075552463531494, + "learning_rate": 6.3056e-07, + "loss": 0.1025, + "step": 15775 + }, + { + "epoch": 7.376283846872083, + "grad_norm": 4.6857733726501465, + "learning_rate": 6.3156e-07, + "loss": 0.149, + "step": 15800 + }, + { + "epoch": 7.3879551820728295, + "grad_norm": 3.647244930267334, + "learning_rate": 6.325599999999999e-07, + "loss": 0.0909, + "step": 15825 + }, + { + "epoch": 7.3996265172735765, + "grad_norm": 5.009864330291748, + "learning_rate": 6.3356e-07, + "loss": 0.1427, + "step": 15850 + }, + { + "epoch": 7.411297852474323, + "grad_norm": 7.3696513175964355, + "learning_rate": 6.3456e-07, + "loss": 0.1056, + "step": 15875 + }, + { + "epoch": 7.42296918767507, + "grad_norm": 5.746430397033691, + "learning_rate": 6.3556e-07, + "loss": 0.1283, + "step": 15900 + }, + { + "epoch": 7.434640522875817, + "grad_norm": 6.898996353149414, + "learning_rate": 6.3656e-07, + "loss": 0.0922, + "step": 15925 + }, + { + "epoch": 7.446311858076564, + "grad_norm": 4.499526023864746, + "learning_rate": 6.3756e-07, + "loss": 0.1294, + "step": 15950 + }, + { + "epoch": 7.457983193277311, + "grad_norm": 10.639655113220215, + "learning_rate": 6.3856e-07, + "loss": 0.108, + "step": 15975 + }, + { + "epoch": 7.469654528478058, + "grad_norm": 4.458117485046387, + "learning_rate": 6.395599999999999e-07, + "loss": 0.1426, + "step": 16000 + }, + { + "epoch": 7.481325863678805, + "grad_norm": 6.026330947875977, + "learning_rate": 6.4056e-07, + "loss": 0.1027, + "step": 16025 + }, + { + "epoch": 7.492997198879552, + "grad_norm": 8.034396171569824, + "learning_rate": 6.4156e-07, + "loss": 0.1368, + "step": 16050 + }, + { + "epoch": 7.504668534080299, + "grad_norm": 6.156011581420898, + "learning_rate": 6.4256e-07, + "loss": 0.1084, + "step": 16075 + }, + { + "epoch": 7.516339869281046, + "grad_norm": 4.8642168045043945, + "learning_rate": 6.4356e-07, + "loss": 0.1383, + "step": 16100 + }, + { + "epoch": 7.528011204481793, + "grad_norm": 6.108960151672363, + "learning_rate": 6.4456e-07, + "loss": 0.101, + "step": 16125 + }, + { + "epoch": 7.5396825396825395, + "grad_norm": 4.260036945343018, + "learning_rate": 6.4556e-07, + "loss": 0.1384, + "step": 16150 + }, + { + "epoch": 7.5513538748832865, + "grad_norm": 7.994537353515625, + "learning_rate": 6.4656e-07, + "loss": 0.1009, + "step": 16175 + }, + { + "epoch": 7.563025210084033, + "grad_norm": 5.753657817840576, + "learning_rate": 6.4756e-07, + "loss": 0.1383, + "step": 16200 + }, + { + "epoch": 7.57469654528478, + "grad_norm": 7.363603591918945, + "learning_rate": 6.4856e-07, + "loss": 0.1059, + "step": 16225 + }, + { + "epoch": 7.586367880485527, + "grad_norm": 5.041802883148193, + "learning_rate": 6.4956e-07, + "loss": 0.1257, + "step": 16250 + }, + { + "epoch": 7.598039215686274, + "grad_norm": 6.2316575050354, + "learning_rate": 6.5056e-07, + "loss": 0.1008, + "step": 16275 + }, + { + "epoch": 7.609710550887021, + "grad_norm": 3.6404504776000977, + "learning_rate": 6.5156e-07, + "loss": 0.1255, + "step": 16300 + }, + { + "epoch": 7.621381886087768, + "grad_norm": 5.228316783905029, + "learning_rate": 6.5256e-07, + "loss": 0.0888, + "step": 16325 + }, + { + "epoch": 7.633053221288515, + "grad_norm": 4.744984149932861, + "learning_rate": 6.5356e-07, + "loss": 0.1435, + "step": 16350 + }, + { + "epoch": 7.644724556489263, + "grad_norm": 6.733789443969727, + "learning_rate": 6.5456e-07, + "loss": 0.1008, + "step": 16375 + }, + { + "epoch": 7.65639589169001, + "grad_norm": 4.16718864440918, + "learning_rate": 6.5556e-07, + "loss": 0.1194, + "step": 16400 + }, + { + "epoch": 7.668067226890757, + "grad_norm": 2.9225594997406006, + "learning_rate": 6.5656e-07, + "loss": 0.0953, + "step": 16425 + }, + { + "epoch": 7.6797385620915035, + "grad_norm": 5.475734233856201, + "learning_rate": 6.5756e-07, + "loss": 0.1403, + "step": 16450 + }, + { + "epoch": 7.69140989729225, + "grad_norm": 5.1282477378845215, + "learning_rate": 6.5856e-07, + "loss": 0.1052, + "step": 16475 + }, + { + "epoch": 7.703081232492997, + "grad_norm": 5.281869411468506, + "learning_rate": 6.5956e-07, + "loss": 0.1438, + "step": 16500 + }, + { + "epoch": 7.714752567693744, + "grad_norm": 5.311507225036621, + "learning_rate": 6.6056e-07, + "loss": 0.1021, + "step": 16525 + }, + { + "epoch": 7.726423902894491, + "grad_norm": 3.4866130352020264, + "learning_rate": 6.6156e-07, + "loss": 0.1216, + "step": 16550 + }, + { + "epoch": 7.738095238095238, + "grad_norm": 9.126893997192383, + "learning_rate": 6.6256e-07, + "loss": 0.0901, + "step": 16575 + }, + { + "epoch": 7.749766573295985, + "grad_norm": 4.056077003479004, + "learning_rate": 6.6356e-07, + "loss": 0.1385, + "step": 16600 + }, + { + "epoch": 7.761437908496732, + "grad_norm": 11.753646850585938, + "learning_rate": 6.6456e-07, + "loss": 0.0845, + "step": 16625 + }, + { + "epoch": 7.773109243697479, + "grad_norm": 5.014488697052002, + "learning_rate": 6.6556e-07, + "loss": 0.1224, + "step": 16650 + }, + { + "epoch": 7.784780578898226, + "grad_norm": 6.308728218078613, + "learning_rate": 6.665600000000001e-07, + "loss": 0.0885, + "step": 16675 + }, + { + "epoch": 7.796451914098973, + "grad_norm": 2.8483879566192627, + "learning_rate": 6.6756e-07, + "loss": 0.1288, + "step": 16700 + }, + { + "epoch": 7.80812324929972, + "grad_norm": 8.292418479919434, + "learning_rate": 6.6856e-07, + "loss": 0.095, + "step": 16725 + }, + { + "epoch": 7.819794584500467, + "grad_norm": 4.9927263259887695, + "learning_rate": 6.6956e-07, + "loss": 0.1085, + "step": 16750 + }, + { + "epoch": 7.8314659197012135, + "grad_norm": 14.002336502075195, + "learning_rate": 6.7056e-07, + "loss": 0.0862, + "step": 16775 + }, + { + "epoch": 7.8431372549019605, + "grad_norm": 5.258606433868408, + "learning_rate": 6.7156e-07, + "loss": 0.1349, + "step": 16800 + }, + { + "epoch": 7.854808590102707, + "grad_norm": 6.183353900909424, + "learning_rate": 6.7256e-07, + "loss": 0.0962, + "step": 16825 + }, + { + "epoch": 7.866479925303455, + "grad_norm": 4.468369960784912, + "learning_rate": 6.735600000000001e-07, + "loss": 0.1222, + "step": 16850 + }, + { + "epoch": 7.878151260504202, + "grad_norm": 5.605790615081787, + "learning_rate": 6.7456e-07, + "loss": 0.1022, + "step": 16875 + }, + { + "epoch": 7.889822595704949, + "grad_norm": 3.859760284423828, + "learning_rate": 6.7556e-07, + "loss": 0.114, + "step": 16900 + }, + { + "epoch": 7.901493930905696, + "grad_norm": 7.612791061401367, + "learning_rate": 6.765600000000001e-07, + "loss": 0.0984, + "step": 16925 + }, + { + "epoch": 7.913165266106443, + "grad_norm": 3.680158853530884, + "learning_rate": 6.7756e-07, + "loss": 0.1247, + "step": 16950 + }, + { + "epoch": 7.92483660130719, + "grad_norm": 4.414365768432617, + "learning_rate": 6.7856e-07, + "loss": 0.0998, + "step": 16975 + }, + { + "epoch": 7.936507936507937, + "grad_norm": 4.558730125427246, + "learning_rate": 6.7956e-07, + "loss": 0.1391, + "step": 17000 + }, + { + "epoch": 7.948179271708684, + "grad_norm": 5.9088053703308105, + "learning_rate": 6.805600000000001e-07, + "loss": 0.0958, + "step": 17025 + }, + { + "epoch": 7.959850606909431, + "grad_norm": 4.314589977264404, + "learning_rate": 6.8152e-07, + "loss": 0.1399, + "step": 17050 + }, + { + "epoch": 7.9715219421101775, + "grad_norm": 6.758405685424805, + "learning_rate": 6.825199999999999e-07, + "loss": 0.1045, + "step": 17075 + }, + { + "epoch": 7.983193277310924, + "grad_norm": 5.049168109893799, + "learning_rate": 6.8352e-07, + "loss": 0.1305, + "step": 17100 + }, + { + "epoch": 7.994864612511671, + "grad_norm": 5.647031784057617, + "learning_rate": 6.8452e-07, + "loss": 0.1029, + "step": 17125 + }, + { + "epoch": 8.006535947712418, + "grad_norm": 2.389552354812622, + "learning_rate": 6.8552e-07, + "loss": 0.1187, + "step": 17150 + }, + { + "epoch": 8.018207282913165, + "grad_norm": 7.662734508514404, + "learning_rate": 6.8652e-07, + "loss": 0.0784, + "step": 17175 + }, + { + "epoch": 8.029878618113912, + "grad_norm": 4.091869354248047, + "learning_rate": 6.8752e-07, + "loss": 0.1117, + "step": 17200 + }, + { + "epoch": 8.041549953314659, + "grad_norm": 4.1873459815979, + "learning_rate": 6.8852e-07, + "loss": 0.0841, + "step": 17225 + }, + { + "epoch": 8.053221288515406, + "grad_norm": 1.8552043437957764, + "learning_rate": 6.895199999999999e-07, + "loss": 0.1085, + "step": 17250 + }, + { + "epoch": 8.064892623716153, + "grad_norm": 6.734919548034668, + "learning_rate": 6.9052e-07, + "loss": 0.0705, + "step": 17275 + }, + { + "epoch": 8.0765639589169, + "grad_norm": 5.204643726348877, + "learning_rate": 6.9152e-07, + "loss": 0.1131, + "step": 17300 + }, + { + "epoch": 8.088235294117647, + "grad_norm": 8.857121467590332, + "learning_rate": 6.9252e-07, + "loss": 0.0759, + "step": 17325 + }, + { + "epoch": 8.099906629318394, + "grad_norm": 3.829099655151367, + "learning_rate": 6.9352e-07, + "loss": 0.1171, + "step": 17350 + }, + { + "epoch": 8.11157796451914, + "grad_norm": 5.418254852294922, + "learning_rate": 6.9452e-07, + "loss": 0.0744, + "step": 17375 + }, + { + "epoch": 8.123249299719888, + "grad_norm": 4.435729026794434, + "learning_rate": 6.9552e-07, + "loss": 0.1098, + "step": 17400 + }, + { + "epoch": 8.134920634920634, + "grad_norm": 8.937211036682129, + "learning_rate": 6.9652e-07, + "loss": 0.0684, + "step": 17425 + }, + { + "epoch": 8.146591970121381, + "grad_norm": 4.719937801361084, + "learning_rate": 6.9752e-07, + "loss": 0.1071, + "step": 17450 + }, + { + "epoch": 8.158263305322128, + "grad_norm": 5.891451835632324, + "learning_rate": 6.9852e-07, + "loss": 0.0652, + "step": 17475 + }, + { + "epoch": 8.169934640522875, + "grad_norm": 2.8404786586761475, + "learning_rate": 6.9952e-07, + "loss": 0.0993, + "step": 17500 + }, + { + "epoch": 8.181605975723622, + "grad_norm": 6.352357387542725, + "learning_rate": 7.0052e-07, + "loss": 0.0816, + "step": 17525 + }, + { + "epoch": 8.193277310924369, + "grad_norm": 4.22411584854126, + "learning_rate": 7.0152e-07, + "loss": 0.1167, + "step": 17550 + }, + { + "epoch": 8.204948646125116, + "grad_norm": 7.104822158813477, + "learning_rate": 7.0252e-07, + "loss": 0.0873, + "step": 17575 + }, + { + "epoch": 8.216619981325863, + "grad_norm": 2.5561118125915527, + "learning_rate": 7.0352e-07, + "loss": 0.1045, + "step": 17600 + }, + { + "epoch": 8.22829131652661, + "grad_norm": 8.497262001037598, + "learning_rate": 7.0452e-07, + "loss": 0.0772, + "step": 17625 + }, + { + "epoch": 8.239962651727357, + "grad_norm": 4.319903373718262, + "learning_rate": 7.0552e-07, + "loss": 0.1291, + "step": 17650 + }, + { + "epoch": 8.251633986928105, + "grad_norm": 5.6005754470825195, + "learning_rate": 7.065200000000001e-07, + "loss": 0.0748, + "step": 17675 + }, + { + "epoch": 8.263305322128852, + "grad_norm": 4.352419853210449, + "learning_rate": 7.0752e-07, + "loss": 0.1162, + "step": 17700 + }, + { + "epoch": 8.2749766573296, + "grad_norm": 3.519353151321411, + "learning_rate": 7.0852e-07, + "loss": 0.0731, + "step": 17725 + }, + { + "epoch": 8.286647992530346, + "grad_norm": 4.001583576202393, + "learning_rate": 7.0952e-07, + "loss": 0.1132, + "step": 17750 + }, + { + "epoch": 8.298319327731093, + "grad_norm": 7.335994720458984, + "learning_rate": 7.1052e-07, + "loss": 0.0744, + "step": 17775 + }, + { + "epoch": 8.30999066293184, + "grad_norm": 4.729821681976318, + "learning_rate": 7.1152e-07, + "loss": 0.1145, + "step": 17800 + }, + { + "epoch": 8.321661998132587, + "grad_norm": 4.817710876464844, + "learning_rate": 7.1252e-07, + "loss": 0.0684, + "step": 17825 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 3.727107524871826, + "learning_rate": 7.1352e-07, + "loss": 0.1014, + "step": 17850 + }, + { + "epoch": 8.34500466853408, + "grad_norm": 6.569036960601807, + "learning_rate": 7.1452e-07, + "loss": 0.0809, + "step": 17875 + }, + { + "epoch": 8.356676003734828, + "grad_norm": 3.9731063842773438, + "learning_rate": 7.1552e-07, + "loss": 0.116, + "step": 17900 + }, + { + "epoch": 8.368347338935575, + "grad_norm": 5.585522174835205, + "learning_rate": 7.165200000000001e-07, + "loss": 0.0708, + "step": 17925 + }, + { + "epoch": 8.380018674136322, + "grad_norm": 4.29518461227417, + "learning_rate": 7.1752e-07, + "loss": 0.1187, + "step": 17950 + }, + { + "epoch": 8.391690009337069, + "grad_norm": 5.51121187210083, + "learning_rate": 7.1852e-07, + "loss": 0.0718, + "step": 17975 + }, + { + "epoch": 8.403361344537815, + "grad_norm": 3.5351364612579346, + "learning_rate": 7.1952e-07, + "loss": 0.1168, + "step": 18000 + }, + { + "epoch": 8.415032679738562, + "grad_norm": 4.3715128898620605, + "learning_rate": 7.2052e-07, + "loss": 0.079, + "step": 18025 + }, + { + "epoch": 8.42670401493931, + "grad_norm": 3.5346896648406982, + "learning_rate": 7.2152e-07, + "loss": 0.1288, + "step": 18050 + }, + { + "epoch": 8.438375350140056, + "grad_norm": 6.278663635253906, + "learning_rate": 7.2252e-07, + "loss": 0.0774, + "step": 18075 + }, + { + "epoch": 8.450046685340803, + "grad_norm": 4.804433345794678, + "learning_rate": 7.235200000000001e-07, + "loss": 0.1094, + "step": 18100 + }, + { + "epoch": 8.46171802054155, + "grad_norm": 3.196178436279297, + "learning_rate": 7.2452e-07, + "loss": 0.0775, + "step": 18125 + }, + { + "epoch": 8.473389355742297, + "grad_norm": 3.008148193359375, + "learning_rate": 7.2552e-07, + "loss": 0.1186, + "step": 18150 + }, + { + "epoch": 8.485060690943044, + "grad_norm": 6.987017631530762, + "learning_rate": 7.2652e-07, + "loss": 0.0705, + "step": 18175 + }, + { + "epoch": 8.49673202614379, + "grad_norm": 3.8024909496307373, + "learning_rate": 7.275199999999999e-07, + "loss": 0.116, + "step": 18200 + }, + { + "epoch": 8.508403361344538, + "grad_norm": 6.151003837585449, + "learning_rate": 7.285199999999999e-07, + "loss": 0.0701, + "step": 18225 + }, + { + "epoch": 8.520074696545285, + "grad_norm": 1.7352893352508545, + "learning_rate": 7.295199999999999e-07, + "loss": 0.114, + "step": 18250 + }, + { + "epoch": 8.531746031746032, + "grad_norm": 7.302786827087402, + "learning_rate": 7.3052e-07, + "loss": 0.0665, + "step": 18275 + }, + { + "epoch": 8.543417366946779, + "grad_norm": 3.3787872791290283, + "learning_rate": 7.315199999999999e-07, + "loss": 0.1052, + "step": 18300 + }, + { + "epoch": 8.555088702147525, + "grad_norm": 6.7813920974731445, + "learning_rate": 7.325199999999999e-07, + "loss": 0.0875, + "step": 18325 + }, + { + "epoch": 8.566760037348272, + "grad_norm": 6.053928375244141, + "learning_rate": 7.3352e-07, + "loss": 0.1131, + "step": 18350 + }, + { + "epoch": 8.57843137254902, + "grad_norm": 11.57036018371582, + "learning_rate": 7.345199999999999e-07, + "loss": 0.0769, + "step": 18375 + }, + { + "epoch": 8.590102707749766, + "grad_norm": 3.816927671432495, + "learning_rate": 7.355199999999999e-07, + "loss": 0.1181, + "step": 18400 + }, + { + "epoch": 8.601774042950513, + "grad_norm": 4.542242527008057, + "learning_rate": 7.365199999999999e-07, + "loss": 0.0829, + "step": 18425 + }, + { + "epoch": 8.61344537815126, + "grad_norm": 3.25649356842041, + "learning_rate": 7.3752e-07, + "loss": 0.1144, + "step": 18450 + }, + { + "epoch": 8.625116713352007, + "grad_norm": 7.228991508483887, + "learning_rate": 7.385199999999999e-07, + "loss": 0.0807, + "step": 18475 + }, + { + "epoch": 8.636788048552754, + "grad_norm": 2.458822727203369, + "learning_rate": 7.395199999999999e-07, + "loss": 0.1036, + "step": 18500 + }, + { + "epoch": 8.6484593837535, + "grad_norm": 6.6406474113464355, + "learning_rate": 7.4052e-07, + "loss": 0.0792, + "step": 18525 + }, + { + "epoch": 8.660130718954248, + "grad_norm": 6.3346099853515625, + "learning_rate": 7.415199999999999e-07, + "loss": 0.1265, + "step": 18550 + }, + { + "epoch": 8.671802054154995, + "grad_norm": 5.170047760009766, + "learning_rate": 7.425199999999999e-07, + "loss": 0.0687, + "step": 18575 + }, + { + "epoch": 8.683473389355742, + "grad_norm": 5.245858669281006, + "learning_rate": 7.4352e-07, + "loss": 0.1178, + "step": 18600 + }, + { + "epoch": 8.695144724556489, + "grad_norm": 5.291413307189941, + "learning_rate": 7.445199999999999e-07, + "loss": 0.0817, + "step": 18625 + }, + { + "epoch": 8.706816059757235, + "grad_norm": 4.81880521774292, + "learning_rate": 7.455199999999999e-07, + "loss": 0.1066, + "step": 18650 + }, + { + "epoch": 8.718487394957982, + "grad_norm": 2.500437021255493, + "learning_rate": 7.465199999999999e-07, + "loss": 0.0817, + "step": 18675 + }, + { + "epoch": 8.73015873015873, + "grad_norm": 3.1665596961975098, + "learning_rate": 7.4752e-07, + "loss": 0.0836, + "step": 18700 + }, + { + "epoch": 8.741830065359476, + "grad_norm": 4.715977668762207, + "learning_rate": 7.485199999999999e-07, + "loss": 0.0695, + "step": 18725 + }, + { + "epoch": 8.753501400560225, + "grad_norm": 3.079907178878784, + "learning_rate": 7.495199999999999e-07, + "loss": 0.1352, + "step": 18750 + }, + { + "epoch": 8.76517273576097, + "grad_norm": 7.7203288078308105, + "learning_rate": 7.5052e-07, + "loss": 0.0754, + "step": 18775 + }, + { + "epoch": 8.776844070961719, + "grad_norm": 5.75888204574585, + "learning_rate": 7.515199999999999e-07, + "loss": 0.1045, + "step": 18800 + }, + { + "epoch": 8.788515406162466, + "grad_norm": 9.753093719482422, + "learning_rate": 7.525199999999999e-07, + "loss": 0.0754, + "step": 18825 + }, + { + "epoch": 8.800186741363213, + "grad_norm": 4.152544975280762, + "learning_rate": 7.535199999999999e-07, + "loss": 0.115, + "step": 18850 + }, + { + "epoch": 8.81185807656396, + "grad_norm": 6.212908744812012, + "learning_rate": 7.5452e-07, + "loss": 0.0836, + "step": 18875 + }, + { + "epoch": 8.823529411764707, + "grad_norm": 4.721496105194092, + "learning_rate": 7.555199999999999e-07, + "loss": 0.117, + "step": 18900 + }, + { + "epoch": 8.835200746965453, + "grad_norm": 8.515325546264648, + "learning_rate": 7.565199999999999e-07, + "loss": 0.0852, + "step": 18925 + }, + { + "epoch": 8.8468720821662, + "grad_norm": 5.124463081359863, + "learning_rate": 7.5752e-07, + "loss": 0.1089, + "step": 18950 + }, + { + "epoch": 8.858543417366947, + "grad_norm": 10.328991889953613, + "learning_rate": 7.585199999999999e-07, + "loss": 0.0688, + "step": 18975 + }, + { + "epoch": 8.870214752567694, + "grad_norm": 8.239870071411133, + "learning_rate": 7.595199999999999e-07, + "loss": 0.1033, + "step": 19000 + }, + { + "epoch": 8.881886087768441, + "grad_norm": 9.038163185119629, + "learning_rate": 7.6052e-07, + "loss": 0.0853, + "step": 19025 + }, + { + "epoch": 8.893557422969188, + "grad_norm": 3.644261598587036, + "learning_rate": 7.6152e-07, + "loss": 0.0984, + "step": 19050 + }, + { + "epoch": 8.905228758169935, + "grad_norm": 4.2948832511901855, + "learning_rate": 7.625199999999999e-07, + "loss": 0.0708, + "step": 19075 + }, + { + "epoch": 8.916900093370682, + "grad_norm": 3.6126201152801514, + "learning_rate": 7.635199999999999e-07, + "loss": 0.1205, + "step": 19100 + }, + { + "epoch": 8.928571428571429, + "grad_norm": 5.034170150756836, + "learning_rate": 7.6452e-07, + "loss": 0.0727, + "step": 19125 + }, + { + "epoch": 8.940242763772176, + "grad_norm": 3.311403274536133, + "learning_rate": 7.6548e-07, + "loss": 0.1083, + "step": 19150 + }, + { + "epoch": 8.951914098972923, + "grad_norm": 6.666570663452148, + "learning_rate": 7.6648e-07, + "loss": 0.0808, + "step": 19175 + }, + { + "epoch": 8.96358543417367, + "grad_norm": 2.835561513900757, + "learning_rate": 7.6748e-07, + "loss": 0.098, + "step": 19200 + }, + { + "epoch": 8.975256769374417, + "grad_norm": 3.3188419342041016, + "learning_rate": 7.6848e-07, + "loss": 0.0801, + "step": 19225 + }, + { + "epoch": 8.986928104575163, + "grad_norm": 4.5640788078308105, + "learning_rate": 7.6948e-07, + "loss": 0.102, + "step": 19250 + }, + { + "epoch": 8.99859943977591, + "grad_norm": 9.698177337646484, + "learning_rate": 7.704800000000001e-07, + "loss": 0.1022, + "step": 19275 + }, + { + "epoch": 9.010270774976657, + "grad_norm": 3.567379951477051, + "learning_rate": 7.7148e-07, + "loss": 0.0744, + "step": 19300 + }, + { + "epoch": 9.021942110177404, + "grad_norm": 5.79501485824585, + "learning_rate": 7.7248e-07, + "loss": 0.0616, + "step": 19325 + }, + { + "epoch": 9.033613445378151, + "grad_norm": 3.6958703994750977, + "learning_rate": 7.7348e-07, + "loss": 0.0797, + "step": 19350 + }, + { + "epoch": 9.045284780578898, + "grad_norm": 8.376121520996094, + "learning_rate": 7.744799999999999e-07, + "loss": 0.0573, + "step": 19375 + }, + { + "epoch": 9.056956115779645, + "grad_norm": 4.162479877471924, + "learning_rate": 7.754799999999999e-07, + "loss": 0.0947, + "step": 19400 + }, + { + "epoch": 9.068627450980392, + "grad_norm": 6.144433498382568, + "learning_rate": 7.764799999999999e-07, + "loss": 0.0545, + "step": 19425 + }, + { + "epoch": 9.080298786181139, + "grad_norm": 4.648292541503906, + "learning_rate": 7.774799999999999e-07, + "loss": 0.0824, + "step": 19450 + }, + { + "epoch": 9.091970121381886, + "grad_norm": 5.517236709594727, + "learning_rate": 7.784799999999999e-07, + "loss": 0.0697, + "step": 19475 + }, + { + "epoch": 9.103641456582633, + "grad_norm": 6.232855796813965, + "learning_rate": 7.794799999999999e-07, + "loss": 0.0764, + "step": 19500 + }, + { + "epoch": 9.11531279178338, + "grad_norm": 6.580794811248779, + "learning_rate": 7.8048e-07, + "loss": 0.0756, + "step": 19525 + }, + { + "epoch": 9.126984126984127, + "grad_norm": 5.505826950073242, + "learning_rate": 7.814799999999999e-07, + "loss": 0.0936, + "step": 19550 + }, + { + "epoch": 9.138655462184873, + "grad_norm": 13.38469409942627, + "learning_rate": 7.824799999999999e-07, + "loss": 0.0711, + "step": 19575 + }, + { + "epoch": 9.15032679738562, + "grad_norm": 8.618947982788086, + "learning_rate": 7.834799999999999e-07, + "loss": 0.0752, + "step": 19600 + }, + { + "epoch": 9.161998132586367, + "grad_norm": 7.913994789123535, + "learning_rate": 7.844799999999999e-07, + "loss": 0.0594, + "step": 19625 + }, + { + "epoch": 9.173669467787114, + "grad_norm": 2.6098523139953613, + "learning_rate": 7.854799999999999e-07, + "loss": 0.0849, + "step": 19650 + }, + { + "epoch": 9.185340802987861, + "grad_norm": 6.636572360992432, + "learning_rate": 7.864799999999999e-07, + "loss": 0.0559, + "step": 19675 + }, + { + "epoch": 9.197012138188608, + "grad_norm": 2.971862554550171, + "learning_rate": 7.8748e-07, + "loss": 0.0908, + "step": 19700 + }, + { + "epoch": 9.208683473389355, + "grad_norm": 9.039565086364746, + "learning_rate": 7.884799999999999e-07, + "loss": 0.0642, + "step": 19725 + }, + { + "epoch": 9.220354808590102, + "grad_norm": 2.7158315181732178, + "learning_rate": 7.894799999999999e-07, + "loss": 0.0642, + "step": 19750 + }, + { + "epoch": 9.232026143790849, + "grad_norm": 5.9499921798706055, + "learning_rate": 7.9048e-07, + "loss": 0.0608, + "step": 19775 + }, + { + "epoch": 9.243697478991596, + "grad_norm": 8.955631256103516, + "learning_rate": 7.914799999999999e-07, + "loss": 0.0813, + "step": 19800 + }, + { + "epoch": 9.255368814192344, + "grad_norm": 8.660055160522461, + "learning_rate": 7.924799999999999e-07, + "loss": 0.0663, + "step": 19825 + }, + { + "epoch": 9.267040149393091, + "grad_norm": 4.199616432189941, + "learning_rate": 7.934799999999999e-07, + "loss": 0.0805, + "step": 19850 + }, + { + "epoch": 9.278711484593838, + "grad_norm": 4.530280590057373, + "learning_rate": 7.9448e-07, + "loss": 0.0539, + "step": 19875 + }, + { + "epoch": 9.290382819794585, + "grad_norm": 3.6157238483428955, + "learning_rate": 7.954799999999999e-07, + "loss": 0.0789, + "step": 19900 + }, + { + "epoch": 9.302054154995332, + "grad_norm": 11.870729446411133, + "learning_rate": 7.964799999999999e-07, + "loss": 0.0753, + "step": 19925 + }, + { + "epoch": 9.313725490196079, + "grad_norm": 3.857879400253296, + "learning_rate": 7.9748e-07, + "loss": 0.0804, + "step": 19950 + }, + { + "epoch": 9.325396825396826, + "grad_norm": 9.552889823913574, + "learning_rate": 7.984799999999999e-07, + "loss": 0.0696, + "step": 19975 + }, + { + "epoch": 9.337068160597573, + "grad_norm": 4.108628749847412, + "learning_rate": 7.994799999999999e-07, + "loss": 0.0697, + "step": 20000 + }, + { + "epoch": 9.337068160597573, + "eval_loss": 0.1435898393392563, + "eval_runtime": 6476.4775, + "eval_samples_per_second": 1.454, + "eval_steps_per_second": 0.182, + "eval_wer": 0.0940416599386404, + "step": 20000 + }, + { + "epoch": 9.34873949579832, + "grad_norm": 16.435834884643555, + "learning_rate": 8.0048e-07, + "loss": 0.0697, + "step": 20025 + }, + { + "epoch": 9.360410830999067, + "grad_norm": 2.7345032691955566, + "learning_rate": 8.0148e-07, + "loss": 0.0808, + "step": 20050 + }, + { + "epoch": 9.372082166199814, + "grad_norm": 7.900310039520264, + "learning_rate": 8.024799999999999e-07, + "loss": 0.0642, + "step": 20075 + }, + { + "epoch": 9.38375350140056, + "grad_norm": 5.272299289703369, + "learning_rate": 8.034799999999999e-07, + "loss": 0.0912, + "step": 20100 + }, + { + "epoch": 9.395424836601308, + "grad_norm": 7.384624004364014, + "learning_rate": 8.0448e-07, + "loss": 0.0689, + "step": 20125 + }, + { + "epoch": 9.407096171802054, + "grad_norm": 6.52332067489624, + "learning_rate": 8.054799999999999e-07, + "loss": 0.0758, + "step": 20150 + }, + { + "epoch": 9.418767507002801, + "grad_norm": 7.095821380615234, + "learning_rate": 8.064799999999999e-07, + "loss": 0.0928, + "step": 20175 + }, + { + "epoch": 9.430438842203548, + "grad_norm": 7.10612154006958, + "learning_rate": 8.0748e-07, + "loss": 0.0813, + "step": 20200 + }, + { + "epoch": 9.442110177404295, + "grad_norm": 2.9239766597747803, + "learning_rate": 8.084799999999999e-07, + "loss": 0.0512, + "step": 20225 + }, + { + "epoch": 9.453781512605042, + "grad_norm": 5.488339424133301, + "learning_rate": 8.094799999999999e-07, + "loss": 0.1023, + "step": 20250 + }, + { + "epoch": 9.465452847805789, + "grad_norm": 14.871928215026855, + "learning_rate": 8.1048e-07, + "loss": 0.0733, + "step": 20275 + }, + { + "epoch": 9.477124183006536, + "grad_norm": 4.915029525756836, + "learning_rate": 8.1148e-07, + "loss": 0.0781, + "step": 20300 + }, + { + "epoch": 9.488795518207283, + "grad_norm": 11.16457748413086, + "learning_rate": 8.124799999999999e-07, + "loss": 0.0824, + "step": 20325 + }, + { + "epoch": 9.50046685340803, + "grad_norm": 4.7055535316467285, + "learning_rate": 8.134799999999999e-07, + "loss": 0.0715, + "step": 20350 + }, + { + "epoch": 9.512138188608777, + "grad_norm": 9.46976375579834, + "learning_rate": 8.1448e-07, + "loss": 0.0622, + "step": 20375 + }, + { + "epoch": 9.523809523809524, + "grad_norm": 4.587946891784668, + "learning_rate": 8.154799999999999e-07, + "loss": 0.0973, + "step": 20400 + }, + { + "epoch": 9.53548085901027, + "grad_norm": 10.611247062683105, + "learning_rate": 8.164799999999999e-07, + "loss": 0.0701, + "step": 20425 + }, + { + "epoch": 9.547152194211018, + "grad_norm": 5.382807731628418, + "learning_rate": 8.1748e-07, + "loss": 0.0799, + "step": 20450 + }, + { + "epoch": 9.558823529411764, + "grad_norm": 12.089332580566406, + "learning_rate": 8.1848e-07, + "loss": 0.0705, + "step": 20475 + }, + { + "epoch": 9.570494864612511, + "grad_norm": 3.3015291690826416, + "learning_rate": 8.194799999999999e-07, + "loss": 0.0713, + "step": 20500 + }, + { + "epoch": 9.582166199813258, + "grad_norm": 7.883571147918701, + "learning_rate": 8.2048e-07, + "loss": 0.077, + "step": 20525 + }, + { + "epoch": 9.593837535014005, + "grad_norm": 5.369983673095703, + "learning_rate": 8.2148e-07, + "loss": 0.0812, + "step": 20550 + }, + { + "epoch": 9.605508870214752, + "grad_norm": 11.33123779296875, + "learning_rate": 8.224799999999999e-07, + "loss": 0.077, + "step": 20575 + }, + { + "epoch": 9.6171802054155, + "grad_norm": 6.490606307983398, + "learning_rate": 8.234799999999999e-07, + "loss": 0.082, + "step": 20600 + }, + { + "epoch": 9.628851540616246, + "grad_norm": 13.423641204833984, + "learning_rate": 8.2448e-07, + "loss": 0.0616, + "step": 20625 + }, + { + "epoch": 9.640522875816993, + "grad_norm": 5.520218372344971, + "learning_rate": 8.2548e-07, + "loss": 0.0772, + "step": 20650 + }, + { + "epoch": 9.65219421101774, + "grad_norm": 7.892084121704102, + "learning_rate": 8.264799999999999e-07, + "loss": 0.0821, + "step": 20675 + }, + { + "epoch": 9.663865546218487, + "grad_norm": 1.9956510066986084, + "learning_rate": 8.2748e-07, + "loss": 0.0863, + "step": 20700 + }, + { + "epoch": 9.675536881419234, + "grad_norm": 12.01615047454834, + "learning_rate": 8.2848e-07, + "loss": 0.0686, + "step": 20725 + }, + { + "epoch": 9.68720821661998, + "grad_norm": 4.383852481842041, + "learning_rate": 8.294799999999999e-07, + "loss": 0.0783, + "step": 20750 + }, + { + "epoch": 9.698879551820728, + "grad_norm": 9.427849769592285, + "learning_rate": 8.3048e-07, + "loss": 0.0696, + "step": 20775 + }, + { + "epoch": 9.710550887021475, + "grad_norm": 3.1968441009521484, + "learning_rate": 8.3148e-07, + "loss": 0.0824, + "step": 20800 + }, + { + "epoch": 9.722222222222221, + "grad_norm": 5.970606803894043, + "learning_rate": 8.3248e-07, + "loss": 0.0745, + "step": 20825 + }, + { + "epoch": 9.733893557422968, + "grad_norm": 3.773395538330078, + "learning_rate": 8.334799999999999e-07, + "loss": 0.087, + "step": 20850 + }, + { + "epoch": 9.745564892623715, + "grad_norm": 12.782556533813477, + "learning_rate": 8.3448e-07, + "loss": 0.0669, + "step": 20875 + }, + { + "epoch": 9.757236227824464, + "grad_norm": 3.228957176208496, + "learning_rate": 8.3548e-07, + "loss": 0.0844, + "step": 20900 + }, + { + "epoch": 9.768907563025211, + "grad_norm": 9.885223388671875, + "learning_rate": 8.364799999999999e-07, + "loss": 0.0724, + "step": 20925 + }, + { + "epoch": 9.780578898225958, + "grad_norm": 3.015071153640747, + "learning_rate": 8.3748e-07, + "loss": 0.0784, + "step": 20950 + }, + { + "epoch": 9.792250233426705, + "grad_norm": 7.431763172149658, + "learning_rate": 8.3848e-07, + "loss": 0.0782, + "step": 20975 + }, + { + "epoch": 9.803921568627452, + "grad_norm": 3.960148334503174, + "learning_rate": 8.394799999999999e-07, + "loss": 0.0765, + "step": 21000 + }, + { + "epoch": 9.815592903828199, + "grad_norm": 11.450105667114258, + "learning_rate": 8.4048e-07, + "loss": 0.0757, + "step": 21025 + }, + { + "epoch": 9.827264239028946, + "grad_norm": 2.3904082775115967, + "learning_rate": 8.4148e-07, + "loss": 0.0887, + "step": 21050 + }, + { + "epoch": 9.838935574229692, + "grad_norm": 5.4470086097717285, + "learning_rate": 8.4248e-07, + "loss": 0.0824, + "step": 21075 + }, + { + "epoch": 9.85060690943044, + "grad_norm": 2.513823986053467, + "learning_rate": 8.434799999999999e-07, + "loss": 0.0729, + "step": 21100 + }, + { + "epoch": 9.862278244631186, + "grad_norm": 14.719958305358887, + "learning_rate": 8.4448e-07, + "loss": 0.0705, + "step": 21125 + }, + { + "epoch": 9.873949579831933, + "grad_norm": 5.428534984588623, + "learning_rate": 8.4548e-07, + "loss": 0.0861, + "step": 21150 + }, + { + "epoch": 9.88562091503268, + "grad_norm": 5.955714702606201, + "learning_rate": 8.464799999999999e-07, + "loss": 0.0672, + "step": 21175 + }, + { + "epoch": 9.897292250233427, + "grad_norm": 7.154689788818359, + "learning_rate": 8.4744e-07, + "loss": 0.0954, + "step": 21200 + }, + { + "epoch": 9.908963585434174, + "grad_norm": 7.696076393127441, + "learning_rate": 8.484399999999999e-07, + "loss": 0.0617, + "step": 21225 + }, + { + "epoch": 9.920634920634921, + "grad_norm": 5.049380779266357, + "learning_rate": 8.494399999999999e-07, + "loss": 0.0775, + "step": 21250 + }, + { + "epoch": 9.932306255835668, + "grad_norm": 9.34526252746582, + "learning_rate": 8.5044e-07, + "loss": 0.0749, + "step": 21275 + }, + { + "epoch": 9.943977591036415, + "grad_norm": 7.990287780761719, + "learning_rate": 8.5144e-07, + "loss": 0.0865, + "step": 21300 + }, + { + "epoch": 9.955648926237162, + "grad_norm": 10.391454696655273, + "learning_rate": 8.524399999999999e-07, + "loss": 0.0658, + "step": 21325 + }, + { + "epoch": 9.967320261437909, + "grad_norm": 5.023215293884277, + "learning_rate": 8.534399999999999e-07, + "loss": 0.0713, + "step": 21350 + }, + { + "epoch": 9.978991596638656, + "grad_norm": 12.558219909667969, + "learning_rate": 8.5444e-07, + "loss": 0.0764, + "step": 21375 + }, + { + "epoch": 9.990662931839402, + "grad_norm": 4.4138336181640625, + "learning_rate": 8.554399999999999e-07, + "loss": 0.076, + "step": 21400 + }, + { + "epoch": 10.00233426704015, + "grad_norm": 2.9889421463012695, + "learning_rate": 8.564399999999999e-07, + "loss": 0.0805, + "step": 21425 + }, + { + "epoch": 10.014005602240896, + "grad_norm": 3.3753228187561035, + "learning_rate": 8.5744e-07, + "loss": 0.0487, + "step": 21450 + }, + { + "epoch": 10.025676937441643, + "grad_norm": 3.1645426750183105, + "learning_rate": 8.5844e-07, + "loss": 0.0719, + "step": 21475 + }, + { + "epoch": 10.03734827264239, + "grad_norm": 9.965222358703613, + "learning_rate": 8.594399999999999e-07, + "loss": 0.0453, + "step": 21500 + }, + { + "epoch": 10.049019607843137, + "grad_norm": 4.585392475128174, + "learning_rate": 8.6044e-07, + "loss": 0.0813, + "step": 21525 + }, + { + "epoch": 10.060690943043884, + "grad_norm": 2.651890277862549, + "learning_rate": 8.6144e-07, + "loss": 0.0441, + "step": 21550 + }, + { + "epoch": 10.072362278244631, + "grad_norm": 3.137420415878296, + "learning_rate": 8.624399999999999e-07, + "loss": 0.0776, + "step": 21575 + }, + { + "epoch": 10.084033613445378, + "grad_norm": 3.0008487701416016, + "learning_rate": 8.634399999999999e-07, + "loss": 0.0549, + "step": 21600 + }, + { + "epoch": 10.095704948646125, + "grad_norm": 5.419103145599365, + "learning_rate": 8.6444e-07, + "loss": 0.0807, + "step": 21625 + }, + { + "epoch": 10.107376283846872, + "grad_norm": 4.442772388458252, + "learning_rate": 8.654399999999999e-07, + "loss": 0.0473, + "step": 21650 + }, + { + "epoch": 10.119047619047619, + "grad_norm": 7.645913600921631, + "learning_rate": 8.664399999999999e-07, + "loss": 0.0832, + "step": 21675 + }, + { + "epoch": 10.130718954248366, + "grad_norm": 5.763233184814453, + "learning_rate": 8.6744e-07, + "loss": 0.0564, + "step": 21700 + }, + { + "epoch": 10.142390289449112, + "grad_norm": 2.8492894172668457, + "learning_rate": 8.6844e-07, + "loss": 0.0722, + "step": 21725 + }, + { + "epoch": 10.15406162464986, + "grad_norm": 5.006544589996338, + "learning_rate": 8.694399999999999e-07, + "loss": 0.0494, + "step": 21750 + }, + { + "epoch": 10.165732959850606, + "grad_norm": 3.509387731552124, + "learning_rate": 8.7044e-07, + "loss": 0.0764, + "step": 21775 + }, + { + "epoch": 10.177404295051353, + "grad_norm": 4.277322769165039, + "learning_rate": 8.7144e-07, + "loss": 0.0387, + "step": 21800 + }, + { + "epoch": 10.1890756302521, + "grad_norm": 3.3739705085754395, + "learning_rate": 8.724399999999999e-07, + "loss": 0.0558, + "step": 21825 + }, + { + "epoch": 10.200746965452847, + "grad_norm": 2.4098832607269287, + "learning_rate": 8.734399999999999e-07, + "loss": 0.041, + "step": 21850 + }, + { + "epoch": 10.212418300653594, + "grad_norm": 4.2443108558654785, + "learning_rate": 8.7444e-07, + "loss": 0.0614, + "step": 21875 + }, + { + "epoch": 10.224089635854341, + "grad_norm": 7.186864376068115, + "learning_rate": 8.7544e-07, + "loss": 0.0478, + "step": 21900 + }, + { + "epoch": 10.235760971055088, + "grad_norm": 3.2214882373809814, + "learning_rate": 8.764399999999999e-07, + "loss": 0.0596, + "step": 21925 + }, + { + "epoch": 10.247432306255835, + "grad_norm": 1.4043220281600952, + "learning_rate": 8.7744e-07, + "loss": 0.0558, + "step": 21950 + }, + { + "epoch": 10.259103641456583, + "grad_norm": 3.13395357131958, + "learning_rate": 8.7844e-07, + "loss": 0.0751, + "step": 21975 + }, + { + "epoch": 10.27077497665733, + "grad_norm": 3.777238368988037, + "learning_rate": 8.794399999999999e-07, + "loss": 0.0449, + "step": 22000 + }, + { + "epoch": 10.282446311858077, + "grad_norm": 3.8136322498321533, + "learning_rate": 8.804399999999999e-07, + "loss": 0.0741, + "step": 22025 + }, + { + "epoch": 10.294117647058824, + "grad_norm": 2.8861405849456787, + "learning_rate": 8.8144e-07, + "loss": 0.0514, + "step": 22050 + }, + { + "epoch": 10.305788982259571, + "grad_norm": 4.029909133911133, + "learning_rate": 8.8244e-07, + "loss": 0.0715, + "step": 22075 + }, + { + "epoch": 10.317460317460318, + "grad_norm": 9.571359634399414, + "learning_rate": 8.834399999999999e-07, + "loss": 0.0442, + "step": 22100 + }, + { + "epoch": 10.329131652661065, + "grad_norm": 4.114884376525879, + "learning_rate": 8.8444e-07, + "loss": 0.0752, + "step": 22125 + }, + { + "epoch": 10.340802987861812, + "grad_norm": 4.394742965698242, + "learning_rate": 8.8544e-07, + "loss": 0.046, + "step": 22150 + }, + { + "epoch": 10.352474323062559, + "grad_norm": 3.1479573249816895, + "learning_rate": 8.864399999999999e-07, + "loss": 0.067, + "step": 22175 + }, + { + "epoch": 10.364145658263306, + "grad_norm": 2.1959614753723145, + "learning_rate": 8.8744e-07, + "loss": 0.0495, + "step": 22200 + }, + { + "epoch": 10.375816993464053, + "grad_norm": 4.033957481384277, + "learning_rate": 8.8844e-07, + "loss": 0.0587, + "step": 22225 + }, + { + "epoch": 10.3874883286648, + "grad_norm": 6.8924736976623535, + "learning_rate": 8.8944e-07, + "loss": 0.0517, + "step": 22250 + }, + { + "epoch": 10.399159663865547, + "grad_norm": 3.8382160663604736, + "learning_rate": 8.904399999999999e-07, + "loss": 0.0678, + "step": 22275 + }, + { + "epoch": 10.410830999066294, + "grad_norm": 10.509393692016602, + "learning_rate": 8.9144e-07, + "loss": 0.048, + "step": 22300 + }, + { + "epoch": 10.42250233426704, + "grad_norm": 3.9929986000061035, + "learning_rate": 8.9244e-07, + "loss": 0.0707, + "step": 22325 + }, + { + "epoch": 10.434173669467787, + "grad_norm": 4.263001918792725, + "learning_rate": 8.934399999999999e-07, + "loss": 0.0534, + "step": 22350 + }, + { + "epoch": 10.445845004668534, + "grad_norm": 4.735872745513916, + "learning_rate": 8.9444e-07, + "loss": 0.0714, + "step": 22375 + }, + { + "epoch": 10.457516339869281, + "grad_norm": 2.775026321411133, + "learning_rate": 8.9544e-07, + "loss": 0.0599, + "step": 22400 + }, + { + "epoch": 10.469187675070028, + "grad_norm": 4.69821834564209, + "learning_rate": 8.964399999999999e-07, + "loss": 0.0679, + "step": 22425 + }, + { + "epoch": 10.480859010270775, + "grad_norm": 4.804164886474609, + "learning_rate": 8.9744e-07, + "loss": 0.0382, + "step": 22450 + }, + { + "epoch": 10.492530345471522, + "grad_norm": 4.525900840759277, + "learning_rate": 8.9844e-07, + "loss": 0.0723, + "step": 22475 + }, + { + "epoch": 10.504201680672269, + "grad_norm": 9.724448204040527, + "learning_rate": 8.9944e-07, + "loss": 0.0558, + "step": 22500 + }, + { + "epoch": 10.515873015873016, + "grad_norm": 3.797886848449707, + "learning_rate": 9.004399999999999e-07, + "loss": 0.0665, + "step": 22525 + }, + { + "epoch": 10.527544351073763, + "grad_norm": 4.131737232208252, + "learning_rate": 9.0144e-07, + "loss": 0.0463, + "step": 22550 + }, + { + "epoch": 10.53921568627451, + "grad_norm": 4.074097633361816, + "learning_rate": 9.0244e-07, + "loss": 0.0631, + "step": 22575 + }, + { + "epoch": 10.550887021475257, + "grad_norm": 6.83477258682251, + "learning_rate": 9.034399999999999e-07, + "loss": 0.0453, + "step": 22600 + }, + { + "epoch": 10.562558356676004, + "grad_norm": 4.893357276916504, + "learning_rate": 9.0444e-07, + "loss": 0.0579, + "step": 22625 + }, + { + "epoch": 10.57422969187675, + "grad_norm": 3.0031166076660156, + "learning_rate": 9.0544e-07, + "loss": 0.0468, + "step": 22650 + }, + { + "epoch": 10.585901027077497, + "grad_norm": 2.353924036026001, + "learning_rate": 9.0644e-07, + "loss": 0.0769, + "step": 22675 + }, + { + "epoch": 10.597572362278244, + "grad_norm": 4.528254985809326, + "learning_rate": 9.0744e-07, + "loss": 0.0457, + "step": 22700 + }, + { + "epoch": 10.609243697478991, + "grad_norm": 3.255937099456787, + "learning_rate": 9.0844e-07, + "loss": 0.07, + "step": 22725 + }, + { + "epoch": 10.620915032679738, + "grad_norm": 5.159088611602783, + "learning_rate": 9.0944e-07, + "loss": 0.0592, + "step": 22750 + }, + { + "epoch": 10.632586367880485, + "grad_norm": 5.292705535888672, + "learning_rate": 9.104399999999999e-07, + "loss": 0.075, + "step": 22775 + }, + { + "epoch": 10.644257703081232, + "grad_norm": 2.7910406589508057, + "learning_rate": 9.1144e-07, + "loss": 0.0512, + "step": 22800 + }, + { + "epoch": 10.655929038281979, + "grad_norm": 2.8850274085998535, + "learning_rate": 9.1244e-07, + "loss": 0.0531, + "step": 22825 + }, + { + "epoch": 10.667600373482726, + "grad_norm": 3.195202589035034, + "learning_rate": 9.1344e-07, + "loss": 0.0511, + "step": 22850 + }, + { + "epoch": 10.679271708683473, + "grad_norm": 3.8003671169281006, + "learning_rate": 9.1444e-07, + "loss": 0.0758, + "step": 22875 + }, + { + "epoch": 10.69094304388422, + "grad_norm": 3.130300998687744, + "learning_rate": 9.1544e-07, + "loss": 0.0522, + "step": 22900 + }, + { + "epoch": 10.702614379084967, + "grad_norm": 4.510003089904785, + "learning_rate": 9.1644e-07, + "loss": 0.0843, + "step": 22925 + }, + { + "epoch": 10.714285714285714, + "grad_norm": 6.213229656219482, + "learning_rate": 9.1744e-07, + "loss": 0.0481, + "step": 22950 + }, + { + "epoch": 10.72595704948646, + "grad_norm": 2.293363094329834, + "learning_rate": 9.1844e-07, + "loss": 0.0604, + "step": 22975 + }, + { + "epoch": 10.73762838468721, + "grad_norm": 1.0174866914749146, + "learning_rate": 9.1944e-07, + "loss": 0.0501, + "step": 23000 + }, + { + "epoch": 10.749299719887954, + "grad_norm": 5.294317722320557, + "learning_rate": 9.2044e-07, + "loss": 0.0734, + "step": 23025 + }, + { + "epoch": 10.760971055088703, + "grad_norm": 3.6734204292297363, + "learning_rate": 9.2144e-07, + "loss": 0.0492, + "step": 23050 + }, + { + "epoch": 10.77264239028945, + "grad_norm": 3.0096030235290527, + "learning_rate": 9.2244e-07, + "loss": 0.0517, + "step": 23075 + }, + { + "epoch": 10.784313725490197, + "grad_norm": 4.717250347137451, + "learning_rate": 9.2344e-07, + "loss": 0.0447, + "step": 23100 + }, + { + "epoch": 10.795985060690944, + "grad_norm": 3.78305983543396, + "learning_rate": 9.2444e-07, + "loss": 0.0804, + "step": 23125 + }, + { + "epoch": 10.80765639589169, + "grad_norm": 6.005961894989014, + "learning_rate": 9.2544e-07, + "loss": 0.0535, + "step": 23150 + }, + { + "epoch": 10.819327731092438, + "grad_norm": 5.146392345428467, + "learning_rate": 9.2644e-07, + "loss": 0.069, + "step": 23175 + }, + { + "epoch": 10.830999066293185, + "grad_norm": 6.436806678771973, + "learning_rate": 9.2744e-07, + "loss": 0.0532, + "step": 23200 + }, + { + "epoch": 10.842670401493931, + "grad_norm": 3.9533166885375977, + "learning_rate": 9.2844e-07, + "loss": 0.0761, + "step": 23225 + }, + { + "epoch": 10.854341736694678, + "grad_norm": 4.497999668121338, + "learning_rate": 9.2944e-07, + "loss": 0.0498, + "step": 23250 + }, + { + "epoch": 10.866013071895425, + "grad_norm": 3.916146755218506, + "learning_rate": 9.3044e-07, + "loss": 0.0707, + "step": 23275 + }, + { + "epoch": 10.877684407096172, + "grad_norm": 1.4552559852600098, + "learning_rate": 9.3144e-07, + "loss": 0.049, + "step": 23300 + }, + { + "epoch": 10.88935574229692, + "grad_norm": 4.581323146820068, + "learning_rate": 9.3244e-07, + "loss": 0.0679, + "step": 23325 + }, + { + "epoch": 10.901027077497666, + "grad_norm": 9.022045135498047, + "learning_rate": 9.3344e-07, + "loss": 0.0452, + "step": 23350 + }, + { + "epoch": 10.912698412698413, + "grad_norm": 2.782165288925171, + "learning_rate": 9.3444e-07, + "loss": 0.0697, + "step": 23375 + }, + { + "epoch": 10.92436974789916, + "grad_norm": 2.9013919830322266, + "learning_rate": 9.3544e-07, + "loss": 0.0539, + "step": 23400 + }, + { + "epoch": 10.936041083099907, + "grad_norm": 5.128820419311523, + "learning_rate": 9.3644e-07, + "loss": 0.0659, + "step": 23425 + }, + { + "epoch": 10.947712418300654, + "grad_norm": 3.9068808555603027, + "learning_rate": 9.3744e-07, + "loss": 0.0461, + "step": 23450 + }, + { + "epoch": 10.9593837535014, + "grad_norm": 3.185457229614258, + "learning_rate": 9.3844e-07, + "loss": 0.0737, + "step": 23475 + }, + { + "epoch": 10.971055088702148, + "grad_norm": 6.052894592285156, + "learning_rate": 9.3944e-07, + "loss": 0.0569, + "step": 23500 + }, + { + "epoch": 10.982726423902895, + "grad_norm": 3.6629199981689453, + "learning_rate": 9.4044e-07, + "loss": 0.0621, + "step": 23525 + }, + { + "epoch": 10.994397759103641, + "grad_norm": 1.8605912923812866, + "learning_rate": 9.4144e-07, + "loss": 0.0487, + "step": 23550 + }, + { + "epoch": 11.006069094304388, + "grad_norm": 3.8178212642669678, + "learning_rate": 9.424e-07, + "loss": 0.0691, + "step": 23575 + }, + { + "epoch": 11.017740429505135, + "grad_norm": 20.160478591918945, + "learning_rate": 9.433999999999999e-07, + "loss": 0.0342, + "step": 23600 + }, + { + "epoch": 11.029411764705882, + "grad_norm": 2.548696756362915, + "learning_rate": 9.444e-07, + "loss": 0.0535, + "step": 23625 + }, + { + "epoch": 11.04108309990663, + "grad_norm": 6.730154514312744, + "learning_rate": 9.454e-07, + "loss": 0.0303, + "step": 23650 + }, + { + "epoch": 11.052754435107376, + "grad_norm": 5.233523368835449, + "learning_rate": 9.464e-07, + "loss": 0.0631, + "step": 23675 + }, + { + "epoch": 11.064425770308123, + "grad_norm": 5.212557315826416, + "learning_rate": 9.474e-07, + "loss": 0.0376, + "step": 23700 + }, + { + "epoch": 11.07609710550887, + "grad_norm": 3.276489019393921, + "learning_rate": 9.484e-07, + "loss": 0.052, + "step": 23725 + }, + { + "epoch": 11.087768440709617, + "grad_norm": 1.3591026067733765, + "learning_rate": 9.494e-07, + "loss": 0.0376, + "step": 23750 + }, + { + "epoch": 11.099439775910364, + "grad_norm": 5.116319179534912, + "learning_rate": 9.503999999999999e-07, + "loss": 0.0549, + "step": 23775 + }, + { + "epoch": 11.11111111111111, + "grad_norm": 3.6078543663024902, + "learning_rate": 9.514e-07, + "loss": 0.0306, + "step": 23800 + }, + { + "epoch": 11.122782446311858, + "grad_norm": 1.3853808641433716, + "learning_rate": 9.524e-07, + "loss": 0.0554, + "step": 23825 + }, + { + "epoch": 11.134453781512605, + "grad_norm": 1.325642704963684, + "learning_rate": 9.534e-07, + "loss": 0.0316, + "step": 23850 + }, + { + "epoch": 11.146125116713351, + "grad_norm": 3.342453718185425, + "learning_rate": 9.544e-07, + "loss": 0.0565, + "step": 23875 + }, + { + "epoch": 11.157796451914098, + "grad_norm": 5.374787330627441, + "learning_rate": 9.554e-07, + "loss": 0.0371, + "step": 23900 + }, + { + "epoch": 11.169467787114845, + "grad_norm": 4.615719318389893, + "learning_rate": 9.564e-07, + "loss": 0.0464, + "step": 23925 + }, + { + "epoch": 11.181139122315592, + "grad_norm": 5.073611259460449, + "learning_rate": 9.574e-07, + "loss": 0.0374, + "step": 23950 + }, + { + "epoch": 11.19281045751634, + "grad_norm": 4.32796049118042, + "learning_rate": 9.584e-07, + "loss": 0.0623, + "step": 23975 + }, + { + "epoch": 11.204481792717086, + "grad_norm": 4.874443054199219, + "learning_rate": 9.594e-07, + "loss": 0.0333, + "step": 24000 + }, + { + "epoch": 11.216153127917833, + "grad_norm": 3.2107975482940674, + "learning_rate": 9.604e-07, + "loss": 0.0522, + "step": 24025 + }, + { + "epoch": 11.22782446311858, + "grad_norm": 2.836677312850952, + "learning_rate": 9.614e-07, + "loss": 0.0288, + "step": 24050 + }, + { + "epoch": 11.239495798319327, + "grad_norm": 1.4697824716567993, + "learning_rate": 9.624e-07, + "loss": 0.0602, + "step": 24075 + }, + { + "epoch": 11.251167133520074, + "grad_norm": 3.0650887489318848, + "learning_rate": 9.634e-07, + "loss": 0.0289, + "step": 24100 + }, + { + "epoch": 11.262838468720823, + "grad_norm": 3.5631065368652344, + "learning_rate": 9.644e-07, + "loss": 0.057, + "step": 24125 + }, + { + "epoch": 11.27450980392157, + "grad_norm": 5.980957508087158, + "learning_rate": 9.654e-07, + "loss": 0.0368, + "step": 24150 + }, + { + "epoch": 11.286181139122316, + "grad_norm": 2.9237494468688965, + "learning_rate": 9.664e-07, + "loss": 0.0572, + "step": 24175 + }, + { + "epoch": 11.297852474323063, + "grad_norm": 1.2119998931884766, + "learning_rate": 9.674e-07, + "loss": 0.03, + "step": 24200 + }, + { + "epoch": 11.30952380952381, + "grad_norm": 3.5559473037719727, + "learning_rate": 9.684e-07, + "loss": 0.0667, + "step": 24225 + }, + { + "epoch": 11.321195144724557, + "grad_norm": 3.1826765537261963, + "learning_rate": 9.694e-07, + "loss": 0.0397, + "step": 24250 + }, + { + "epoch": 11.332866479925304, + "grad_norm": 2.0330376625061035, + "learning_rate": 9.704e-07, + "loss": 0.0553, + "step": 24275 + }, + { + "epoch": 11.344537815126051, + "grad_norm": 4.895223617553711, + "learning_rate": 9.714e-07, + "loss": 0.0335, + "step": 24300 + }, + { + "epoch": 11.356209150326798, + "grad_norm": 3.050001621246338, + "learning_rate": 9.724e-07, + "loss": 0.0568, + "step": 24325 + }, + { + "epoch": 11.367880485527545, + "grad_norm": 6.4767560958862305, + "learning_rate": 9.734e-07, + "loss": 0.0404, + "step": 24350 + }, + { + "epoch": 11.379551820728292, + "grad_norm": 4.696669101715088, + "learning_rate": 9.744e-07, + "loss": 0.0641, + "step": 24375 + }, + { + "epoch": 11.391223155929039, + "grad_norm": 3.8657402992248535, + "learning_rate": 9.754e-07, + "loss": 0.0316, + "step": 24400 + }, + { + "epoch": 11.402894491129786, + "grad_norm": 4.243162155151367, + "learning_rate": 9.764e-07, + "loss": 0.0512, + "step": 24425 + }, + { + "epoch": 11.414565826330533, + "grad_norm": 6.798733234405518, + "learning_rate": 9.774e-07, + "loss": 0.0293, + "step": 24450 + }, + { + "epoch": 11.42623716153128, + "grad_norm": 2.7520804405212402, + "learning_rate": 9.784e-07, + "loss": 0.0513, + "step": 24475 + }, + { + "epoch": 11.437908496732026, + "grad_norm": 3.6984705924987793, + "learning_rate": 9.794e-07, + "loss": 0.0376, + "step": 24500 + }, + { + "epoch": 11.449579831932773, + "grad_norm": 10.192070960998535, + "learning_rate": 9.804e-07, + "loss": 0.0491, + "step": 24525 + }, + { + "epoch": 11.46125116713352, + "grad_norm": 3.717801332473755, + "learning_rate": 9.814e-07, + "loss": 0.0345, + "step": 24550 + }, + { + "epoch": 11.472922502334267, + "grad_norm": 5.155227184295654, + "learning_rate": 9.824e-07, + "loss": 0.0549, + "step": 24575 + }, + { + "epoch": 11.484593837535014, + "grad_norm": 5.626723766326904, + "learning_rate": 9.834e-07, + "loss": 0.0373, + "step": 24600 + }, + { + "epoch": 11.496265172735761, + "grad_norm": 3.582610607147217, + "learning_rate": 9.844e-07, + "loss": 0.0698, + "step": 24625 + }, + { + "epoch": 11.507936507936508, + "grad_norm": 4.073030948638916, + "learning_rate": 9.854e-07, + "loss": 0.0348, + "step": 24650 + }, + { + "epoch": 11.519607843137255, + "grad_norm": 5.026329517364502, + "learning_rate": 9.864e-07, + "loss": 0.0607, + "step": 24675 + }, + { + "epoch": 11.531279178338002, + "grad_norm": 3.416334390640259, + "learning_rate": 9.874e-07, + "loss": 0.0318, + "step": 24700 + }, + { + "epoch": 11.542950513538749, + "grad_norm": 2.8178179264068604, + "learning_rate": 9.884e-07, + "loss": 0.0524, + "step": 24725 + }, + { + "epoch": 11.554621848739496, + "grad_norm": 2.5382184982299805, + "learning_rate": 9.894e-07, + "loss": 0.0341, + "step": 24750 + }, + { + "epoch": 11.566293183940243, + "grad_norm": 4.652471542358398, + "learning_rate": 9.903999999999999e-07, + "loss": 0.065, + "step": 24775 + }, + { + "epoch": 11.57796451914099, + "grad_norm": 5.832390785217285, + "learning_rate": 9.914e-07, + "loss": 0.0317, + "step": 24800 + }, + { + "epoch": 11.589635854341736, + "grad_norm": 3.8395602703094482, + "learning_rate": 9.923999999999998e-07, + "loss": 0.0642, + "step": 24825 + }, + { + "epoch": 11.601307189542483, + "grad_norm": 5.006762981414795, + "learning_rate": 9.933999999999999e-07, + "loss": 0.0367, + "step": 24850 + }, + { + "epoch": 11.61297852474323, + "grad_norm": 3.3889858722686768, + "learning_rate": 9.944e-07, + "loss": 0.0567, + "step": 24875 + }, + { + "epoch": 11.624649859943977, + "grad_norm": 2.9701974391937256, + "learning_rate": 9.953999999999998e-07, + "loss": 0.0289, + "step": 24900 + }, + { + "epoch": 11.636321195144724, + "grad_norm": 3.798945903778076, + "learning_rate": 9.964e-07, + "loss": 0.0591, + "step": 24925 + }, + { + "epoch": 11.647992530345471, + "grad_norm": 3.1179754734039307, + "learning_rate": 9.974e-07, + "loss": 0.0286, + "step": 24950 + }, + { + "epoch": 11.659663865546218, + "grad_norm": 4.428998947143555, + "learning_rate": 9.983999999999998e-07, + "loss": 0.0668, + "step": 24975 + }, + { + "epoch": 11.671335200746965, + "grad_norm": 5.318285942077637, + "learning_rate": 9.994e-07, + "loss": 0.0374, + "step": 25000 + }, + { + "epoch": 11.671335200746965, + "eval_loss": 0.15606163442134857, + "eval_runtime": 6264.6896, + "eval_samples_per_second": 1.503, + "eval_steps_per_second": 0.188, + "eval_wer": 0.09723881802034555, + "step": 25000 + }, + { + "epoch": 11.671335200746965, + "step": 25000, + "total_flos": 4.081858297380864e+20, + "train_loss": 0.03221806969165802, + "train_runtime": 95538.6721, + "train_samples_per_second": 4.187, + "train_steps_per_second": 0.262 + } + ], + "logging_steps": 25, + "max_steps": 25000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.081858297380864e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}