diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5822 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.458447874814972, + "eval_steps": 1000, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010573059843518714, + "grad_norm": 58.068878173828125, + "learning_rate": 4.4e-07, + "loss": 3.1142, + "step": 25 + }, + { + "epoch": 0.02114611968703743, + "grad_norm": 37.93722152709961, + "learning_rate": 9.400000000000001e-07, + "loss": 2.5594, + "step": 50 + }, + { + "epoch": 0.03171917953055614, + "grad_norm": 22.363622665405273, + "learning_rate": 1.44e-06, + "loss": 1.6703, + "step": 75 + }, + { + "epoch": 0.04229223937407486, + "grad_norm": 15.931524276733398, + "learning_rate": 1.94e-06, + "loss": 1.0597, + "step": 100 + }, + { + "epoch": 0.05286529921759357, + "grad_norm": 13.918318748474121, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.765, + "step": 125 + }, + { + "epoch": 0.06343835906111228, + "grad_norm": 14.767135620117188, + "learning_rate": 2.9400000000000002e-06, + "loss": 0.6955, + "step": 150 + }, + { + "epoch": 0.074011418904631, + "grad_norm": 11.596781730651855, + "learning_rate": 3.44e-06, + "loss": 0.6118, + "step": 175 + }, + { + "epoch": 0.08458447874814971, + "grad_norm": 10.992063522338867, + "learning_rate": 3.94e-06, + "loss": 0.6096, + "step": 200 + }, + { + "epoch": 0.09515753859166842, + "grad_norm": 12.467724800109863, + "learning_rate": 4.440000000000001e-06, + "loss": 0.5579, + "step": 225 + }, + { + "epoch": 0.10573059843518715, + "grad_norm": 11.876873016357422, + "learning_rate": 4.94e-06, + "loss": 0.5435, + "step": 250 + }, + { + "epoch": 0.11630365827870585, + "grad_norm": 12.889082908630371, + "learning_rate": 5.4400000000000004e-06, + "loss": 0.5693, + "step": 275 + }, + { + "epoch": 0.12687671812222456, + "grad_norm": 10.841497421264648, + "learning_rate": 5.94e-06, + "loss": 0.5494, + "step": 300 + }, + { + "epoch": 0.13744977796574329, + "grad_norm": 10.560270309448242, + "learning_rate": 6.440000000000001e-06, + "loss": 0.5065, + "step": 325 + }, + { + "epoch": 0.148022837809262, + "grad_norm": 13.699323654174805, + "learning_rate": 6.9400000000000005e-06, + "loss": 0.4911, + "step": 350 + }, + { + "epoch": 0.1585958976527807, + "grad_norm": 9.73985481262207, + "learning_rate": 7.440000000000001e-06, + "loss": 0.5293, + "step": 375 + }, + { + "epoch": 0.16916895749629943, + "grad_norm": 11.41083812713623, + "learning_rate": 7.94e-06, + "loss": 0.4365, + "step": 400 + }, + { + "epoch": 0.17974201733981815, + "grad_norm": 10.309052467346191, + "learning_rate": 8.44e-06, + "loss": 0.5207, + "step": 425 + }, + { + "epoch": 0.19031507718333684, + "grad_norm": 10.460412979125977, + "learning_rate": 8.94e-06, + "loss": 0.4537, + "step": 450 + }, + { + "epoch": 0.20088813702685557, + "grad_norm": 10.022929191589355, + "learning_rate": 9.440000000000001e-06, + "loss": 0.4395, + "step": 475 + }, + { + "epoch": 0.2114611968703743, + "grad_norm": 10.307358741760254, + "learning_rate": 9.940000000000001e-06, + "loss": 0.4439, + "step": 500 + }, + { + "epoch": 0.222034256713893, + "grad_norm": 8.121826171875, + "learning_rate": 9.98871794871795e-06, + "loss": 0.4624, + "step": 525 + }, + { + "epoch": 0.2326073165574117, + "grad_norm": 9.346823692321777, + "learning_rate": 9.975897435897436e-06, + "loss": 0.4533, + "step": 550 + }, + { + "epoch": 0.24318037640093043, + "grad_norm": 15.096284866333008, + "learning_rate": 9.963076923076925e-06, + "loss": 0.4372, + "step": 575 + }, + { + "epoch": 0.2537534362444491, + "grad_norm": 9.635919570922852, + "learning_rate": 9.950256410256412e-06, + "loss": 0.4149, + "step": 600 + }, + { + "epoch": 0.2643264960879679, + "grad_norm": 12.7789945602417, + "learning_rate": 9.937435897435898e-06, + "loss": 0.4219, + "step": 625 + }, + { + "epoch": 0.27489955593148657, + "grad_norm": 10.612743377685547, + "learning_rate": 9.924615384615385e-06, + "loss": 0.4103, + "step": 650 + }, + { + "epoch": 0.28547261577500527, + "grad_norm": 9.644041061401367, + "learning_rate": 9.911794871794874e-06, + "loss": 0.3747, + "step": 675 + }, + { + "epoch": 0.296045675618524, + "grad_norm": 12.278302192687988, + "learning_rate": 9.89897435897436e-06, + "loss": 0.3818, + "step": 700 + }, + { + "epoch": 0.3066187354620427, + "grad_norm": 11.676104545593262, + "learning_rate": 9.886153846153846e-06, + "loss": 0.3964, + "step": 725 + }, + { + "epoch": 0.3171917953055614, + "grad_norm": 9.235854148864746, + "learning_rate": 9.873333333333334e-06, + "loss": 0.3954, + "step": 750 + }, + { + "epoch": 0.32776485514908016, + "grad_norm": 10.16644287109375, + "learning_rate": 9.860512820512821e-06, + "loss": 0.4021, + "step": 775 + }, + { + "epoch": 0.33833791499259885, + "grad_norm": 11.959391593933105, + "learning_rate": 9.847692307692308e-06, + "loss": 0.4094, + "step": 800 + }, + { + "epoch": 0.34891097483611755, + "grad_norm": 9.894736289978027, + "learning_rate": 9.834871794871795e-06, + "loss": 0.386, + "step": 825 + }, + { + "epoch": 0.3594840346796363, + "grad_norm": 10.717971801757812, + "learning_rate": 9.822051282051283e-06, + "loss": 0.3786, + "step": 850 + }, + { + "epoch": 0.370057094523155, + "grad_norm": 9.43459701538086, + "learning_rate": 9.80923076923077e-06, + "loss": 0.3558, + "step": 875 + }, + { + "epoch": 0.3806301543666737, + "grad_norm": 8.999226570129395, + "learning_rate": 9.796410256410257e-06, + "loss": 0.4059, + "step": 900 + }, + { + "epoch": 0.39120321421019244, + "grad_norm": 9.803775787353516, + "learning_rate": 9.783589743589744e-06, + "loss": 0.3665, + "step": 925 + }, + { + "epoch": 0.40177627405371114, + "grad_norm": 6.630070209503174, + "learning_rate": 9.770769230769232e-06, + "loss": 0.3913, + "step": 950 + }, + { + "epoch": 0.4123493338972299, + "grad_norm": 8.892091751098633, + "learning_rate": 9.757948717948719e-06, + "loss": 0.4345, + "step": 975 + }, + { + "epoch": 0.4229223937407486, + "grad_norm": 11.313458442687988, + "learning_rate": 9.745128205128206e-06, + "loss": 0.3523, + "step": 1000 + }, + { + "epoch": 0.4229223937407486, + "eval_loss": 0.3129430413246155, + "eval_runtime": 466.842, + "eval_samples_per_second": 7.799, + "eval_steps_per_second": 0.977, + "eval_wer": 0.23648429161446008, + "step": 1000 + }, + { + "epoch": 0.4334954535842673, + "grad_norm": 10.663793563842773, + "learning_rate": 9.732307692307693e-06, + "loss": 0.3711, + "step": 1025 + }, + { + "epoch": 0.444068513427786, + "grad_norm": 8.810164451599121, + "learning_rate": 9.71948717948718e-06, + "loss": 0.355, + "step": 1050 + }, + { + "epoch": 0.4546415732713047, + "grad_norm": 17.50876235961914, + "learning_rate": 9.706666666666668e-06, + "loss": 0.3652, + "step": 1075 + }, + { + "epoch": 0.4652146331148234, + "grad_norm": 6.720116138458252, + "learning_rate": 9.693846153846155e-06, + "loss": 0.3415, + "step": 1100 + }, + { + "epoch": 0.47578769295834217, + "grad_norm": 6.864238739013672, + "learning_rate": 9.681025641025642e-06, + "loss": 0.3203, + "step": 1125 + }, + { + "epoch": 0.48636075280186086, + "grad_norm": 10.059639930725098, + "learning_rate": 9.668205128205129e-06, + "loss": 0.4017, + "step": 1150 + }, + { + "epoch": 0.49693381264537956, + "grad_norm": 8.964330673217773, + "learning_rate": 9.655384615384617e-06, + "loss": 0.3089, + "step": 1175 + }, + { + "epoch": 0.5075068724888983, + "grad_norm": 11.160856246948242, + "learning_rate": 9.642564102564104e-06, + "loss": 0.357, + "step": 1200 + }, + { + "epoch": 0.518079932332417, + "grad_norm": 8.594799041748047, + "learning_rate": 9.62974358974359e-06, + "loss": 0.3391, + "step": 1225 + }, + { + "epoch": 0.5286529921759358, + "grad_norm": 7.494054794311523, + "learning_rate": 9.616923076923077e-06, + "loss": 0.3315, + "step": 1250 + }, + { + "epoch": 0.5392260520194544, + "grad_norm": 7.793934345245361, + "learning_rate": 9.604102564102566e-06, + "loss": 0.3226, + "step": 1275 + }, + { + "epoch": 0.5497991118629731, + "grad_norm": 7.225709438323975, + "learning_rate": 9.591282051282053e-06, + "loss": 0.3317, + "step": 1300 + }, + { + "epoch": 0.5603721717064919, + "grad_norm": 6.296034812927246, + "learning_rate": 9.578461538461538e-06, + "loss": 0.3302, + "step": 1325 + }, + { + "epoch": 0.5709452315500105, + "grad_norm": 11.198543548583984, + "learning_rate": 9.565641025641026e-06, + "loss": 0.3038, + "step": 1350 + }, + { + "epoch": 0.5815182913935293, + "grad_norm": 6.4023542404174805, + "learning_rate": 9.552820512820513e-06, + "loss": 0.3511, + "step": 1375 + }, + { + "epoch": 0.592091351237048, + "grad_norm": 8.326896667480469, + "learning_rate": 9.54e-06, + "loss": 0.3257, + "step": 1400 + }, + { + "epoch": 0.6026644110805667, + "grad_norm": 7.079158306121826, + "learning_rate": 9.527179487179487e-06, + "loss": 0.335, + "step": 1425 + }, + { + "epoch": 0.6132374709240854, + "grad_norm": 8.422740936279297, + "learning_rate": 9.514358974358975e-06, + "loss": 0.3212, + "step": 1450 + }, + { + "epoch": 0.6238105307676042, + "grad_norm": 8.642356872558594, + "learning_rate": 9.501538461538462e-06, + "loss": 0.314, + "step": 1475 + }, + { + "epoch": 0.6343835906111228, + "grad_norm": 8.129634857177734, + "learning_rate": 9.488717948717949e-06, + "loss": 0.2889, + "step": 1500 + }, + { + "epoch": 0.6449566504546416, + "grad_norm": 7.115053653717041, + "learning_rate": 9.475897435897436e-06, + "loss": 0.3073, + "step": 1525 + }, + { + "epoch": 0.6555297102981603, + "grad_norm": 8.397727966308594, + "learning_rate": 9.463076923076924e-06, + "loss": 0.3212, + "step": 1550 + }, + { + "epoch": 0.666102770141679, + "grad_norm": 9.354975700378418, + "learning_rate": 9.450256410256411e-06, + "loss": 0.3285, + "step": 1575 + }, + { + "epoch": 0.6766758299851977, + "grad_norm": 7.693446159362793, + "learning_rate": 9.437435897435898e-06, + "loss": 0.3382, + "step": 1600 + }, + { + "epoch": 0.6872488898287165, + "grad_norm": 10.583995819091797, + "learning_rate": 9.424615384615385e-06, + "loss": 0.3198, + "step": 1625 + }, + { + "epoch": 0.6978219496722351, + "grad_norm": 9.70487117767334, + "learning_rate": 9.411794871794872e-06, + "loss": 0.2968, + "step": 1650 + }, + { + "epoch": 0.7083950095157538, + "grad_norm": 8.256582260131836, + "learning_rate": 9.39897435897436e-06, + "loss": 0.2994, + "step": 1675 + }, + { + "epoch": 0.7189680693592726, + "grad_norm": 8.840137481689453, + "learning_rate": 9.386153846153847e-06, + "loss": 0.347, + "step": 1700 + }, + { + "epoch": 0.7295411292027912, + "grad_norm": 6.581663608551025, + "learning_rate": 9.373333333333334e-06, + "loss": 0.3436, + "step": 1725 + }, + { + "epoch": 0.74011418904631, + "grad_norm": 10.300860404968262, + "learning_rate": 9.36051282051282e-06, + "loss": 0.2989, + "step": 1750 + }, + { + "epoch": 0.7506872488898287, + "grad_norm": 8.777045249938965, + "learning_rate": 9.34769230769231e-06, + "loss": 0.3212, + "step": 1775 + }, + { + "epoch": 0.7612603087333474, + "grad_norm": 8.94032096862793, + "learning_rate": 9.334871794871796e-06, + "loss": 0.2951, + "step": 1800 + }, + { + "epoch": 0.7718333685768661, + "grad_norm": 6.139760971069336, + "learning_rate": 9.322051282051283e-06, + "loss": 0.2962, + "step": 1825 + }, + { + "epoch": 0.7824064284203849, + "grad_norm": 6.843733310699463, + "learning_rate": 9.30923076923077e-06, + "loss": 0.3324, + "step": 1850 + }, + { + "epoch": 0.7929794882639035, + "grad_norm": 8.275738716125488, + "learning_rate": 9.296410256410258e-06, + "loss": 0.289, + "step": 1875 + }, + { + "epoch": 0.8035525481074223, + "grad_norm": 7.651088237762451, + "learning_rate": 9.283589743589745e-06, + "loss": 0.3563, + "step": 1900 + }, + { + "epoch": 0.814125607950941, + "grad_norm": 9.584896087646484, + "learning_rate": 9.270769230769232e-06, + "loss": 0.3009, + "step": 1925 + }, + { + "epoch": 0.8246986677944598, + "grad_norm": 8.920114517211914, + "learning_rate": 9.257948717948719e-06, + "loss": 0.3229, + "step": 1950 + }, + { + "epoch": 0.8352717276379784, + "grad_norm": 8.181923866271973, + "learning_rate": 9.245128205128206e-06, + "loss": 0.2707, + "step": 1975 + }, + { + "epoch": 0.8458447874814972, + "grad_norm": 6.475239276885986, + "learning_rate": 9.232307692307692e-06, + "loss": 0.3002, + "step": 2000 + }, + { + "epoch": 0.8458447874814972, + "eval_loss": 0.2391187697649002, + "eval_runtime": 464.6331, + "eval_samples_per_second": 7.836, + "eval_steps_per_second": 0.981, + "eval_wer": 0.19638962364842916, + "step": 2000 + }, + { + "epoch": 0.8564178473250159, + "grad_norm": 6.2346930503845215, + "learning_rate": 9.21948717948718e-06, + "loss": 0.3355, + "step": 2025 + }, + { + "epoch": 0.8669909071685346, + "grad_norm": 8.41283130645752, + "learning_rate": 9.207179487179488e-06, + "loss": 0.2924, + "step": 2050 + }, + { + "epoch": 0.8775639670120533, + "grad_norm": 10.027973175048828, + "learning_rate": 9.194358974358975e-06, + "loss": 0.2711, + "step": 2075 + }, + { + "epoch": 0.888137026855572, + "grad_norm": 6.886898040771484, + "learning_rate": 9.181538461538464e-06, + "loss": 0.2593, + "step": 2100 + }, + { + "epoch": 0.8987100866990907, + "grad_norm": 6.60000467300415, + "learning_rate": 9.168717948717949e-06, + "loss": 0.2818, + "step": 2125 + }, + { + "epoch": 0.9092831465426094, + "grad_norm": 8.06698989868164, + "learning_rate": 9.155897435897436e-06, + "loss": 0.2812, + "step": 2150 + }, + { + "epoch": 0.9198562063861282, + "grad_norm": 11.074329376220703, + "learning_rate": 9.143076923076924e-06, + "loss": 0.2717, + "step": 2175 + }, + { + "epoch": 0.9304292662296468, + "grad_norm": 7.394618511199951, + "learning_rate": 9.130256410256411e-06, + "loss": 0.2752, + "step": 2200 + }, + { + "epoch": 0.9410023260731656, + "grad_norm": 9.914189338684082, + "learning_rate": 9.117435897435898e-06, + "loss": 0.2842, + "step": 2225 + }, + { + "epoch": 0.9515753859166843, + "grad_norm": 13.050373077392578, + "learning_rate": 9.104615384615385e-06, + "loss": 0.2681, + "step": 2250 + }, + { + "epoch": 0.962148445760203, + "grad_norm": 9.03541088104248, + "learning_rate": 9.091794871794873e-06, + "loss": 0.2969, + "step": 2275 + }, + { + "epoch": 0.9727215056037217, + "grad_norm": 9.255524635314941, + "learning_rate": 9.07897435897436e-06, + "loss": 0.2558, + "step": 2300 + }, + { + "epoch": 0.9832945654472405, + "grad_norm": 9.215723037719727, + "learning_rate": 9.066153846153847e-06, + "loss": 0.2648, + "step": 2325 + }, + { + "epoch": 0.9938676252907591, + "grad_norm": 10.28739070892334, + "learning_rate": 9.053333333333334e-06, + "loss": 0.2809, + "step": 2350 + }, + { + "epoch": 1.0044406851342778, + "grad_norm": 5.855769157409668, + "learning_rate": 9.04051282051282e-06, + "loss": 0.2384, + "step": 2375 + }, + { + "epoch": 1.0150137449777965, + "grad_norm": 5.34947395324707, + "learning_rate": 9.027692307692309e-06, + "loss": 0.2049, + "step": 2400 + }, + { + "epoch": 1.0255868048213153, + "grad_norm": 6.5480170249938965, + "learning_rate": 9.014871794871796e-06, + "loss": 0.1907, + "step": 2425 + }, + { + "epoch": 1.036159864664834, + "grad_norm": 8.17578125, + "learning_rate": 9.002051282051283e-06, + "loss": 0.1948, + "step": 2450 + }, + { + "epoch": 1.0467329245083528, + "grad_norm": 6.4552083015441895, + "learning_rate": 8.98923076923077e-06, + "loss": 0.1703, + "step": 2475 + }, + { + "epoch": 1.0573059843518715, + "grad_norm": 6.099579811096191, + "learning_rate": 8.976410256410258e-06, + "loss": 0.1765, + "step": 2500 + }, + { + "epoch": 1.06787904419539, + "grad_norm": 7.8965301513671875, + "learning_rate": 8.963589743589745e-06, + "loss": 0.1849, + "step": 2525 + }, + { + "epoch": 1.0784521040389088, + "grad_norm": 6.395140171051025, + "learning_rate": 8.950769230769232e-06, + "loss": 0.2055, + "step": 2550 + }, + { + "epoch": 1.0890251638824275, + "grad_norm": 5.077881813049316, + "learning_rate": 8.937948717948718e-06, + "loss": 0.1857, + "step": 2575 + }, + { + "epoch": 1.0995982237259463, + "grad_norm": 6.063033103942871, + "learning_rate": 8.925128205128207e-06, + "loss": 0.1629, + "step": 2600 + }, + { + "epoch": 1.110171283569465, + "grad_norm": 5.514612197875977, + "learning_rate": 8.912307692307694e-06, + "loss": 0.1735, + "step": 2625 + }, + { + "epoch": 1.1207443434129838, + "grad_norm": 6.513278961181641, + "learning_rate": 8.89948717948718e-06, + "loss": 0.174, + "step": 2650 + }, + { + "epoch": 1.1313174032565025, + "grad_norm": 5.930003643035889, + "learning_rate": 8.886666666666667e-06, + "loss": 0.183, + "step": 2675 + }, + { + "epoch": 1.141890463100021, + "grad_norm": 6.062052249908447, + "learning_rate": 8.873846153846156e-06, + "loss": 0.2031, + "step": 2700 + }, + { + "epoch": 1.1524635229435398, + "grad_norm": 6.619091987609863, + "learning_rate": 8.861025641025641e-06, + "loss": 0.1669, + "step": 2725 + }, + { + "epoch": 1.1630365827870586, + "grad_norm": 6.383360385894775, + "learning_rate": 8.848205128205128e-06, + "loss": 0.1698, + "step": 2750 + }, + { + "epoch": 1.1736096426305773, + "grad_norm": 5.248838901519775, + "learning_rate": 8.835384615384616e-06, + "loss": 0.1783, + "step": 2775 + }, + { + "epoch": 1.184182702474096, + "grad_norm": 4.1173319816589355, + "learning_rate": 8.822564102564103e-06, + "loss": 0.1689, + "step": 2800 + }, + { + "epoch": 1.1947557623176146, + "grad_norm": 4.528750419616699, + "learning_rate": 8.80974358974359e-06, + "loss": 0.1714, + "step": 2825 + }, + { + "epoch": 1.2053288221611334, + "grad_norm": 7.49066686630249, + "learning_rate": 8.796923076923077e-06, + "loss": 0.1567, + "step": 2850 + }, + { + "epoch": 1.215901882004652, + "grad_norm": 6.05342960357666, + "learning_rate": 8.784102564102565e-06, + "loss": 0.1675, + "step": 2875 + }, + { + "epoch": 1.2264749418481709, + "grad_norm": 6.280456066131592, + "learning_rate": 8.771282051282052e-06, + "loss": 0.1885, + "step": 2900 + }, + { + "epoch": 1.2370480016916896, + "grad_norm": 6.7942657470703125, + "learning_rate": 8.758461538461539e-06, + "loss": 0.1725, + "step": 2925 + }, + { + "epoch": 1.2476210615352084, + "grad_norm": 5.41685152053833, + "learning_rate": 8.745641025641026e-06, + "loss": 0.178, + "step": 2950 + }, + { + "epoch": 1.258194121378727, + "grad_norm": 7.254810333251953, + "learning_rate": 8.732820512820513e-06, + "loss": 0.1788, + "step": 2975 + }, + { + "epoch": 1.2687671812222456, + "grad_norm": 5.75822639465332, + "learning_rate": 8.720000000000001e-06, + "loss": 0.1718, + "step": 3000 + }, + { + "epoch": 1.2687671812222456, + "eval_loss": 0.20488913357257843, + "eval_runtime": 470.0703, + "eval_samples_per_second": 7.746, + "eval_steps_per_second": 0.97, + "eval_wer": 0.16594737574829457, + "step": 3000 + }, + { + "epoch": 1.2793402410657644, + "grad_norm": 4.381278038024902, + "learning_rate": 8.707179487179488e-06, + "loss": 0.1656, + "step": 3025 + }, + { + "epoch": 1.2899133009092831, + "grad_norm": 6.3774919509887695, + "learning_rate": 8.694358974358975e-06, + "loss": 0.1987, + "step": 3050 + }, + { + "epoch": 1.3004863607528019, + "grad_norm": 5.695129871368408, + "learning_rate": 8.681538461538462e-06, + "loss": 0.1602, + "step": 3075 + }, + { + "epoch": 1.3110594205963206, + "grad_norm": 5.8820013999938965, + "learning_rate": 8.66871794871795e-06, + "loss": 0.1788, + "step": 3100 + }, + { + "epoch": 1.3216324804398392, + "grad_norm": 6.164638519287109, + "learning_rate": 8.655897435897437e-06, + "loss": 0.2075, + "step": 3125 + }, + { + "epoch": 1.3322055402833581, + "grad_norm": 6.681603908538818, + "learning_rate": 8.643076923076924e-06, + "loss": 0.1867, + "step": 3150 + }, + { + "epoch": 1.3427786001268767, + "grad_norm": 4.936852931976318, + "learning_rate": 8.63025641025641e-06, + "loss": 0.1917, + "step": 3175 + }, + { + "epoch": 1.3533516599703954, + "grad_norm": 5.827350616455078, + "learning_rate": 8.6174358974359e-06, + "loss": 0.1898, + "step": 3200 + }, + { + "epoch": 1.3639247198139142, + "grad_norm": 6.6198272705078125, + "learning_rate": 8.604615384615386e-06, + "loss": 0.1659, + "step": 3225 + }, + { + "epoch": 1.374497779657433, + "grad_norm": 8.098386764526367, + "learning_rate": 8.591794871794873e-06, + "loss": 0.1793, + "step": 3250 + }, + { + "epoch": 1.3850708395009517, + "grad_norm": 7.836790561676025, + "learning_rate": 8.57897435897436e-06, + "loss": 0.1551, + "step": 3275 + }, + { + "epoch": 1.3956438993444702, + "grad_norm": 4.521047115325928, + "learning_rate": 8.566153846153848e-06, + "loss": 0.156, + "step": 3300 + }, + { + "epoch": 1.406216959187989, + "grad_norm": 6.938994407653809, + "learning_rate": 8.553333333333333e-06, + "loss": 0.1722, + "step": 3325 + }, + { + "epoch": 1.4167900190315077, + "grad_norm": 7.659555435180664, + "learning_rate": 8.54051282051282e-06, + "loss": 0.1781, + "step": 3350 + }, + { + "epoch": 1.4273630788750264, + "grad_norm": 5.197014808654785, + "learning_rate": 8.527692307692309e-06, + "loss": 0.1606, + "step": 3375 + }, + { + "epoch": 1.4379361387185452, + "grad_norm": 4.832046031951904, + "learning_rate": 8.514871794871795e-06, + "loss": 0.1579, + "step": 3400 + }, + { + "epoch": 1.4485091985620637, + "grad_norm": 7.802398681640625, + "learning_rate": 8.502051282051282e-06, + "loss": 0.1922, + "step": 3425 + }, + { + "epoch": 1.4590822584055827, + "grad_norm": 6.803133487701416, + "learning_rate": 8.489230769230769e-06, + "loss": 0.2039, + "step": 3450 + }, + { + "epoch": 1.4696553182491012, + "grad_norm": 6.156517505645752, + "learning_rate": 8.476410256410258e-06, + "loss": 0.1723, + "step": 3475 + }, + { + "epoch": 1.48022837809262, + "grad_norm": 6.290517330169678, + "learning_rate": 8.463589743589744e-06, + "loss": 0.1739, + "step": 3500 + }, + { + "epoch": 1.4908014379361387, + "grad_norm": 7.008253574371338, + "learning_rate": 8.450769230769231e-06, + "loss": 0.1465, + "step": 3525 + }, + { + "epoch": 1.5013744977796575, + "grad_norm": 5.356746673583984, + "learning_rate": 8.437948717948718e-06, + "loss": 0.185, + "step": 3550 + }, + { + "epoch": 1.5119475576231762, + "grad_norm": 7.195596694946289, + "learning_rate": 8.425128205128205e-06, + "loss": 0.1968, + "step": 3575 + }, + { + "epoch": 1.5225206174666948, + "grad_norm": 7.071013927459717, + "learning_rate": 8.412307692307693e-06, + "loss": 0.1722, + "step": 3600 + }, + { + "epoch": 1.5330936773102137, + "grad_norm": 6.704313278198242, + "learning_rate": 8.39948717948718e-06, + "loss": 0.1763, + "step": 3625 + }, + { + "epoch": 1.5436667371537323, + "grad_norm": 3.831084966659546, + "learning_rate": 8.386666666666667e-06, + "loss": 0.1795, + "step": 3650 + }, + { + "epoch": 1.554239796997251, + "grad_norm": 5.931239604949951, + "learning_rate": 8.373846153846154e-06, + "loss": 0.1717, + "step": 3675 + }, + { + "epoch": 1.5648128568407698, + "grad_norm": 8.750226974487305, + "learning_rate": 8.361025641025642e-06, + "loss": 0.1679, + "step": 3700 + }, + { + "epoch": 1.5753859166842883, + "grad_norm": 5.434117317199707, + "learning_rate": 8.34820512820513e-06, + "loss": 0.1343, + "step": 3725 + }, + { + "epoch": 1.5859589765278073, + "grad_norm": 6.517702102661133, + "learning_rate": 8.335384615384616e-06, + "loss": 0.1641, + "step": 3750 + }, + { + "epoch": 1.5965320363713258, + "grad_norm": 7.22881555557251, + "learning_rate": 8.322564102564103e-06, + "loss": 0.152, + "step": 3775 + }, + { + "epoch": 1.6071050962148445, + "grad_norm": 9.106505393981934, + "learning_rate": 8.309743589743591e-06, + "loss": 0.168, + "step": 3800 + }, + { + "epoch": 1.6176781560583633, + "grad_norm": 6.742056369781494, + "learning_rate": 8.296923076923078e-06, + "loss": 0.1624, + "step": 3825 + }, + { + "epoch": 1.628251215901882, + "grad_norm": 5.521613597869873, + "learning_rate": 8.284102564102565e-06, + "loss": 0.1476, + "step": 3850 + }, + { + "epoch": 1.6388242757454008, + "grad_norm": 5.299131870269775, + "learning_rate": 8.271282051282052e-06, + "loss": 0.1576, + "step": 3875 + }, + { + "epoch": 1.6493973355889193, + "grad_norm": 3.890312671661377, + "learning_rate": 8.25846153846154e-06, + "loss": 0.162, + "step": 3900 + }, + { + "epoch": 1.6599703954324383, + "grad_norm": 5.404117584228516, + "learning_rate": 8.245641025641027e-06, + "loss": 0.1622, + "step": 3925 + }, + { + "epoch": 1.6705434552759568, + "grad_norm": 8.395030975341797, + "learning_rate": 8.232820512820512e-06, + "loss": 0.1473, + "step": 3950 + }, + { + "epoch": 1.6811165151194756, + "grad_norm": 7.276181221008301, + "learning_rate": 8.220000000000001e-06, + "loss": 0.1538, + "step": 3975 + }, + { + "epoch": 1.6916895749629943, + "grad_norm": 8.637175559997559, + "learning_rate": 8.207179487179488e-06, + "loss": 0.1537, + "step": 4000 + }, + { + "epoch": 1.6916895749629943, + "eval_loss": 0.1817464828491211, + "eval_runtime": 469.7808, + "eval_samples_per_second": 7.75, + "eval_steps_per_second": 0.971, + "eval_wer": 0.15156155738085295, + "step": 4000 + }, + { + "epoch": 1.7022626348065129, + "grad_norm": 2.9336845874786377, + "learning_rate": 8.194358974358975e-06, + "loss": 0.1789, + "step": 4025 + }, + { + "epoch": 1.7128356946500318, + "grad_norm": 4.7298688888549805, + "learning_rate": 8.181538461538461e-06, + "loss": 0.1631, + "step": 4050 + }, + { + "epoch": 1.7234087544935504, + "grad_norm": 6.827828407287598, + "learning_rate": 8.16871794871795e-06, + "loss": 0.1599, + "step": 4075 + }, + { + "epoch": 1.733981814337069, + "grad_norm": 5.460879325866699, + "learning_rate": 8.155897435897437e-06, + "loss": 0.1836, + "step": 4100 + }, + { + "epoch": 1.7445548741805879, + "grad_norm": 6.356947898864746, + "learning_rate": 8.143076923076924e-06, + "loss": 0.1533, + "step": 4125 + }, + { + "epoch": 1.7551279340241066, + "grad_norm": 7.573319435119629, + "learning_rate": 8.13025641025641e-06, + "loss": 0.1437, + "step": 4150 + }, + { + "epoch": 1.7657009938676254, + "grad_norm": 4.901601791381836, + "learning_rate": 8.117435897435897e-06, + "loss": 0.1371, + "step": 4175 + }, + { + "epoch": 1.7762740537111439, + "grad_norm": 6.051869869232178, + "learning_rate": 8.104615384615386e-06, + "loss": 0.1883, + "step": 4200 + }, + { + "epoch": 1.7868471135546629, + "grad_norm": 7.222878932952881, + "learning_rate": 8.091794871794873e-06, + "loss": 0.1614, + "step": 4225 + }, + { + "epoch": 1.7974201733981814, + "grad_norm": 6.554632663726807, + "learning_rate": 8.07897435897436e-06, + "loss": 0.1629, + "step": 4250 + }, + { + "epoch": 1.8079932332417001, + "grad_norm": 6.802252292633057, + "learning_rate": 8.066153846153846e-06, + "loss": 0.1526, + "step": 4275 + }, + { + "epoch": 1.8185662930852189, + "grad_norm": 6.444076061248779, + "learning_rate": 8.053333333333335e-06, + "loss": 0.1549, + "step": 4300 + }, + { + "epoch": 1.8291393529287374, + "grad_norm": 6.49000358581543, + "learning_rate": 8.040512820512822e-06, + "loss": 0.1469, + "step": 4325 + }, + { + "epoch": 1.8397124127722564, + "grad_norm": 5.989566802978516, + "learning_rate": 8.027692307692308e-06, + "loss": 0.1587, + "step": 4350 + }, + { + "epoch": 1.850285472615775, + "grad_norm": 4.621600151062012, + "learning_rate": 8.014871794871795e-06, + "loss": 0.1394, + "step": 4375 + }, + { + "epoch": 1.8608585324592937, + "grad_norm": 7.434966087341309, + "learning_rate": 8.002051282051284e-06, + "loss": 0.1484, + "step": 4400 + }, + { + "epoch": 1.8714315923028124, + "grad_norm": 4.955178737640381, + "learning_rate": 7.98923076923077e-06, + "loss": 0.1304, + "step": 4425 + }, + { + "epoch": 1.8820046521463312, + "grad_norm": 6.121462821960449, + "learning_rate": 7.976410256410257e-06, + "loss": 0.1676, + "step": 4450 + }, + { + "epoch": 1.89257771198985, + "grad_norm": 5.908482551574707, + "learning_rate": 7.963589743589744e-06, + "loss": 0.1463, + "step": 4475 + }, + { + "epoch": 1.9031507718333684, + "grad_norm": 4.962184906005859, + "learning_rate": 7.950769230769233e-06, + "loss": 0.1494, + "step": 4500 + }, + { + "epoch": 1.9137238316768874, + "grad_norm": 6.131477355957031, + "learning_rate": 7.93794871794872e-06, + "loss": 0.146, + "step": 4525 + }, + { + "epoch": 1.924296891520406, + "grad_norm": 7.316920280456543, + "learning_rate": 7.925128205128205e-06, + "loss": 0.1451, + "step": 4550 + }, + { + "epoch": 1.9348699513639247, + "grad_norm": 6.914496898651123, + "learning_rate": 7.912307692307693e-06, + "loss": 0.1263, + "step": 4575 + }, + { + "epoch": 1.9454430112074435, + "grad_norm": 6.739365100860596, + "learning_rate": 7.89948717948718e-06, + "loss": 0.1467, + "step": 4600 + }, + { + "epoch": 1.9560160710509622, + "grad_norm": 7.861861705780029, + "learning_rate": 7.886666666666667e-06, + "loss": 0.1582, + "step": 4625 + }, + { + "epoch": 1.966589130894481, + "grad_norm": 8.007691383361816, + "learning_rate": 7.873846153846154e-06, + "loss": 0.1569, + "step": 4650 + }, + { + "epoch": 1.9771621907379995, + "grad_norm": 9.34388542175293, + "learning_rate": 7.861025641025642e-06, + "loss": 0.1419, + "step": 4675 + }, + { + "epoch": 1.9877352505815185, + "grad_norm": 7.265613555908203, + "learning_rate": 7.848205128205129e-06, + "loss": 0.1436, + "step": 4700 + }, + { + "epoch": 1.998308310425037, + "grad_norm": 5.50709867477417, + "learning_rate": 7.835384615384616e-06, + "loss": 0.1467, + "step": 4725 + }, + { + "epoch": 2.0088813702685555, + "grad_norm": 4.5442962646484375, + "learning_rate": 7.822564102564103e-06, + "loss": 0.0813, + "step": 4750 + }, + { + "epoch": 2.0194544301120745, + "grad_norm": 6.023507595062256, + "learning_rate": 7.80974358974359e-06, + "loss": 0.0902, + "step": 4775 + }, + { + "epoch": 2.030027489955593, + "grad_norm": 2.5194263458251953, + "learning_rate": 7.796923076923078e-06, + "loss": 0.092, + "step": 4800 + }, + { + "epoch": 2.040600549799112, + "grad_norm": 5.59611701965332, + "learning_rate": 7.784102564102565e-06, + "loss": 0.0888, + "step": 4825 + }, + { + "epoch": 2.0511736096426305, + "grad_norm": 5.128935813903809, + "learning_rate": 7.771282051282052e-06, + "loss": 0.0886, + "step": 4850 + }, + { + "epoch": 2.0617466694861495, + "grad_norm": 4.068778038024902, + "learning_rate": 7.758461538461538e-06, + "loss": 0.0754, + "step": 4875 + }, + { + "epoch": 2.072319729329668, + "grad_norm": 3.985748052597046, + "learning_rate": 7.745641025641027e-06, + "loss": 0.0882, + "step": 4900 + }, + { + "epoch": 2.0828927891731865, + "grad_norm": 3.7527148723602295, + "learning_rate": 7.732820512820514e-06, + "loss": 0.0752, + "step": 4925 + }, + { + "epoch": 2.0934658490167055, + "grad_norm": 6.429442405700684, + "learning_rate": 7.72e-06, + "loss": 0.0999, + "step": 4950 + }, + { + "epoch": 2.104038908860224, + "grad_norm": 3.7136788368225098, + "learning_rate": 7.707179487179487e-06, + "loss": 0.0745, + "step": 4975 + }, + { + "epoch": 2.114611968703743, + "grad_norm": 3.5466103553771973, + "learning_rate": 7.694358974358976e-06, + "loss": 0.0807, + "step": 5000 + }, + { + "epoch": 2.114611968703743, + "eval_loss": 0.16425499320030212, + "eval_runtime": 474.1617, + "eval_samples_per_second": 7.679, + "eval_steps_per_second": 0.962, + "eval_wer": 0.14989094621560164, + "step": 5000 + }, + { + "epoch": 2.1251850285472615, + "grad_norm": 4.999958515167236, + "learning_rate": 7.681538461538463e-06, + "loss": 0.0804, + "step": 5025 + }, + { + "epoch": 2.13575808839078, + "grad_norm": 5.525317668914795, + "learning_rate": 7.66871794871795e-06, + "loss": 0.1002, + "step": 5050 + }, + { + "epoch": 2.146331148234299, + "grad_norm": 4.599583148956299, + "learning_rate": 7.655897435897436e-06, + "loss": 0.0868, + "step": 5075 + }, + { + "epoch": 2.1569042080778176, + "grad_norm": 5.12149715423584, + "learning_rate": 7.643076923076925e-06, + "loss": 0.0937, + "step": 5100 + }, + { + "epoch": 2.1674772679213365, + "grad_norm": 3.3281843662261963, + "learning_rate": 7.630256410256412e-06, + "loss": 0.0748, + "step": 5125 + }, + { + "epoch": 2.178050327764855, + "grad_norm": 5.142365455627441, + "learning_rate": 7.617435897435898e-06, + "loss": 0.0931, + "step": 5150 + }, + { + "epoch": 2.188623387608374, + "grad_norm": 3.4713194370269775, + "learning_rate": 7.604615384615385e-06, + "loss": 0.0747, + "step": 5175 + }, + { + "epoch": 2.1991964474518926, + "grad_norm": 3.9539084434509277, + "learning_rate": 7.591794871794872e-06, + "loss": 0.0917, + "step": 5200 + }, + { + "epoch": 2.209769507295411, + "grad_norm": 6.873681545257568, + "learning_rate": 7.578974358974359e-06, + "loss": 0.0842, + "step": 5225 + }, + { + "epoch": 2.22034256713893, + "grad_norm": 3.6090524196624756, + "learning_rate": 7.566153846153847e-06, + "loss": 0.0779, + "step": 5250 + }, + { + "epoch": 2.2309156269824486, + "grad_norm": 4.458488464355469, + "learning_rate": 7.553333333333334e-06, + "loss": 0.0854, + "step": 5275 + }, + { + "epoch": 2.2414886868259676, + "grad_norm": 4.1330718994140625, + "learning_rate": 7.540512820512821e-06, + "loss": 0.0674, + "step": 5300 + }, + { + "epoch": 2.252061746669486, + "grad_norm": 3.256821393966675, + "learning_rate": 7.527692307692308e-06, + "loss": 0.0787, + "step": 5325 + }, + { + "epoch": 2.262634806513005, + "grad_norm": 3.738051176071167, + "learning_rate": 7.514871794871795e-06, + "loss": 0.0844, + "step": 5350 + }, + { + "epoch": 2.2732078663565236, + "grad_norm": 6.081026554107666, + "learning_rate": 7.5020512820512826e-06, + "loss": 0.0794, + "step": 5375 + }, + { + "epoch": 2.283780926200042, + "grad_norm": 4.990882396697998, + "learning_rate": 7.489230769230769e-06, + "loss": 0.0806, + "step": 5400 + }, + { + "epoch": 2.294353986043561, + "grad_norm": 3.432616710662842, + "learning_rate": 7.476410256410257e-06, + "loss": 0.0961, + "step": 5425 + }, + { + "epoch": 2.3049270458870796, + "grad_norm": 9.63456916809082, + "learning_rate": 7.463589743589744e-06, + "loss": 0.0801, + "step": 5450 + }, + { + "epoch": 2.3155001057305986, + "grad_norm": 4.464268207550049, + "learning_rate": 7.4507692307692316e-06, + "loss": 0.0796, + "step": 5475 + }, + { + "epoch": 2.326073165574117, + "grad_norm": 3.0448217391967773, + "learning_rate": 7.437948717948718e-06, + "loss": 0.0718, + "step": 5500 + }, + { + "epoch": 2.3366462254176357, + "grad_norm": 6.034780979156494, + "learning_rate": 7.425128205128206e-06, + "loss": 0.1072, + "step": 5525 + }, + { + "epoch": 2.3472192852611546, + "grad_norm": 5.285013675689697, + "learning_rate": 7.412307692307693e-06, + "loss": 0.094, + "step": 5550 + }, + { + "epoch": 2.357792345104673, + "grad_norm": 4.443994998931885, + "learning_rate": 7.3994871794871806e-06, + "loss": 0.0897, + "step": 5575 + }, + { + "epoch": 2.368365404948192, + "grad_norm": 6.323433876037598, + "learning_rate": 7.386666666666667e-06, + "loss": 0.0774, + "step": 5600 + }, + { + "epoch": 2.3789384647917107, + "grad_norm": 7.71209192276001, + "learning_rate": 7.373846153846155e-06, + "loss": 0.097, + "step": 5625 + }, + { + "epoch": 2.389511524635229, + "grad_norm": 2.249687910079956, + "learning_rate": 7.361025641025642e-06, + "loss": 0.1082, + "step": 5650 + }, + { + "epoch": 2.400084584478748, + "grad_norm": 4.210668087005615, + "learning_rate": 7.3482051282051295e-06, + "loss": 0.0649, + "step": 5675 + }, + { + "epoch": 2.4106576443222667, + "grad_norm": 5.31463098526001, + "learning_rate": 7.335384615384616e-06, + "loss": 0.0924, + "step": 5700 + }, + { + "epoch": 2.4212307041657857, + "grad_norm": 5.068974494934082, + "learning_rate": 7.322564102564104e-06, + "loss": 0.0783, + "step": 5725 + }, + { + "epoch": 2.431803764009304, + "grad_norm": 5.510648727416992, + "learning_rate": 7.309743589743591e-06, + "loss": 0.0791, + "step": 5750 + }, + { + "epoch": 2.442376823852823, + "grad_norm": 5.710975646972656, + "learning_rate": 7.296923076923077e-06, + "loss": 0.0842, + "step": 5775 + }, + { + "epoch": 2.4529498836963417, + "grad_norm": 4.2290215492248535, + "learning_rate": 7.2841025641025645e-06, + "loss": 0.0827, + "step": 5800 + }, + { + "epoch": 2.4635229435398607, + "grad_norm": 5.621830940246582, + "learning_rate": 7.271282051282051e-06, + "loss": 0.0809, + "step": 5825 + }, + { + "epoch": 2.474096003383379, + "grad_norm": 2.6628761291503906, + "learning_rate": 7.258461538461539e-06, + "loss": 0.0784, + "step": 5850 + }, + { + "epoch": 2.4846690632268977, + "grad_norm": 5.889407634735107, + "learning_rate": 7.245641025641026e-06, + "loss": 0.0908, + "step": 5875 + }, + { + "epoch": 2.4952421230704167, + "grad_norm": 4.617676258087158, + "learning_rate": 7.2328205128205135e-06, + "loss": 0.0883, + "step": 5900 + }, + { + "epoch": 2.5058151829139352, + "grad_norm": 4.988363265991211, + "learning_rate": 7.22e-06, + "loss": 0.1109, + "step": 5925 + }, + { + "epoch": 2.516388242757454, + "grad_norm": 5.262870788574219, + "learning_rate": 7.207179487179487e-06, + "loss": 0.1085, + "step": 5950 + }, + { + "epoch": 2.5269613026009727, + "grad_norm": 19.423404693603516, + "learning_rate": 7.194358974358975e-06, + "loss": 0.0868, + "step": 5975 + }, + { + "epoch": 2.5375343624444913, + "grad_norm": 6.528380870819092, + "learning_rate": 7.181538461538462e-06, + "loss": 0.089, + "step": 6000 + }, + { + "epoch": 2.5375343624444913, + "eval_loss": 0.15615449845790863, + "eval_runtime": 479.6175, + "eval_samples_per_second": 7.591, + "eval_steps_per_second": 0.951, + "eval_wer": 0.13476263399693722, + "step": 6000 + }, + { + "epoch": 2.5481074222880102, + "grad_norm": 4.123988628387451, + "learning_rate": 7.168717948717949e-06, + "loss": 0.085, + "step": 6025 + }, + { + "epoch": 2.5586804821315288, + "grad_norm": 3.1818017959594727, + "learning_rate": 7.155897435897436e-06, + "loss": 0.0906, + "step": 6050 + }, + { + "epoch": 2.5692535419750477, + "grad_norm": 5.67425537109375, + "learning_rate": 7.143076923076924e-06, + "loss": 0.0928, + "step": 6075 + }, + { + "epoch": 2.5798266018185663, + "grad_norm": 4.55906867980957, + "learning_rate": 7.130256410256411e-06, + "loss": 0.0732, + "step": 6100 + }, + { + "epoch": 2.590399661662085, + "grad_norm": 3.958922863006592, + "learning_rate": 7.117435897435898e-06, + "loss": 0.0867, + "step": 6125 + }, + { + "epoch": 2.6009727215056038, + "grad_norm": 4.0345306396484375, + "learning_rate": 7.104615384615385e-06, + "loss": 0.0636, + "step": 6150 + }, + { + "epoch": 2.6115457813491223, + "grad_norm": 6.317999839782715, + "learning_rate": 7.091794871794873e-06, + "loss": 0.0816, + "step": 6175 + }, + { + "epoch": 2.6221188411926413, + "grad_norm": 7.474951267242432, + "learning_rate": 7.07897435897436e-06, + "loss": 0.0953, + "step": 6200 + }, + { + "epoch": 2.63269190103616, + "grad_norm": 4.363856315612793, + "learning_rate": 7.066153846153847e-06, + "loss": 0.0931, + "step": 6225 + }, + { + "epoch": 2.6432649608796783, + "grad_norm": 6.427609920501709, + "learning_rate": 7.053333333333334e-06, + "loss": 0.0794, + "step": 6250 + }, + { + "epoch": 2.6538380207231973, + "grad_norm": 5.55384635925293, + "learning_rate": 7.040512820512822e-06, + "loss": 0.0892, + "step": 6275 + }, + { + "epoch": 2.6644110805667163, + "grad_norm": 5.477039813995361, + "learning_rate": 7.027692307692309e-06, + "loss": 0.0681, + "step": 6300 + }, + { + "epoch": 2.674984140410235, + "grad_norm": 4.115767955780029, + "learning_rate": 7.014871794871796e-06, + "loss": 0.0673, + "step": 6325 + }, + { + "epoch": 2.6855572002537533, + "grad_norm": 3.343461751937866, + "learning_rate": 7.002051282051283e-06, + "loss": 0.0651, + "step": 6350 + }, + { + "epoch": 2.6961302600972723, + "grad_norm": 5.022345542907715, + "learning_rate": 6.989230769230769e-06, + "loss": 0.0727, + "step": 6375 + }, + { + "epoch": 2.706703319940791, + "grad_norm": 5.0488128662109375, + "learning_rate": 6.976410256410257e-06, + "loss": 0.0808, + "step": 6400 + }, + { + "epoch": 2.71727637978431, + "grad_norm": 5.218719005584717, + "learning_rate": 6.963589743589744e-06, + "loss": 0.0955, + "step": 6425 + }, + { + "epoch": 2.7278494396278283, + "grad_norm": 4.660473346710205, + "learning_rate": 6.950769230769231e-06, + "loss": 0.0855, + "step": 6450 + }, + { + "epoch": 2.738422499471347, + "grad_norm": 5.673701286315918, + "learning_rate": 6.937948717948718e-06, + "loss": 0.0899, + "step": 6475 + }, + { + "epoch": 2.748995559314866, + "grad_norm": 4.193291187286377, + "learning_rate": 6.925128205128206e-06, + "loss": 0.0986, + "step": 6500 + }, + { + "epoch": 2.7595686191583844, + "grad_norm": 3.0398550033569336, + "learning_rate": 6.912307692307693e-06, + "loss": 0.0747, + "step": 6525 + }, + { + "epoch": 2.7701416790019033, + "grad_norm": 4.971275806427002, + "learning_rate": 6.899487179487179e-06, + "loss": 0.0829, + "step": 6550 + }, + { + "epoch": 2.780714738845422, + "grad_norm": 5.15726375579834, + "learning_rate": 6.886666666666667e-06, + "loss": 0.074, + "step": 6575 + }, + { + "epoch": 2.7912877986889404, + "grad_norm": 4.134614944458008, + "learning_rate": 6.873846153846154e-06, + "loss": 0.0764, + "step": 6600 + }, + { + "epoch": 2.8018608585324594, + "grad_norm": 3.367145538330078, + "learning_rate": 6.861025641025642e-06, + "loss": 0.085, + "step": 6625 + }, + { + "epoch": 2.812433918375978, + "grad_norm": 5.638070106506348, + "learning_rate": 6.848205128205128e-06, + "loss": 0.0687, + "step": 6650 + }, + { + "epoch": 2.823006978219497, + "grad_norm": 5.024541854858398, + "learning_rate": 6.835384615384616e-06, + "loss": 0.0778, + "step": 6675 + }, + { + "epoch": 2.8335800380630154, + "grad_norm": 2.271406412124634, + "learning_rate": 6.822564102564103e-06, + "loss": 0.0755, + "step": 6700 + }, + { + "epoch": 2.844153097906534, + "grad_norm": 3.323329210281372, + "learning_rate": 6.8097435897435906e-06, + "loss": 0.0762, + "step": 6725 + }, + { + "epoch": 2.854726157750053, + "grad_norm": 3.916624069213867, + "learning_rate": 6.796923076923077e-06, + "loss": 0.0985, + "step": 6750 + }, + { + "epoch": 2.8652992175935714, + "grad_norm": 4.36145544052124, + "learning_rate": 6.784102564102565e-06, + "loss": 0.0815, + "step": 6775 + }, + { + "epoch": 2.8758722774370904, + "grad_norm": 3.7215347290039062, + "learning_rate": 6.771282051282052e-06, + "loss": 0.0721, + "step": 6800 + }, + { + "epoch": 2.886445337280609, + "grad_norm": 3.406437397003174, + "learning_rate": 6.7584615384615396e-06, + "loss": 0.0794, + "step": 6825 + }, + { + "epoch": 2.8970183971241275, + "grad_norm": 4.781268119812012, + "learning_rate": 6.745641025641026e-06, + "loss": 0.0831, + "step": 6850 + }, + { + "epoch": 2.9075914569676464, + "grad_norm": 7.577853202819824, + "learning_rate": 6.732820512820514e-06, + "loss": 0.0837, + "step": 6875 + }, + { + "epoch": 2.9181645168111654, + "grad_norm": 4.966701984405518, + "learning_rate": 6.720000000000001e-06, + "loss": 0.0882, + "step": 6900 + }, + { + "epoch": 2.928737576654684, + "grad_norm": 4.185604572296143, + "learning_rate": 6.7071794871794886e-06, + "loss": 0.0894, + "step": 6925 + }, + { + "epoch": 2.9393106364982025, + "grad_norm": 6.426883697509766, + "learning_rate": 6.694358974358975e-06, + "loss": 0.0762, + "step": 6950 + }, + { + "epoch": 2.9498836963417214, + "grad_norm": 5.418060779571533, + "learning_rate": 6.681538461538463e-06, + "loss": 0.0837, + "step": 6975 + }, + { + "epoch": 2.96045675618524, + "grad_norm": 7.900200366973877, + "learning_rate": 6.668717948717949e-06, + "loss": 0.0883, + "step": 7000 + }, + { + "epoch": 2.96045675618524, + "eval_loss": 0.1452471762895584, + "eval_runtime": 481.3751, + "eval_samples_per_second": 7.564, + "eval_steps_per_second": 0.947, + "eval_wer": 0.12678082509629218, + "step": 7000 + }, + { + "epoch": 2.971029816028759, + "grad_norm": 7.546687126159668, + "learning_rate": 6.655897435897436e-06, + "loss": 0.083, + "step": 7025 + }, + { + "epoch": 2.9816028758722775, + "grad_norm": 4.659261703491211, + "learning_rate": 6.6430769230769235e-06, + "loss": 0.0789, + "step": 7050 + }, + { + "epoch": 2.992175935715796, + "grad_norm": 5.171137809753418, + "learning_rate": 6.63025641025641e-06, + "loss": 0.0843, + "step": 7075 + }, + { + "epoch": 3.002748995559315, + "grad_norm": 2.0631213188171387, + "learning_rate": 6.617435897435898e-06, + "loss": 0.0597, + "step": 7100 + }, + { + "epoch": 3.0133220554028335, + "grad_norm": 1.6943330764770508, + "learning_rate": 6.604615384615385e-06, + "loss": 0.0441, + "step": 7125 + }, + { + "epoch": 3.0238951152463525, + "grad_norm": 2.18405818939209, + "learning_rate": 6.5917948717948725e-06, + "loss": 0.0399, + "step": 7150 + }, + { + "epoch": 3.034468175089871, + "grad_norm": 4.707269668579102, + "learning_rate": 6.578974358974359e-06, + "loss": 0.0497, + "step": 7175 + }, + { + "epoch": 3.0450412349333895, + "grad_norm": 2.237076759338379, + "learning_rate": 6.566153846153846e-06, + "loss": 0.0381, + "step": 7200 + }, + { + "epoch": 3.0556142947769085, + "grad_norm": 2.0544979572296143, + "learning_rate": 6.553333333333334e-06, + "loss": 0.0458, + "step": 7225 + }, + { + "epoch": 3.066187354620427, + "grad_norm": 3.0701844692230225, + "learning_rate": 6.540512820512821e-06, + "loss": 0.0641, + "step": 7250 + }, + { + "epoch": 3.076760414463946, + "grad_norm": 2.446314573287964, + "learning_rate": 6.527692307692308e-06, + "loss": 0.0398, + "step": 7275 + }, + { + "epoch": 3.0873334743074645, + "grad_norm": 1.1977494955062866, + "learning_rate": 6.514871794871795e-06, + "loss": 0.0347, + "step": 7300 + }, + { + "epoch": 3.0979065341509835, + "grad_norm": 3.8696768283843994, + "learning_rate": 6.5025641025641026e-06, + "loss": 0.0454, + "step": 7325 + }, + { + "epoch": 3.108479593994502, + "grad_norm": 4.348515033721924, + "learning_rate": 6.48974358974359e-06, + "loss": 0.0358, + "step": 7350 + }, + { + "epoch": 3.1190526538380206, + "grad_norm": 3.3741064071655273, + "learning_rate": 6.476923076923077e-06, + "loss": 0.0477, + "step": 7375 + }, + { + "epoch": 3.1296257136815395, + "grad_norm": 1.3603962659835815, + "learning_rate": 6.464102564102565e-06, + "loss": 0.0424, + "step": 7400 + }, + { + "epoch": 3.140198773525058, + "grad_norm": 2.087542772293091, + "learning_rate": 6.4512820512820516e-06, + "loss": 0.034, + "step": 7425 + }, + { + "epoch": 3.150771833368577, + "grad_norm": 3.6920878887176514, + "learning_rate": 6.438461538461539e-06, + "loss": 0.0441, + "step": 7450 + }, + { + "epoch": 3.1613448932120956, + "grad_norm": 2.65889573097229, + "learning_rate": 6.425641025641026e-06, + "loss": 0.0399, + "step": 7475 + }, + { + "epoch": 3.1719179530556145, + "grad_norm": 6.821660995483398, + "learning_rate": 6.412820512820514e-06, + "loss": 0.0331, + "step": 7500 + }, + { + "epoch": 3.182491012899133, + "grad_norm": 3.360375165939331, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.0424, + "step": 7525 + }, + { + "epoch": 3.1930640727426516, + "grad_norm": 2.295417547225952, + "learning_rate": 6.387179487179488e-06, + "loss": 0.0406, + "step": 7550 + }, + { + "epoch": 3.2036371325861706, + "grad_norm": 2.809596300125122, + "learning_rate": 6.374358974358975e-06, + "loss": 0.0536, + "step": 7575 + }, + { + "epoch": 3.214210192429689, + "grad_norm": 2.86702823638916, + "learning_rate": 6.361538461538463e-06, + "loss": 0.0401, + "step": 7600 + }, + { + "epoch": 3.224783252273208, + "grad_norm": 3.1539106369018555, + "learning_rate": 6.3487179487179495e-06, + "loss": 0.038, + "step": 7625 + }, + { + "epoch": 3.2353563121167266, + "grad_norm": 5.205074787139893, + "learning_rate": 6.335897435897436e-06, + "loss": 0.0402, + "step": 7650 + }, + { + "epoch": 3.245929371960245, + "grad_norm": 4.798672199249268, + "learning_rate": 6.323076923076924e-06, + "loss": 0.0412, + "step": 7675 + }, + { + "epoch": 3.256502431803764, + "grad_norm": 2.3215603828430176, + "learning_rate": 6.310256410256411e-06, + "loss": 0.0389, + "step": 7700 + }, + { + "epoch": 3.2670754916472826, + "grad_norm": 2.9327590465545654, + "learning_rate": 6.2974358974358985e-06, + "loss": 0.0327, + "step": 7725 + }, + { + "epoch": 3.2776485514908016, + "grad_norm": 2.3523976802825928, + "learning_rate": 6.284615384615385e-06, + "loss": 0.0415, + "step": 7750 + }, + { + "epoch": 3.28822161133432, + "grad_norm": 2.7430717945098877, + "learning_rate": 6.271794871794872e-06, + "loss": 0.038, + "step": 7775 + }, + { + "epoch": 3.2987946711778386, + "grad_norm": 3.922346830368042, + "learning_rate": 6.258974358974359e-06, + "loss": 0.0505, + "step": 7800 + }, + { + "epoch": 3.3093677310213576, + "grad_norm": 3.7691900730133057, + "learning_rate": 6.246153846153846e-06, + "loss": 0.037, + "step": 7825 + }, + { + "epoch": 3.319940790864876, + "grad_norm": 5.420505523681641, + "learning_rate": 6.2333333333333335e-06, + "loss": 0.0403, + "step": 7850 + }, + { + "epoch": 3.330513850708395, + "grad_norm": 2.5405962467193604, + "learning_rate": 6.22051282051282e-06, + "loss": 0.0481, + "step": 7875 + }, + { + "epoch": 3.3410869105519136, + "grad_norm": 5.486624717712402, + "learning_rate": 6.207692307692308e-06, + "loss": 0.0499, + "step": 7900 + }, + { + "epoch": 3.3516599703954326, + "grad_norm": 7.166432857513428, + "learning_rate": 6.194871794871795e-06, + "loss": 0.0455, + "step": 7925 + }, + { + "epoch": 3.362233030238951, + "grad_norm": 2.30519437789917, + "learning_rate": 6.1820512820512825e-06, + "loss": 0.0526, + "step": 7950 + }, + { + "epoch": 3.3728060900824697, + "grad_norm": 1.9276854991912842, + "learning_rate": 6.169230769230769e-06, + "loss": 0.0496, + "step": 7975 + }, + { + "epoch": 3.3833791499259886, + "grad_norm": 2.095248222351074, + "learning_rate": 6.156410256410257e-06, + "loss": 0.0368, + "step": 8000 + }, + { + "epoch": 3.3833791499259886, + "eval_loss": 0.14458830654621124, + "eval_runtime": 482.4832, + "eval_samples_per_second": 7.546, + "eval_steps_per_second": 0.945, + "eval_wer": 0.13239593484616455, + "step": 8000 + }, + { + "epoch": 3.393952209769507, + "grad_norm": 1.7885074615478516, + "learning_rate": 6.143589743589744e-06, + "loss": 0.0361, + "step": 8025 + }, + { + "epoch": 3.404525269613026, + "grad_norm": 4.022132873535156, + "learning_rate": 6.1307692307692315e-06, + "loss": 0.0463, + "step": 8050 + }, + { + "epoch": 3.4150983294565447, + "grad_norm": 2.1497411727905273, + "learning_rate": 6.117948717948718e-06, + "loss": 0.0427, + "step": 8075 + }, + { + "epoch": 3.4256713893000637, + "grad_norm": 1.6746113300323486, + "learning_rate": 6.105128205128206e-06, + "loss": 0.0344, + "step": 8100 + }, + { + "epoch": 3.436244449143582, + "grad_norm": 3.6092941761016846, + "learning_rate": 6.092307692307693e-06, + "loss": 0.0469, + "step": 8125 + }, + { + "epoch": 3.4468175089871007, + "grad_norm": 4.466283798217773, + "learning_rate": 6.0794871794871805e-06, + "loss": 0.0423, + "step": 8150 + }, + { + "epoch": 3.4573905688306197, + "grad_norm": 2.550199508666992, + "learning_rate": 6.066666666666667e-06, + "loss": 0.0542, + "step": 8175 + }, + { + "epoch": 3.467963628674138, + "grad_norm": 2.781538963317871, + "learning_rate": 6.053846153846155e-06, + "loss": 0.0423, + "step": 8200 + }, + { + "epoch": 3.478536688517657, + "grad_norm": 1.9282548427581787, + "learning_rate": 6.041025641025642e-06, + "loss": 0.0386, + "step": 8225 + }, + { + "epoch": 3.4891097483611757, + "grad_norm": 2.867671012878418, + "learning_rate": 6.028205128205129e-06, + "loss": 0.0364, + "step": 8250 + }, + { + "epoch": 3.4996828082046942, + "grad_norm": 2.200422763824463, + "learning_rate": 6.015384615384616e-06, + "loss": 0.0396, + "step": 8275 + }, + { + "epoch": 3.510255868048213, + "grad_norm": 2.84183931350708, + "learning_rate": 6.002564102564103e-06, + "loss": 0.0432, + "step": 8300 + }, + { + "epoch": 3.5208289278917317, + "grad_norm": 3.7579634189605713, + "learning_rate": 5.989743589743591e-06, + "loss": 0.0503, + "step": 8325 + }, + { + "epoch": 3.5314019877352507, + "grad_norm": 5.463958740234375, + "learning_rate": 5.976923076923078e-06, + "loss": 0.0367, + "step": 8350 + }, + { + "epoch": 3.5419750475787692, + "grad_norm": 2.921443462371826, + "learning_rate": 5.9641025641025644e-06, + "loss": 0.0474, + "step": 8375 + }, + { + "epoch": 3.5525481074222878, + "grad_norm": 2.1998775005340576, + "learning_rate": 5.951282051282051e-06, + "loss": 0.0393, + "step": 8400 + }, + { + "epoch": 3.5631211672658067, + "grad_norm": 2.0558321475982666, + "learning_rate": 5.938461538461538e-06, + "loss": 0.0434, + "step": 8425 + }, + { + "epoch": 3.5736942271093257, + "grad_norm": 4.614722728729248, + "learning_rate": 5.925641025641026e-06, + "loss": 0.0419, + "step": 8450 + }, + { + "epoch": 3.5842672869528442, + "grad_norm": 3.2002508640289307, + "learning_rate": 5.912820512820513e-06, + "loss": 0.0579, + "step": 8475 + }, + { + "epoch": 3.5948403467963628, + "grad_norm": 2.0797934532165527, + "learning_rate": 5.9e-06, + "loss": 0.047, + "step": 8500 + }, + { + "epoch": 3.6054134066398817, + "grad_norm": 1.3923156261444092, + "learning_rate": 5.887179487179487e-06, + "loss": 0.035, + "step": 8525 + }, + { + "epoch": 3.6159864664834003, + "grad_norm": 2.350816011428833, + "learning_rate": 5.874358974358975e-06, + "loss": 0.0678, + "step": 8550 + }, + { + "epoch": 3.6265595263269192, + "grad_norm": 3.1680469512939453, + "learning_rate": 5.861538461538462e-06, + "loss": 0.055, + "step": 8575 + }, + { + "epoch": 3.6371325861704378, + "grad_norm": 2.174107551574707, + "learning_rate": 5.848717948717949e-06, + "loss": 0.0475, + "step": 8600 + }, + { + "epoch": 3.6477056460139563, + "grad_norm": 3.112668037414551, + "learning_rate": 5.835897435897436e-06, + "loss": 0.0447, + "step": 8625 + }, + { + "epoch": 3.6582787058574753, + "grad_norm": 4.258768558502197, + "learning_rate": 5.823076923076924e-06, + "loss": 0.0322, + "step": 8650 + }, + { + "epoch": 3.668851765700994, + "grad_norm": 3.305208683013916, + "learning_rate": 5.8102564102564106e-06, + "loss": 0.0454, + "step": 8675 + }, + { + "epoch": 3.679424825544513, + "grad_norm": 2.314253568649292, + "learning_rate": 5.797435897435898e-06, + "loss": 0.0376, + "step": 8700 + }, + { + "epoch": 3.6899978853880313, + "grad_norm": 2.856414318084717, + "learning_rate": 5.784615384615385e-06, + "loss": 0.0396, + "step": 8725 + }, + { + "epoch": 3.70057094523155, + "grad_norm": 1.2516982555389404, + "learning_rate": 5.771794871794873e-06, + "loss": 0.0323, + "step": 8750 + }, + { + "epoch": 3.711144005075069, + "grad_norm": 3.374429225921631, + "learning_rate": 5.7589743589743596e-06, + "loss": 0.0487, + "step": 8775 + }, + { + "epoch": 3.7217170649185873, + "grad_norm": 3.7010583877563477, + "learning_rate": 5.746153846153847e-06, + "loss": 0.0489, + "step": 8800 + }, + { + "epoch": 3.7322901247621063, + "grad_norm": 1.8617634773254395, + "learning_rate": 5.733333333333334e-06, + "loss": 0.0376, + "step": 8825 + }, + { + "epoch": 3.742863184605625, + "grad_norm": 1.5360052585601807, + "learning_rate": 5.720512820512821e-06, + "loss": 0.0392, + "step": 8850 + }, + { + "epoch": 3.7534362444491434, + "grad_norm": 3.569289207458496, + "learning_rate": 5.7076923076923086e-06, + "loss": 0.0506, + "step": 8875 + }, + { + "epoch": 3.7640093042926623, + "grad_norm": 4.679437160491943, + "learning_rate": 5.694871794871795e-06, + "loss": 0.0397, + "step": 8900 + }, + { + "epoch": 3.774582364136181, + "grad_norm": 1.4421815872192383, + "learning_rate": 5.682051282051283e-06, + "loss": 0.0507, + "step": 8925 + }, + { + "epoch": 3.7851554239797, + "grad_norm": 4.020344257354736, + "learning_rate": 5.66923076923077e-06, + "loss": 0.0426, + "step": 8950 + }, + { + "epoch": 3.7957284838232184, + "grad_norm": 1.8350247144699097, + "learning_rate": 5.6564102564102575e-06, + "loss": 0.0577, + "step": 8975 + }, + { + "epoch": 3.806301543666737, + "grad_norm": 4.344027519226074, + "learning_rate": 5.6435897435897435e-06, + "loss": 0.0463, + "step": 9000 + }, + { + "epoch": 3.806301543666737, + "eval_loss": 0.1400885134935379, + "eval_runtime": 478.4392, + "eval_samples_per_second": 7.61, + "eval_steps_per_second": 0.953, + "eval_wer": 0.12863705972434916, + "step": 9000 + }, + { + "epoch": 3.816874603510256, + "grad_norm": 1.8994994163513184, + "learning_rate": 5.63076923076923e-06, + "loss": 0.0396, + "step": 9025 + }, + { + "epoch": 3.827447663353775, + "grad_norm": 2.9287285804748535, + "learning_rate": 5.617948717948718e-06, + "loss": 0.0353, + "step": 9050 + }, + { + "epoch": 3.8380207231972934, + "grad_norm": 5.816341876983643, + "learning_rate": 5.605128205128205e-06, + "loss": 0.0416, + "step": 9075 + }, + { + "epoch": 3.848593783040812, + "grad_norm": 1.939314603805542, + "learning_rate": 5.5923076923076925e-06, + "loss": 0.0356, + "step": 9100 + }, + { + "epoch": 3.859166842884331, + "grad_norm": 2.6920735836029053, + "learning_rate": 5.579487179487179e-06, + "loss": 0.0361, + "step": 9125 + }, + { + "epoch": 3.8697399027278494, + "grad_norm": 3.10347056388855, + "learning_rate": 5.566666666666667e-06, + "loss": 0.0465, + "step": 9150 + }, + { + "epoch": 3.8803129625713684, + "grad_norm": 3.6406362056732178, + "learning_rate": 5.553846153846154e-06, + "loss": 0.0451, + "step": 9175 + }, + { + "epoch": 3.890886022414887, + "grad_norm": 1.8818650245666504, + "learning_rate": 5.5410256410256415e-06, + "loss": 0.0399, + "step": 9200 + }, + { + "epoch": 3.9014590822584054, + "grad_norm": 2.670872688293457, + "learning_rate": 5.528205128205128e-06, + "loss": 0.0394, + "step": 9225 + }, + { + "epoch": 3.9120321421019244, + "grad_norm": 6.323610782623291, + "learning_rate": 5.515384615384616e-06, + "loss": 0.0469, + "step": 9250 + }, + { + "epoch": 3.922605201945443, + "grad_norm": 3.8592729568481445, + "learning_rate": 5.502564102564103e-06, + "loss": 0.0432, + "step": 9275 + }, + { + "epoch": 3.933178261788962, + "grad_norm": 1.5597033500671387, + "learning_rate": 5.4897435897435905e-06, + "loss": 0.0326, + "step": 9300 + }, + { + "epoch": 3.9437513216324804, + "grad_norm": 1.413038969039917, + "learning_rate": 5.476923076923077e-06, + "loss": 0.0439, + "step": 9325 + }, + { + "epoch": 3.954324381475999, + "grad_norm": 1.5831608772277832, + "learning_rate": 5.464102564102565e-06, + "loss": 0.0406, + "step": 9350 + }, + { + "epoch": 3.964897441319518, + "grad_norm": 4.640722274780273, + "learning_rate": 5.451282051282052e-06, + "loss": 0.046, + "step": 9375 + }, + { + "epoch": 3.9754705011630365, + "grad_norm": 2.2848503589630127, + "learning_rate": 5.4384615384615395e-06, + "loss": 0.039, + "step": 9400 + }, + { + "epoch": 3.9860435610065554, + "grad_norm": 3.464656352996826, + "learning_rate": 5.425641025641026e-06, + "loss": 0.0482, + "step": 9425 + }, + { + "epoch": 3.996616620850074, + "grad_norm": 1.6633790731430054, + "learning_rate": 5.412820512820514e-06, + "loss": 0.0458, + "step": 9450 + }, + { + "epoch": 4.0071896806935925, + "grad_norm": 3.9742796421051025, + "learning_rate": 5.400000000000001e-06, + "loss": 0.0301, + "step": 9475 + }, + { + "epoch": 4.017762740537111, + "grad_norm": 0.8984088897705078, + "learning_rate": 5.387179487179488e-06, + "loss": 0.0171, + "step": 9500 + }, + { + "epoch": 4.02833580038063, + "grad_norm": 1.1975101232528687, + "learning_rate": 5.374358974358975e-06, + "loss": 0.0346, + "step": 9525 + }, + { + "epoch": 4.038908860224149, + "grad_norm": 1.3089258670806885, + "learning_rate": 5.361538461538462e-06, + "loss": 0.0261, + "step": 9550 + }, + { + "epoch": 4.0494819200676675, + "grad_norm": 1.7242599725723267, + "learning_rate": 5.34871794871795e-06, + "loss": 0.024, + "step": 9575 + }, + { + "epoch": 4.060054979911186, + "grad_norm": 3.1359024047851562, + "learning_rate": 5.335897435897436e-06, + "loss": 0.019, + "step": 9600 + }, + { + "epoch": 4.070628039754705, + "grad_norm": 1.996058702468872, + "learning_rate": 5.323076923076923e-06, + "loss": 0.0165, + "step": 9625 + }, + { + "epoch": 4.081201099598224, + "grad_norm": 1.165366530418396, + "learning_rate": 5.31025641025641e-06, + "loss": 0.0241, + "step": 9650 + }, + { + "epoch": 4.0917741594417425, + "grad_norm": 1.7959648370742798, + "learning_rate": 5.297435897435897e-06, + "loss": 0.0193, + "step": 9675 + }, + { + "epoch": 4.102347219285261, + "grad_norm": 3.0931942462921143, + "learning_rate": 5.284615384615385e-06, + "loss": 0.0188, + "step": 9700 + }, + { + "epoch": 4.1129202791287796, + "grad_norm": 1.2411391735076904, + "learning_rate": 5.271794871794872e-06, + "loss": 0.0315, + "step": 9725 + }, + { + "epoch": 4.123493338972299, + "grad_norm": 4.090996265411377, + "learning_rate": 5.258974358974359e-06, + "loss": 0.0216, + "step": 9750 + }, + { + "epoch": 4.1340663988158175, + "grad_norm": 1.1179065704345703, + "learning_rate": 5.246153846153846e-06, + "loss": 0.0234, + "step": 9775 + }, + { + "epoch": 4.144639458659336, + "grad_norm": 1.4874383211135864, + "learning_rate": 5.233333333333334e-06, + "loss": 0.0187, + "step": 9800 + }, + { + "epoch": 4.155212518502855, + "grad_norm": 0.8263124823570251, + "learning_rate": 5.220512820512821e-06, + "loss": 0.0253, + "step": 9825 + }, + { + "epoch": 4.165785578346373, + "grad_norm": 2.70497727394104, + "learning_rate": 5.207692307692308e-06, + "loss": 0.0228, + "step": 9850 + }, + { + "epoch": 4.1763586381898925, + "grad_norm": 4.427598476409912, + "learning_rate": 5.194871794871795e-06, + "loss": 0.0248, + "step": 9875 + }, + { + "epoch": 4.186931698033411, + "grad_norm": 1.1242289543151855, + "learning_rate": 5.182051282051283e-06, + "loss": 0.0205, + "step": 9900 + }, + { + "epoch": 4.19750475787693, + "grad_norm": 0.6889589428901672, + "learning_rate": 5.16923076923077e-06, + "loss": 0.0152, + "step": 9925 + }, + { + "epoch": 4.208077817720448, + "grad_norm": 1.5763376951217651, + "learning_rate": 5.156410256410257e-06, + "loss": 0.0164, + "step": 9950 + }, + { + "epoch": 4.218650877563967, + "grad_norm": 3.120882034301758, + "learning_rate": 5.143589743589744e-06, + "loss": 0.0249, + "step": 9975 + }, + { + "epoch": 4.229223937407486, + "grad_norm": 1.5591440200805664, + "learning_rate": 5.130769230769232e-06, + "loss": 0.0278, + "step": 10000 + }, + { + "epoch": 4.229223937407486, + "eval_loss": 0.14359265565872192, + "eval_runtime": 472.3801, + "eval_samples_per_second": 7.708, + "eval_steps_per_second": 0.965, + "eval_wer": 0.11810292821012576, + "step": 10000 + }, + { + "epoch": 4.239796997251005, + "grad_norm": 1.2260849475860596, + "learning_rate": 5.1179487179487186e-06, + "loss": 0.0169, + "step": 10025 + }, + { + "epoch": 4.250370057094523, + "grad_norm": 0.49284470081329346, + "learning_rate": 5.105128205128206e-06, + "loss": 0.018, + "step": 10050 + }, + { + "epoch": 4.260943116938042, + "grad_norm": 2.0094289779663086, + "learning_rate": 5.092307692307693e-06, + "loss": 0.0218, + "step": 10075 + }, + { + "epoch": 4.27151617678156, + "grad_norm": 1.3108640909194946, + "learning_rate": 5.07948717948718e-06, + "loss": 0.025, + "step": 10100 + }, + { + "epoch": 4.28208923662508, + "grad_norm": 1.7406195402145386, + "learning_rate": 5.0666666666666676e-06, + "loss": 0.0235, + "step": 10125 + }, + { + "epoch": 4.292662296468598, + "grad_norm": 0.876416027545929, + "learning_rate": 5.053846153846154e-06, + "loss": 0.0195, + "step": 10150 + }, + { + "epoch": 4.303235356312117, + "grad_norm": 1.6580686569213867, + "learning_rate": 5.041025641025642e-06, + "loss": 0.0167, + "step": 10175 + }, + { + "epoch": 4.313808416155635, + "grad_norm": 0.5891424417495728, + "learning_rate": 5.028205128205128e-06, + "loss": 0.0262, + "step": 10200 + }, + { + "epoch": 4.324381475999155, + "grad_norm": 2.2991206645965576, + "learning_rate": 5.015384615384616e-06, + "loss": 0.0253, + "step": 10225 + }, + { + "epoch": 4.334954535842673, + "grad_norm": 1.3528612852096558, + "learning_rate": 5.0025641025641025e-06, + "loss": 0.0269, + "step": 10250 + }, + { + "epoch": 4.345527595686192, + "grad_norm": 0.8127634525299072, + "learning_rate": 4.98974358974359e-06, + "loss": 0.0225, + "step": 10275 + }, + { + "epoch": 4.35610065552971, + "grad_norm": 0.9224340915679932, + "learning_rate": 4.976923076923078e-06, + "loss": 0.0219, + "step": 10300 + }, + { + "epoch": 4.366673715373229, + "grad_norm": 1.4799253940582275, + "learning_rate": 4.964102564102565e-06, + "loss": 0.0176, + "step": 10325 + }, + { + "epoch": 4.377246775216748, + "grad_norm": 3.0369462966918945, + "learning_rate": 4.9512820512820515e-06, + "loss": 0.0207, + "step": 10350 + }, + { + "epoch": 4.387819835060267, + "grad_norm": 0.7996551990509033, + "learning_rate": 4.938461538461538e-06, + "loss": 0.0236, + "step": 10375 + }, + { + "epoch": 4.398392894903785, + "grad_norm": 0.9955740571022034, + "learning_rate": 4.925641025641026e-06, + "loss": 0.0274, + "step": 10400 + }, + { + "epoch": 4.408965954747304, + "grad_norm": 1.9452871084213257, + "learning_rate": 4.912820512820513e-06, + "loss": 0.0164, + "step": 10425 + }, + { + "epoch": 4.419539014590822, + "grad_norm": 10.20065975189209, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0215, + "step": 10450 + }, + { + "epoch": 4.430112074434342, + "grad_norm": 0.7727457880973816, + "learning_rate": 4.887179487179487e-06, + "loss": 0.0154, + "step": 10475 + }, + { + "epoch": 4.44068513427786, + "grad_norm": 2.406764268875122, + "learning_rate": 4.874358974358975e-06, + "loss": 0.0197, + "step": 10500 + }, + { + "epoch": 4.451258194121379, + "grad_norm": 1.1302127838134766, + "learning_rate": 4.861538461538462e-06, + "loss": 0.0213, + "step": 10525 + }, + { + "epoch": 4.461831253964897, + "grad_norm": 1.5602991580963135, + "learning_rate": 4.8487179487179495e-06, + "loss": 0.0222, + "step": 10550 + }, + { + "epoch": 4.472404313808416, + "grad_norm": 2.2636196613311768, + "learning_rate": 4.835897435897436e-06, + "loss": 0.0208, + "step": 10575 + }, + { + "epoch": 4.482977373651935, + "grad_norm": 1.560081124305725, + "learning_rate": 4.823076923076924e-06, + "loss": 0.0212, + "step": 10600 + }, + { + "epoch": 4.493550433495454, + "grad_norm": 1.5913350582122803, + "learning_rate": 4.810256410256411e-06, + "loss": 0.0202, + "step": 10625 + }, + { + "epoch": 4.504123493338972, + "grad_norm": 2.0870747566223145, + "learning_rate": 4.7974358974358985e-06, + "loss": 0.0197, + "step": 10650 + }, + { + "epoch": 4.514696553182491, + "grad_norm": 0.832817554473877, + "learning_rate": 4.7846153846153845e-06, + "loss": 0.0229, + "step": 10675 + }, + { + "epoch": 4.52526961302601, + "grad_norm": 0.8193099498748779, + "learning_rate": 4.771794871794872e-06, + "loss": 0.0325, + "step": 10700 + }, + { + "epoch": 4.535842672869529, + "grad_norm": 2.3790667057037354, + "learning_rate": 4.758974358974359e-06, + "loss": 0.0226, + "step": 10725 + }, + { + "epoch": 4.546415732713047, + "grad_norm": 4.055891990661621, + "learning_rate": 4.746153846153847e-06, + "loss": 0.0339, + "step": 10750 + }, + { + "epoch": 4.556988792556566, + "grad_norm": 1.3302873373031616, + "learning_rate": 4.7333333333333335e-06, + "loss": 0.0205, + "step": 10775 + }, + { + "epoch": 4.567561852400084, + "grad_norm": 1.9055285453796387, + "learning_rate": 4.720512820512821e-06, + "loss": 0.0246, + "step": 10800 + }, + { + "epoch": 4.578134912243604, + "grad_norm": 0.8478215336799622, + "learning_rate": 4.707692307692308e-06, + "loss": 0.0283, + "step": 10825 + }, + { + "epoch": 4.588707972087122, + "grad_norm": 1.2983384132385254, + "learning_rate": 4.694871794871796e-06, + "loss": 0.0214, + "step": 10850 + }, + { + "epoch": 4.599281031930641, + "grad_norm": 0.70320063829422, + "learning_rate": 4.6820512820512825e-06, + "loss": 0.0344, + "step": 10875 + }, + { + "epoch": 4.609854091774159, + "grad_norm": 1.7480090856552124, + "learning_rate": 4.66923076923077e-06, + "loss": 0.0198, + "step": 10900 + }, + { + "epoch": 4.620427151617678, + "grad_norm": 1.8584214448928833, + "learning_rate": 4.656410256410257e-06, + "loss": 0.0198, + "step": 10925 + }, + { + "epoch": 4.631000211461197, + "grad_norm": 3.884225368499756, + "learning_rate": 4.643589743589745e-06, + "loss": 0.0197, + "step": 10950 + }, + { + "epoch": 4.641573271304716, + "grad_norm": 0.8960859775543213, + "learning_rate": 4.630769230769231e-06, + "loss": 0.0212, + "step": 10975 + }, + { + "epoch": 4.652146331148234, + "grad_norm": 0.8120248317718506, + "learning_rate": 4.617948717948718e-06, + "loss": 0.0157, + "step": 11000 + }, + { + "epoch": 4.652146331148234, + "eval_loss": 0.14061123132705688, + "eval_runtime": 474.4412, + "eval_samples_per_second": 7.674, + "eval_steps_per_second": 0.961, + "eval_wer": 0.11253422432595481, + "step": 11000 + }, + { + "epoch": 4.662719390991753, + "grad_norm": 0.5586819648742676, + "learning_rate": 4.605128205128205e-06, + "loss": 0.0201, + "step": 11025 + }, + { + "epoch": 4.673292450835271, + "grad_norm": 2.2997443675994873, + "learning_rate": 4.592307692307693e-06, + "loss": 0.0207, + "step": 11050 + }, + { + "epoch": 4.683865510678791, + "grad_norm": 3.0606119632720947, + "learning_rate": 4.57948717948718e-06, + "loss": 0.0261, + "step": 11075 + }, + { + "epoch": 4.694438570522309, + "grad_norm": 1.2574974298477173, + "learning_rate": 4.566666666666667e-06, + "loss": 0.0229, + "step": 11100 + }, + { + "epoch": 4.705011630365828, + "grad_norm": 0.962881326675415, + "learning_rate": 4.553846153846154e-06, + "loss": 0.0198, + "step": 11125 + }, + { + "epoch": 4.715584690209346, + "grad_norm": 1.8664745092391968, + "learning_rate": 4.5415384615384615e-06, + "loss": 0.0185, + "step": 11150 + }, + { + "epoch": 4.726157750052865, + "grad_norm": 2.5618274211883545, + "learning_rate": 4.528717948717949e-06, + "loss": 0.0306, + "step": 11175 + }, + { + "epoch": 4.736730809896384, + "grad_norm": 0.9622751474380493, + "learning_rate": 4.515897435897436e-06, + "loss": 0.037, + "step": 11200 + }, + { + "epoch": 4.747303869739903, + "grad_norm": 0.5940545201301575, + "learning_rate": 4.503076923076924e-06, + "loss": 0.0178, + "step": 11225 + }, + { + "epoch": 4.757876929583421, + "grad_norm": 2.6667637825012207, + "learning_rate": 4.4902564102564105e-06, + "loss": 0.015, + "step": 11250 + }, + { + "epoch": 4.76844998942694, + "grad_norm": 0.793354868888855, + "learning_rate": 4.477435897435898e-06, + "loss": 0.0299, + "step": 11275 + }, + { + "epoch": 4.779023049270458, + "grad_norm": 4.2273030281066895, + "learning_rate": 4.464615384615385e-06, + "loss": 0.0165, + "step": 11300 + }, + { + "epoch": 4.789596109113978, + "grad_norm": 2.6417057514190674, + "learning_rate": 4.451794871794872e-06, + "loss": 0.0232, + "step": 11325 + }, + { + "epoch": 4.800169168957496, + "grad_norm": 2.1996712684631348, + "learning_rate": 4.4389743589743595e-06, + "loss": 0.015, + "step": 11350 + }, + { + "epoch": 4.810742228801015, + "grad_norm": 5.317453384399414, + "learning_rate": 4.426153846153846e-06, + "loss": 0.0199, + "step": 11375 + }, + { + "epoch": 4.821315288644533, + "grad_norm": 0.8886387944221497, + "learning_rate": 4.413333333333334e-06, + "loss": 0.0219, + "step": 11400 + }, + { + "epoch": 4.831888348488053, + "grad_norm": 1.3750237226486206, + "learning_rate": 4.400512820512821e-06, + "loss": 0.0248, + "step": 11425 + }, + { + "epoch": 4.842461408331571, + "grad_norm": 1.9163587093353271, + "learning_rate": 4.387692307692308e-06, + "loss": 0.0299, + "step": 11450 + }, + { + "epoch": 4.85303446817509, + "grad_norm": 1.0380934476852417, + "learning_rate": 4.374871794871795e-06, + "loss": 0.016, + "step": 11475 + }, + { + "epoch": 4.863607528018608, + "grad_norm": 2.2600953578948975, + "learning_rate": 4.362051282051282e-06, + "loss": 0.0201, + "step": 11500 + }, + { + "epoch": 4.874180587862127, + "grad_norm": 2.316375255584717, + "learning_rate": 4.34923076923077e-06, + "loss": 0.0244, + "step": 11525 + }, + { + "epoch": 4.884753647705646, + "grad_norm": 2.1449790000915527, + "learning_rate": 4.336410256410257e-06, + "loss": 0.0167, + "step": 11550 + }, + { + "epoch": 4.895326707549165, + "grad_norm": 2.5895638465881348, + "learning_rate": 4.323589743589744e-06, + "loss": 0.0162, + "step": 11575 + }, + { + "epoch": 4.905899767392683, + "grad_norm": 2.5529627799987793, + "learning_rate": 4.310769230769231e-06, + "loss": 0.0227, + "step": 11600 + }, + { + "epoch": 4.916472827236202, + "grad_norm": 1.019690752029419, + "learning_rate": 4.297948717948718e-06, + "loss": 0.0307, + "step": 11625 + }, + { + "epoch": 4.927045887079721, + "grad_norm": 1.7469873428344727, + "learning_rate": 4.285128205128206e-06, + "loss": 0.0137, + "step": 11650 + }, + { + "epoch": 4.93761894692324, + "grad_norm": 1.8508704900741577, + "learning_rate": 4.2723076923076925e-06, + "loss": 0.0228, + "step": 11675 + }, + { + "epoch": 4.948192006766758, + "grad_norm": 0.8989993333816528, + "learning_rate": 4.25948717948718e-06, + "loss": 0.0165, + "step": 11700 + }, + { + "epoch": 4.958765066610277, + "grad_norm": 1.5571892261505127, + "learning_rate": 4.246666666666667e-06, + "loss": 0.0234, + "step": 11725 + }, + { + "epoch": 4.9693381264537955, + "grad_norm": 0.6282488703727722, + "learning_rate": 4.233846153846154e-06, + "loss": 0.0194, + "step": 11750 + }, + { + "epoch": 4.979911186297315, + "grad_norm": 0.898673415184021, + "learning_rate": 4.2210256410256414e-06, + "loss": 0.0177, + "step": 11775 + }, + { + "epoch": 4.990484246140833, + "grad_norm": 0.9171255826950073, + "learning_rate": 4.208205128205128e-06, + "loss": 0.0163, + "step": 11800 + }, + { + "epoch": 5.001057305984352, + "grad_norm": 0.9866194128990173, + "learning_rate": 4.195384615384616e-06, + "loss": 0.0175, + "step": 11825 + }, + { + "epoch": 5.0116303658278705, + "grad_norm": 0.3489556312561035, + "learning_rate": 4.182564102564103e-06, + "loss": 0.0107, + "step": 11850 + }, + { + "epoch": 5.022203425671389, + "grad_norm": 3.7504043579101562, + "learning_rate": 4.1697435897435904e-06, + "loss": 0.0104, + "step": 11875 + }, + { + "epoch": 5.032776485514908, + "grad_norm": 1.138054609298706, + "learning_rate": 4.156923076923077e-06, + "loss": 0.0078, + "step": 11900 + }, + { + "epoch": 5.043349545358427, + "grad_norm": 2.7337746620178223, + "learning_rate": 4.144102564102564e-06, + "loss": 0.0079, + "step": 11925 + }, + { + "epoch": 5.0539226052019455, + "grad_norm": 0.4507981836795807, + "learning_rate": 4.131282051282052e-06, + "loss": 0.0127, + "step": 11950 + }, + { + "epoch": 5.064495665045464, + "grad_norm": 0.9632282853126526, + "learning_rate": 4.118461538461539e-06, + "loss": 0.0107, + "step": 11975 + }, + { + "epoch": 5.0750687248889825, + "grad_norm": 0.3650486171245575, + "learning_rate": 4.105641025641026e-06, + "loss": 0.0201, + "step": 12000 + }, + { + "epoch": 5.0750687248889825, + "eval_loss": 0.1392413079738617, + "eval_runtime": 476.9371, + "eval_samples_per_second": 7.634, + "eval_steps_per_second": 0.956, + "eval_wer": 0.11443686481971321, + "step": 12000 + }, + { + "epoch": 5.085641784732502, + "grad_norm": 0.6850853562355042, + "learning_rate": 4.092820512820513e-06, + "loss": 0.0168, + "step": 12025 + }, + { + "epoch": 5.0962148445760205, + "grad_norm": 0.9726247787475586, + "learning_rate": 4.08e-06, + "loss": 0.0156, + "step": 12050 + }, + { + "epoch": 5.106787904419539, + "grad_norm": 0.6085502505302429, + "learning_rate": 4.0671794871794876e-06, + "loss": 0.0092, + "step": 12075 + }, + { + "epoch": 5.1173609642630575, + "grad_norm": 0.3284889757633209, + "learning_rate": 4.054358974358974e-06, + "loss": 0.0128, + "step": 12100 + }, + { + "epoch": 5.127934024106576, + "grad_norm": 0.4705301523208618, + "learning_rate": 4.041538461538462e-06, + "loss": 0.0104, + "step": 12125 + }, + { + "epoch": 5.1385070839500955, + "grad_norm": 1.9244158267974854, + "learning_rate": 4.028717948717949e-06, + "loss": 0.0081, + "step": 12150 + }, + { + "epoch": 5.149080143793614, + "grad_norm": 2.1168432235717773, + "learning_rate": 4.0158974358974366e-06, + "loss": 0.0087, + "step": 12175 + }, + { + "epoch": 5.1596532036371325, + "grad_norm": 1.652687430381775, + "learning_rate": 4.003076923076923e-06, + "loss": 0.0113, + "step": 12200 + }, + { + "epoch": 5.170226263480651, + "grad_norm": 2.1567976474761963, + "learning_rate": 3.990256410256411e-06, + "loss": 0.0088, + "step": 12225 + }, + { + "epoch": 5.18079932332417, + "grad_norm": 0.7141952514648438, + "learning_rate": 3.977435897435898e-06, + "loss": 0.0108, + "step": 12250 + }, + { + "epoch": 5.191372383167689, + "grad_norm": 1.3821399211883545, + "learning_rate": 3.964615384615385e-06, + "loss": 0.0111, + "step": 12275 + }, + { + "epoch": 5.2019454430112075, + "grad_norm": 0.7689725160598755, + "learning_rate": 3.951794871794872e-06, + "loss": 0.0084, + "step": 12300 + }, + { + "epoch": 5.212518502854726, + "grad_norm": 0.7550517320632935, + "learning_rate": 3.938974358974359e-06, + "loss": 0.0143, + "step": 12325 + }, + { + "epoch": 5.223091562698245, + "grad_norm": 0.4243580102920532, + "learning_rate": 3.926153846153846e-06, + "loss": 0.008, + "step": 12350 + }, + { + "epoch": 5.233664622541764, + "grad_norm": 0.6995705366134644, + "learning_rate": 3.913333333333334e-06, + "loss": 0.0078, + "step": 12375 + }, + { + "epoch": 5.2442376823852825, + "grad_norm": 0.3228248357772827, + "learning_rate": 3.9005128205128205e-06, + "loss": 0.0089, + "step": 12400 + }, + { + "epoch": 5.254810742228801, + "grad_norm": 0.5981309413909912, + "learning_rate": 3.887692307692308e-06, + "loss": 0.0116, + "step": 12425 + }, + { + "epoch": 5.26538380207232, + "grad_norm": 1.0151786804199219, + "learning_rate": 3.874871794871795e-06, + "loss": 0.0109, + "step": 12450 + }, + { + "epoch": 5.275956861915838, + "grad_norm": 1.7748947143554688, + "learning_rate": 3.862051282051283e-06, + "loss": 0.0165, + "step": 12475 + }, + { + "epoch": 5.2865299217593575, + "grad_norm": 3.3617355823516846, + "learning_rate": 3.8492307692307695e-06, + "loss": 0.0113, + "step": 12500 + }, + { + "epoch": 5.297102981602876, + "grad_norm": 0.9399589896202087, + "learning_rate": 3.836410256410257e-06, + "loss": 0.0145, + "step": 12525 + }, + { + "epoch": 5.307676041446395, + "grad_norm": 0.4181835651397705, + "learning_rate": 3.823589743589744e-06, + "loss": 0.0132, + "step": 12550 + }, + { + "epoch": 5.318249101289913, + "grad_norm": 2.0555076599121094, + "learning_rate": 3.8107692307692313e-06, + "loss": 0.0081, + "step": 12575 + }, + { + "epoch": 5.328822161133432, + "grad_norm": 1.8507764339447021, + "learning_rate": 3.7979487179487185e-06, + "loss": 0.0108, + "step": 12600 + }, + { + "epoch": 5.339395220976951, + "grad_norm": 3.1068594455718994, + "learning_rate": 3.7851282051282058e-06, + "loss": 0.0146, + "step": 12625 + }, + { + "epoch": 5.34996828082047, + "grad_norm": 1.1594772338867188, + "learning_rate": 3.772307692307693e-06, + "loss": 0.0138, + "step": 12650 + }, + { + "epoch": 5.360541340663988, + "grad_norm": 0.45777344703674316, + "learning_rate": 3.7594871794871794e-06, + "loss": 0.0175, + "step": 12675 + }, + { + "epoch": 5.371114400507507, + "grad_norm": 0.6128593683242798, + "learning_rate": 3.7466666666666667e-06, + "loss": 0.0084, + "step": 12700 + }, + { + "epoch": 5.381687460351025, + "grad_norm": 0.644332230091095, + "learning_rate": 3.733846153846154e-06, + "loss": 0.0111, + "step": 12725 + }, + { + "epoch": 5.392260520194545, + "grad_norm": 0.603568971157074, + "learning_rate": 3.721025641025641e-06, + "loss": 0.0111, + "step": 12750 + }, + { + "epoch": 5.402833580038063, + "grad_norm": 0.3822285532951355, + "learning_rate": 3.7082051282051284e-06, + "loss": 0.0095, + "step": 12775 + }, + { + "epoch": 5.413406639881582, + "grad_norm": 0.493023157119751, + "learning_rate": 3.6953846153846156e-06, + "loss": 0.0157, + "step": 12800 + }, + { + "epoch": 5.4239796997251, + "grad_norm": 1.2487260103225708, + "learning_rate": 3.682564102564103e-06, + "loss": 0.0097, + "step": 12825 + }, + { + "epoch": 5.43455275956862, + "grad_norm": 0.8562780618667603, + "learning_rate": 3.66974358974359e-06, + "loss": 0.0139, + "step": 12850 + }, + { + "epoch": 5.445125819412138, + "grad_norm": 0.750851571559906, + "learning_rate": 3.6569230769230774e-06, + "loss": 0.0123, + "step": 12875 + }, + { + "epoch": 5.455698879255657, + "grad_norm": 0.41030699014663696, + "learning_rate": 3.6441025641025646e-06, + "loss": 0.0116, + "step": 12900 + }, + { + "epoch": 5.466271939099175, + "grad_norm": 0.5965930223464966, + "learning_rate": 3.631282051282052e-06, + "loss": 0.0107, + "step": 12925 + }, + { + "epoch": 5.476844998942694, + "grad_norm": 0.4181739389896393, + "learning_rate": 3.618461538461539e-06, + "loss": 0.0077, + "step": 12950 + }, + { + "epoch": 5.487418058786213, + "grad_norm": 3.0473992824554443, + "learning_rate": 3.6056410256410255e-06, + "loss": 0.0144, + "step": 12975 + }, + { + "epoch": 5.497991118629732, + "grad_norm": 6.770436763763428, + "learning_rate": 3.5928205128205128e-06, + "loss": 0.0121, + "step": 13000 + }, + { + "epoch": 5.497991118629732, + "eval_loss": 0.14054465293884277, + "eval_runtime": 479.854, + "eval_samples_per_second": 7.588, + "eval_steps_per_second": 0.95, + "eval_wer": 0.11290547125156619, + "step": 13000 + }, + { + "epoch": 5.50856417847325, + "grad_norm": 1.682664155960083, + "learning_rate": 3.58e-06, + "loss": 0.0098, + "step": 13025 + }, + { + "epoch": 5.519137238316769, + "grad_norm": 1.461300253868103, + "learning_rate": 3.5671794871794873e-06, + "loss": 0.0099, + "step": 13050 + }, + { + "epoch": 5.529710298160287, + "grad_norm": 0.3437669575214386, + "learning_rate": 3.5543589743589745e-06, + "loss": 0.014, + "step": 13075 + }, + { + "epoch": 5.540283358003807, + "grad_norm": 1.4875051975250244, + "learning_rate": 3.5415384615384618e-06, + "loss": 0.013, + "step": 13100 + }, + { + "epoch": 5.550856417847325, + "grad_norm": 3.039585590362549, + "learning_rate": 3.528717948717949e-06, + "loss": 0.0131, + "step": 13125 + }, + { + "epoch": 5.561429477690844, + "grad_norm": 0.7710385918617249, + "learning_rate": 3.5158974358974363e-06, + "loss": 0.011, + "step": 13150 + }, + { + "epoch": 5.572002537534362, + "grad_norm": 4.138525485992432, + "learning_rate": 3.5030769230769235e-06, + "loss": 0.0098, + "step": 13175 + }, + { + "epoch": 5.582575597377881, + "grad_norm": 4.096315860748291, + "learning_rate": 3.4902564102564108e-06, + "loss": 0.0143, + "step": 13200 + }, + { + "epoch": 5.5931486572214, + "grad_norm": 1.198435664176941, + "learning_rate": 3.477435897435898e-06, + "loss": 0.0127, + "step": 13225 + }, + { + "epoch": 5.603721717064919, + "grad_norm": 0.3887302875518799, + "learning_rate": 3.4646153846153853e-06, + "loss": 0.0138, + "step": 13250 + }, + { + "epoch": 5.614294776908437, + "grad_norm": 1.2115799188613892, + "learning_rate": 3.4517948717948717e-06, + "loss": 0.0172, + "step": 13275 + }, + { + "epoch": 5.624867836751956, + "grad_norm": 0.7135342955589294, + "learning_rate": 3.438974358974359e-06, + "loss": 0.0148, + "step": 13300 + }, + { + "epoch": 5.635440896595474, + "grad_norm": 2.1200053691864014, + "learning_rate": 3.426153846153846e-06, + "loss": 0.0084, + "step": 13325 + }, + { + "epoch": 5.646013956438994, + "grad_norm": 1.8395202159881592, + "learning_rate": 3.4133333333333334e-06, + "loss": 0.01, + "step": 13350 + }, + { + "epoch": 5.656587016282512, + "grad_norm": 4.734637260437012, + "learning_rate": 3.4005128205128207e-06, + "loss": 0.0141, + "step": 13375 + }, + { + "epoch": 5.667160076126031, + "grad_norm": 2.1322085857391357, + "learning_rate": 3.387692307692308e-06, + "loss": 0.0154, + "step": 13400 + }, + { + "epoch": 5.677733135969549, + "grad_norm": 0.8213221430778503, + "learning_rate": 3.374871794871795e-06, + "loss": 0.0065, + "step": 13425 + }, + { + "epoch": 5.688306195813068, + "grad_norm": 0.4765898585319519, + "learning_rate": 3.3620512820512824e-06, + "loss": 0.008, + "step": 13450 + }, + { + "epoch": 5.698879255656587, + "grad_norm": 0.424668550491333, + "learning_rate": 3.3492307692307696e-06, + "loss": 0.0104, + "step": 13475 + }, + { + "epoch": 5.709452315500106, + "grad_norm": 0.7294663786888123, + "learning_rate": 3.336410256410257e-06, + "loss": 0.0114, + "step": 13500 + }, + { + "epoch": 5.720025375343624, + "grad_norm": 0.4553682208061218, + "learning_rate": 3.323589743589744e-06, + "loss": 0.0133, + "step": 13525 + }, + { + "epoch": 5.730598435187143, + "grad_norm": 4.255309104919434, + "learning_rate": 3.3107692307692314e-06, + "loss": 0.0269, + "step": 13550 + }, + { + "epoch": 5.741171495030661, + "grad_norm": 0.8475791811943054, + "learning_rate": 3.297948717948718e-06, + "loss": 0.0086, + "step": 13575 + }, + { + "epoch": 5.751744554874181, + "grad_norm": 4.737633228302002, + "learning_rate": 3.285128205128205e-06, + "loss": 0.0193, + "step": 13600 + }, + { + "epoch": 5.762317614717699, + "grad_norm": 1.0523945093154907, + "learning_rate": 3.2723076923076923e-06, + "loss": 0.0124, + "step": 13625 + }, + { + "epoch": 5.772890674561218, + "grad_norm": 0.8394791483879089, + "learning_rate": 3.2594871794871795e-06, + "loss": 0.0109, + "step": 13650 + }, + { + "epoch": 5.783463734404736, + "grad_norm": 2.474153518676758, + "learning_rate": 3.2466666666666668e-06, + "loss": 0.0123, + "step": 13675 + }, + { + "epoch": 5.794036794248256, + "grad_norm": 0.8185378909111023, + "learning_rate": 3.233846153846154e-06, + "loss": 0.0084, + "step": 13700 + }, + { + "epoch": 5.804609854091774, + "grad_norm": 4.937212944030762, + "learning_rate": 3.2210256410256413e-06, + "loss": 0.0134, + "step": 13725 + }, + { + "epoch": 5.815182913935293, + "grad_norm": 0.6099960207939148, + "learning_rate": 3.2082051282051285e-06, + "loss": 0.0091, + "step": 13750 + }, + { + "epoch": 5.825755973778811, + "grad_norm": 0.43338268995285034, + "learning_rate": 3.1953846153846158e-06, + "loss": 0.0118, + "step": 13775 + }, + { + "epoch": 5.836329033622331, + "grad_norm": 0.47640514373779297, + "learning_rate": 3.182564102564103e-06, + "loss": 0.0072, + "step": 13800 + }, + { + "epoch": 5.846902093465849, + "grad_norm": 2.1803908348083496, + "learning_rate": 3.1697435897435903e-06, + "loss": 0.0119, + "step": 13825 + }, + { + "epoch": 5.857475153309368, + "grad_norm": 1.2694458961486816, + "learning_rate": 3.1569230769230775e-06, + "loss": 0.0102, + "step": 13850 + }, + { + "epoch": 5.868048213152886, + "grad_norm": 0.539225697517395, + "learning_rate": 3.144102564102564e-06, + "loss": 0.0107, + "step": 13875 + }, + { + "epoch": 5.878621272996405, + "grad_norm": 2.2293202877044678, + "learning_rate": 3.131282051282051e-06, + "loss": 0.0093, + "step": 13900 + }, + { + "epoch": 5.889194332839924, + "grad_norm": 0.3595990538597107, + "learning_rate": 3.1184615384615384e-06, + "loss": 0.0075, + "step": 13925 + }, + { + "epoch": 5.899767392683443, + "grad_norm": 1.155290961265564, + "learning_rate": 3.1056410256410257e-06, + "loss": 0.011, + "step": 13950 + }, + { + "epoch": 5.910340452526961, + "grad_norm": 6.233605861663818, + "learning_rate": 3.092820512820513e-06, + "loss": 0.0071, + "step": 13975 + }, + { + "epoch": 5.92091351237048, + "grad_norm": 0.6025995016098022, + "learning_rate": 3.08e-06, + "loss": 0.0074, + "step": 14000 + }, + { + "epoch": 5.92091351237048, + "eval_loss": 0.13853086531162262, + "eval_runtime": 478.5411, + "eval_samples_per_second": 7.609, + "eval_steps_per_second": 0.953, + "eval_wer": 0.11954151004686993, + "step": 14000 + }, + { + "epoch": 5.931486572213998, + "grad_norm": 0.934971034526825, + "learning_rate": 3.0671794871794874e-06, + "loss": 0.0149, + "step": 14025 + }, + { + "epoch": 5.942059632057518, + "grad_norm": 2.112450122833252, + "learning_rate": 3.0543589743589747e-06, + "loss": 0.0254, + "step": 14050 + }, + { + "epoch": 5.952632691901036, + "grad_norm": 1.504460334777832, + "learning_rate": 3.041538461538462e-06, + "loss": 0.0079, + "step": 14075 + }, + { + "epoch": 5.963205751744555, + "grad_norm": 3.3018078804016113, + "learning_rate": 3.028717948717949e-06, + "loss": 0.0084, + "step": 14100 + }, + { + "epoch": 5.9737788115880734, + "grad_norm": 0.2936100363731384, + "learning_rate": 3.0158974358974364e-06, + "loss": 0.0077, + "step": 14125 + }, + { + "epoch": 5.984351871431592, + "grad_norm": 2.9025211334228516, + "learning_rate": 3.0030769230769236e-06, + "loss": 0.0107, + "step": 14150 + }, + { + "epoch": 5.994924931275111, + "grad_norm": 0.5673441886901855, + "learning_rate": 2.99025641025641e-06, + "loss": 0.0101, + "step": 14175 + }, + { + "epoch": 6.00549799111863, + "grad_norm": 0.29159656167030334, + "learning_rate": 2.9774358974358973e-06, + "loss": 0.009, + "step": 14200 + }, + { + "epoch": 6.0160710509621484, + "grad_norm": 0.2572082579135895, + "learning_rate": 2.9646153846153845e-06, + "loss": 0.007, + "step": 14225 + }, + { + "epoch": 6.026644110805667, + "grad_norm": 0.2045765519142151, + "learning_rate": 2.951794871794872e-06, + "loss": 0.0058, + "step": 14250 + }, + { + "epoch": 6.0372171706491855, + "grad_norm": 0.34850168228149414, + "learning_rate": 2.938974358974359e-06, + "loss": 0.0067, + "step": 14275 + }, + { + "epoch": 6.047790230492705, + "grad_norm": 5.987231254577637, + "learning_rate": 2.9261538461538463e-06, + "loss": 0.0052, + "step": 14300 + }, + { + "epoch": 6.0583632903362235, + "grad_norm": 0.34981194138526917, + "learning_rate": 2.9133333333333335e-06, + "loss": 0.0059, + "step": 14325 + }, + { + "epoch": 6.068936350179742, + "grad_norm": 0.2722649574279785, + "learning_rate": 2.9005128205128208e-06, + "loss": 0.0105, + "step": 14350 + }, + { + "epoch": 6.0795094100232605, + "grad_norm": 0.33031409978866577, + "learning_rate": 2.887692307692308e-06, + "loss": 0.0144, + "step": 14375 + }, + { + "epoch": 6.090082469866779, + "grad_norm": 0.26611846685409546, + "learning_rate": 2.8748717948717953e-06, + "loss": 0.0053, + "step": 14400 + }, + { + "epoch": 6.1006555297102985, + "grad_norm": 0.29690757393836975, + "learning_rate": 2.8620512820512825e-06, + "loss": 0.0042, + "step": 14425 + }, + { + "epoch": 6.111228589553817, + "grad_norm": 0.2454768419265747, + "learning_rate": 2.8492307692307698e-06, + "loss": 0.0065, + "step": 14450 + }, + { + "epoch": 6.1218016493973355, + "grad_norm": 0.3441830575466156, + "learning_rate": 2.836410256410257e-06, + "loss": 0.0088, + "step": 14475 + }, + { + "epoch": 6.132374709240854, + "grad_norm": 0.4179341793060303, + "learning_rate": 2.8235897435897434e-06, + "loss": 0.0137, + "step": 14500 + }, + { + "epoch": 6.142947769084373, + "grad_norm": 0.3850744664669037, + "learning_rate": 2.8107692307692307e-06, + "loss": 0.0033, + "step": 14525 + }, + { + "epoch": 6.153520828927892, + "grad_norm": 0.2846491038799286, + "learning_rate": 2.797948717948718e-06, + "loss": 0.0061, + "step": 14550 + }, + { + "epoch": 6.1640938887714105, + "grad_norm": 0.3616473376750946, + "learning_rate": 2.785128205128205e-06, + "loss": 0.0052, + "step": 14575 + }, + { + "epoch": 6.174666948614929, + "grad_norm": 0.38456159830093384, + "learning_rate": 2.7723076923076924e-06, + "loss": 0.0046, + "step": 14600 + }, + { + "epoch": 6.185240008458448, + "grad_norm": 0.267269492149353, + "learning_rate": 2.7594871794871797e-06, + "loss": 0.0069, + "step": 14625 + }, + { + "epoch": 6.195813068301967, + "grad_norm": 0.9138497114181519, + "learning_rate": 2.746666666666667e-06, + "loss": 0.0068, + "step": 14650 + }, + { + "epoch": 6.2063861281454855, + "grad_norm": 0.2973681390285492, + "learning_rate": 2.733846153846154e-06, + "loss": 0.0113, + "step": 14675 + }, + { + "epoch": 6.216959187989004, + "grad_norm": 0.3064761757850647, + "learning_rate": 2.7210256410256414e-06, + "loss": 0.0076, + "step": 14700 + }, + { + "epoch": 6.227532247832523, + "grad_norm": 0.2878686785697937, + "learning_rate": 2.7082051282051287e-06, + "loss": 0.0044, + "step": 14725 + }, + { + "epoch": 6.238105307676041, + "grad_norm": 0.2550056576728821, + "learning_rate": 2.695384615384616e-06, + "loss": 0.0064, + "step": 14750 + }, + { + "epoch": 6.2486783675195605, + "grad_norm": 0.6837669610977173, + "learning_rate": 2.682564102564103e-06, + "loss": 0.0046, + "step": 14775 + }, + { + "epoch": 6.259251427363079, + "grad_norm": 0.244142547249794, + "learning_rate": 2.6697435897435896e-06, + "loss": 0.0033, + "step": 14800 + }, + { + "epoch": 6.269824487206598, + "grad_norm": 0.15427115559577942, + "learning_rate": 2.656923076923077e-06, + "loss": 0.0055, + "step": 14825 + }, + { + "epoch": 6.280397547050116, + "grad_norm": 0.7226958274841309, + "learning_rate": 2.644102564102564e-06, + "loss": 0.0053, + "step": 14850 + }, + { + "epoch": 6.290970606893635, + "grad_norm": 0.2157323956489563, + "learning_rate": 2.6312820512820513e-06, + "loss": 0.007, + "step": 14875 + }, + { + "epoch": 6.301543666737154, + "grad_norm": 2.2182655334472656, + "learning_rate": 2.6184615384615385e-06, + "loss": 0.0079, + "step": 14900 + }, + { + "epoch": 6.312116726580673, + "grad_norm": 0.17396897077560425, + "learning_rate": 2.605641025641026e-06, + "loss": 0.0059, + "step": 14925 + }, + { + "epoch": 6.322689786424191, + "grad_norm": 0.8146637082099915, + "learning_rate": 2.592820512820513e-06, + "loss": 0.0048, + "step": 14950 + }, + { + "epoch": 6.33326284626771, + "grad_norm": 0.2695443332195282, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.0049, + "step": 14975 + }, + { + "epoch": 6.343835906111229, + "grad_norm": 0.7036349177360535, + "learning_rate": 2.5671794871794875e-06, + "loss": 0.0064, + "step": 15000 + }, + { + "epoch": 6.343835906111229, + "eval_loss": 0.1409808248281479, + "eval_runtime": 474.6061, + "eval_samples_per_second": 7.672, + "eval_steps_per_second": 0.961, + "eval_wer": 0.11146688941482204, + "step": 15000 + }, + { + "epoch": 6.354408965954748, + "grad_norm": 0.4002957344055176, + "learning_rate": 2.5543589743589748e-06, + "loss": 0.0055, + "step": 15025 + }, + { + "epoch": 6.364982025798266, + "grad_norm": 2.1062963008880615, + "learning_rate": 2.541538461538462e-06, + "loss": 0.0038, + "step": 15050 + }, + { + "epoch": 6.375555085641785, + "grad_norm": 0.4128209948539734, + "learning_rate": 2.5287179487179493e-06, + "loss": 0.0062, + "step": 15075 + }, + { + "epoch": 6.386128145485303, + "grad_norm": 0.24224917590618134, + "learning_rate": 2.5158974358974357e-06, + "loss": 0.0076, + "step": 15100 + }, + { + "epoch": 6.396701205328823, + "grad_norm": 0.31282123923301697, + "learning_rate": 2.503076923076923e-06, + "loss": 0.0066, + "step": 15125 + }, + { + "epoch": 6.407274265172341, + "grad_norm": 0.28958702087402344, + "learning_rate": 2.4902564102564106e-06, + "loss": 0.0054, + "step": 15150 + }, + { + "epoch": 6.41784732501586, + "grad_norm": 0.23962663114070892, + "learning_rate": 2.4774358974358974e-06, + "loss": 0.0043, + "step": 15175 + }, + { + "epoch": 6.428420384859378, + "grad_norm": 0.22258694469928741, + "learning_rate": 2.4646153846153847e-06, + "loss": 0.0098, + "step": 15200 + }, + { + "epoch": 6.438993444702897, + "grad_norm": 0.219313845038414, + "learning_rate": 2.451794871794872e-06, + "loss": 0.0064, + "step": 15225 + }, + { + "epoch": 6.449566504546416, + "grad_norm": 0.24351318180561066, + "learning_rate": 2.438974358974359e-06, + "loss": 0.0053, + "step": 15250 + }, + { + "epoch": 6.460139564389935, + "grad_norm": 1.7564576864242554, + "learning_rate": 2.4261538461538464e-06, + "loss": 0.0106, + "step": 15275 + }, + { + "epoch": 6.470712624233453, + "grad_norm": 1.4861177206039429, + "learning_rate": 2.4133333333333337e-06, + "loss": 0.0063, + "step": 15300 + }, + { + "epoch": 6.481285684076972, + "grad_norm": 0.41736364364624023, + "learning_rate": 2.4005128205128205e-06, + "loss": 0.0052, + "step": 15325 + }, + { + "epoch": 6.49185874392049, + "grad_norm": 0.21913078427314758, + "learning_rate": 2.3876923076923077e-06, + "loss": 0.0063, + "step": 15350 + }, + { + "epoch": 6.50243180376401, + "grad_norm": 0.18712382018566132, + "learning_rate": 2.374871794871795e-06, + "loss": 0.01, + "step": 15375 + }, + { + "epoch": 6.513004863607528, + "grad_norm": 0.7857060432434082, + "learning_rate": 2.3620512820512822e-06, + "loss": 0.0081, + "step": 15400 + }, + { + "epoch": 6.523577923451047, + "grad_norm": 0.23228535056114197, + "learning_rate": 2.3492307692307695e-06, + "loss": 0.0055, + "step": 15425 + }, + { + "epoch": 6.534150983294565, + "grad_norm": 0.2794516682624817, + "learning_rate": 2.3364102564102567e-06, + "loss": 0.0118, + "step": 15450 + }, + { + "epoch": 6.544724043138084, + "grad_norm": 0.3594117760658264, + "learning_rate": 2.3235897435897436e-06, + "loss": 0.0049, + "step": 15475 + }, + { + "epoch": 6.555297102981603, + "grad_norm": 0.38121312856674194, + "learning_rate": 2.310769230769231e-06, + "loss": 0.0098, + "step": 15500 + }, + { + "epoch": 6.565870162825122, + "grad_norm": 0.48879972100257874, + "learning_rate": 2.2984615384615386e-06, + "loss": 0.0055, + "step": 15525 + }, + { + "epoch": 6.57644322266864, + "grad_norm": 0.3019927442073822, + "learning_rate": 2.285641025641026e-06, + "loss": 0.0066, + "step": 15550 + }, + { + "epoch": 6.587016282512159, + "grad_norm": 2.143726348876953, + "learning_rate": 2.272820512820513e-06, + "loss": 0.0078, + "step": 15575 + }, + { + "epoch": 6.597589342355677, + "grad_norm": 0.1708030104637146, + "learning_rate": 2.2600000000000004e-06, + "loss": 0.004, + "step": 15600 + }, + { + "epoch": 6.608162402199197, + "grad_norm": 1.7582453489303589, + "learning_rate": 2.2471794871794876e-06, + "loss": 0.0092, + "step": 15625 + }, + { + "epoch": 6.618735462042715, + "grad_norm": 0.3129083514213562, + "learning_rate": 2.2343589743589745e-06, + "loss": 0.0073, + "step": 15650 + }, + { + "epoch": 6.629308521886234, + "grad_norm": 0.3408079743385315, + "learning_rate": 2.2215384615384617e-06, + "loss": 0.0069, + "step": 15675 + }, + { + "epoch": 6.639881581729752, + "grad_norm": 11.103797912597656, + "learning_rate": 2.208717948717949e-06, + "loss": 0.0105, + "step": 15700 + }, + { + "epoch": 6.650454641573271, + "grad_norm": 0.554865300655365, + "learning_rate": 2.195897435897436e-06, + "loss": 0.0082, + "step": 15725 + }, + { + "epoch": 6.66102770141679, + "grad_norm": 0.32903775572776794, + "learning_rate": 2.1830769230769234e-06, + "loss": 0.0033, + "step": 15750 + }, + { + "epoch": 6.671600761260309, + "grad_norm": 0.2150058150291443, + "learning_rate": 2.1702564102564107e-06, + "loss": 0.0119, + "step": 15775 + }, + { + "epoch": 6.682173821103827, + "grad_norm": 0.253409743309021, + "learning_rate": 2.1574358974358975e-06, + "loss": 0.0046, + "step": 15800 + }, + { + "epoch": 6.692746880947346, + "grad_norm": 2.6021835803985596, + "learning_rate": 2.1446153846153848e-06, + "loss": 0.0077, + "step": 15825 + }, + { + "epoch": 6.703319940790865, + "grad_norm": 1.3991787433624268, + "learning_rate": 2.131794871794872e-06, + "loss": 0.0072, + "step": 15850 + }, + { + "epoch": 6.713893000634384, + "grad_norm": 0.40495139360427856, + "learning_rate": 2.1189743589743593e-06, + "loss": 0.013, + "step": 15875 + }, + { + "epoch": 6.724466060477902, + "grad_norm": 0.28818759322166443, + "learning_rate": 2.1061538461538465e-06, + "loss": 0.0065, + "step": 15900 + }, + { + "epoch": 6.735039120321421, + "grad_norm": 0.6223933696746826, + "learning_rate": 2.0933333333333338e-06, + "loss": 0.0057, + "step": 15925 + }, + { + "epoch": 6.745612180164939, + "grad_norm": 0.24611905217170715, + "learning_rate": 2.0805128205128206e-06, + "loss": 0.0049, + "step": 15950 + }, + { + "epoch": 6.756185240008459, + "grad_norm": 0.6771820783615112, + "learning_rate": 2.067692307692308e-06, + "loss": 0.0048, + "step": 15975 + }, + { + "epoch": 6.766758299851977, + "grad_norm": 14.729320526123047, + "learning_rate": 2.054871794871795e-06, + "loss": 0.0066, + "step": 16000 + }, + { + "epoch": 6.766758299851977, + "eval_loss": 0.14149095118045807, + "eval_runtime": 479.8155, + "eval_samples_per_second": 7.588, + "eval_steps_per_second": 0.95, + "eval_wer": 0.11842776927003573, + "step": 16000 + }, + { + "epoch": 6.777331359695496, + "grad_norm": 0.3452744781970978, + "learning_rate": 2.0420512820512823e-06, + "loss": 0.0052, + "step": 16025 + }, + { + "epoch": 6.787904419539014, + "grad_norm": 0.376597136259079, + "learning_rate": 2.0292307692307696e-06, + "loss": 0.0044, + "step": 16050 + }, + { + "epoch": 6.798477479382534, + "grad_norm": 0.2520189583301544, + "learning_rate": 2.016410256410257e-06, + "loss": 0.0088, + "step": 16075 + }, + { + "epoch": 6.809050539226052, + "grad_norm": 0.19255167245864868, + "learning_rate": 2.0035897435897436e-06, + "loss": 0.0046, + "step": 16100 + }, + { + "epoch": 6.819623599069571, + "grad_norm": 0.38311922550201416, + "learning_rate": 1.990769230769231e-06, + "loss": 0.0028, + "step": 16125 + }, + { + "epoch": 6.830196658913089, + "grad_norm": 0.2503233850002289, + "learning_rate": 1.977948717948718e-06, + "loss": 0.0038, + "step": 16150 + }, + { + "epoch": 6.840769718756608, + "grad_norm": 0.8586848378181458, + "learning_rate": 1.9651282051282054e-06, + "loss": 0.0052, + "step": 16175 + }, + { + "epoch": 6.851342778600127, + "grad_norm": 0.256672203540802, + "learning_rate": 1.9523076923076926e-06, + "loss": 0.0031, + "step": 16200 + }, + { + "epoch": 6.861915838443646, + "grad_norm": 0.3382306396961212, + "learning_rate": 1.93948717948718e-06, + "loss": 0.0072, + "step": 16225 + }, + { + "epoch": 6.872488898287164, + "grad_norm": 1.6218035221099854, + "learning_rate": 1.926666666666667e-06, + "loss": 0.0071, + "step": 16250 + }, + { + "epoch": 6.883061958130683, + "grad_norm": 0.33794450759887695, + "learning_rate": 1.913846153846154e-06, + "loss": 0.0062, + "step": 16275 + }, + { + "epoch": 6.893635017974201, + "grad_norm": 0.20873773097991943, + "learning_rate": 1.9010256410256412e-06, + "loss": 0.0047, + "step": 16300 + }, + { + "epoch": 6.904208077817721, + "grad_norm": 0.2936864495277405, + "learning_rate": 1.8882051282051285e-06, + "loss": 0.0034, + "step": 16325 + }, + { + "epoch": 6.914781137661239, + "grad_norm": 0.5449936389923096, + "learning_rate": 1.8753846153846155e-06, + "loss": 0.0065, + "step": 16350 + }, + { + "epoch": 6.925354197504758, + "grad_norm": 0.29695820808410645, + "learning_rate": 1.8625641025641027e-06, + "loss": 0.0125, + "step": 16375 + }, + { + "epoch": 6.935927257348276, + "grad_norm": 0.4382512867450714, + "learning_rate": 1.84974358974359e-06, + "loss": 0.0058, + "step": 16400 + }, + { + "epoch": 6.946500317191795, + "grad_norm": 0.5371158719062805, + "learning_rate": 1.836923076923077e-06, + "loss": 0.0036, + "step": 16425 + }, + { + "epoch": 6.957073377035314, + "grad_norm": 0.21529348194599152, + "learning_rate": 1.8241025641025643e-06, + "loss": 0.0042, + "step": 16450 + }, + { + "epoch": 6.967646436878833, + "grad_norm": 0.25407838821411133, + "learning_rate": 1.8112820512820515e-06, + "loss": 0.0048, + "step": 16475 + }, + { + "epoch": 6.978219496722351, + "grad_norm": 0.41775551438331604, + "learning_rate": 1.7984615384615386e-06, + "loss": 0.0042, + "step": 16500 + }, + { + "epoch": 6.98879255656587, + "grad_norm": 0.13575445115566254, + "learning_rate": 1.7856410256410258e-06, + "loss": 0.0057, + "step": 16525 + }, + { + "epoch": 6.9993656164093885, + "grad_norm": 0.21849684417247772, + "learning_rate": 1.772820512820513e-06, + "loss": 0.0065, + "step": 16550 + }, + { + "epoch": 7.009938676252908, + "grad_norm": 0.17400676012039185, + "learning_rate": 1.76e-06, + "loss": 0.0069, + "step": 16575 + }, + { + "epoch": 7.020511736096426, + "grad_norm": 0.2058832198381424, + "learning_rate": 1.7471794871794873e-06, + "loss": 0.005, + "step": 16600 + }, + { + "epoch": 7.031084795939945, + "grad_norm": 0.14491191506385803, + "learning_rate": 1.7343589743589746e-06, + "loss": 0.0026, + "step": 16625 + }, + { + "epoch": 7.0416578557834635, + "grad_norm": 2.8785014152526855, + "learning_rate": 1.7215384615384616e-06, + "loss": 0.0053, + "step": 16650 + }, + { + "epoch": 7.052230915626982, + "grad_norm": 0.17464618384838104, + "learning_rate": 1.7087179487179489e-06, + "loss": 0.0024, + "step": 16675 + }, + { + "epoch": 7.062803975470501, + "grad_norm": 0.17902244627475739, + "learning_rate": 1.6958974358974361e-06, + "loss": 0.004, + "step": 16700 + }, + { + "epoch": 7.07337703531402, + "grad_norm": 1.190285563468933, + "learning_rate": 1.6830769230769232e-06, + "loss": 0.0023, + "step": 16725 + }, + { + "epoch": 7.0839500951575385, + "grad_norm": 0.24982137978076935, + "learning_rate": 1.6702564102564104e-06, + "loss": 0.0029, + "step": 16750 + }, + { + "epoch": 7.094523155001057, + "grad_norm": 0.21240869164466858, + "learning_rate": 1.6574358974358976e-06, + "loss": 0.0034, + "step": 16775 + }, + { + "epoch": 7.105096214844576, + "grad_norm": 0.22856955230236053, + "learning_rate": 1.6446153846153847e-06, + "loss": 0.0028, + "step": 16800 + }, + { + "epoch": 7.115669274688095, + "grad_norm": 0.2120029181241989, + "learning_rate": 1.631794871794872e-06, + "loss": 0.0026, + "step": 16825 + }, + { + "epoch": 7.1262423345316135, + "grad_norm": 0.10486655682325363, + "learning_rate": 1.6189743589743592e-06, + "loss": 0.0063, + "step": 16850 + }, + { + "epoch": 7.136815394375132, + "grad_norm": 0.1727411448955536, + "learning_rate": 1.6061538461538462e-06, + "loss": 0.0058, + "step": 16875 + }, + { + "epoch": 7.1473884542186505, + "grad_norm": 0.2189004272222519, + "learning_rate": 1.5933333333333335e-06, + "loss": 0.0043, + "step": 16900 + }, + { + "epoch": 7.15796151406217, + "grad_norm": 0.224544957280159, + "learning_rate": 1.5805128205128207e-06, + "loss": 0.005, + "step": 16925 + }, + { + "epoch": 7.1685345739056885, + "grad_norm": 0.17426727712154388, + "learning_rate": 1.5676923076923078e-06, + "loss": 0.0039, + "step": 16950 + }, + { + "epoch": 7.179107633749207, + "grad_norm": 0.1636071801185608, + "learning_rate": 1.554871794871795e-06, + "loss": 0.0027, + "step": 16975 + }, + { + "epoch": 7.1896806935927255, + "grad_norm": 0.13778026401996613, + "learning_rate": 1.5420512820512822e-06, + "loss": 0.0029, + "step": 17000 + }, + { + "epoch": 7.1896806935927255, + "eval_loss": 0.14258068799972534, + "eval_runtime": 481.0853, + "eval_samples_per_second": 7.568, + "eval_steps_per_second": 0.948, + "eval_wer": 0.11903104552415425, + "step": 17000 + }, + { + "epoch": 7.200253753436244, + "grad_norm": 0.1793927550315857, + "learning_rate": 1.5292307692307693e-06, + "loss": 0.0054, + "step": 17025 + }, + { + "epoch": 7.2108268132797635, + "grad_norm": 0.17130379378795624, + "learning_rate": 1.5164102564102565e-06, + "loss": 0.0036, + "step": 17050 + }, + { + "epoch": 7.221399873123282, + "grad_norm": 0.20788300037384033, + "learning_rate": 1.5035897435897438e-06, + "loss": 0.0023, + "step": 17075 + }, + { + "epoch": 7.2319729329668005, + "grad_norm": 0.28084486722946167, + "learning_rate": 1.4907692307692308e-06, + "loss": 0.0027, + "step": 17100 + }, + { + "epoch": 7.242545992810319, + "grad_norm": 0.1289544254541397, + "learning_rate": 1.477948717948718e-06, + "loss": 0.0074, + "step": 17125 + }, + { + "epoch": 7.253119052653838, + "grad_norm": 0.14958246052265167, + "learning_rate": 1.4651282051282053e-06, + "loss": 0.0063, + "step": 17150 + }, + { + "epoch": 7.263692112497357, + "grad_norm": 0.7993505001068115, + "learning_rate": 1.4523076923076923e-06, + "loss": 0.0069, + "step": 17175 + }, + { + "epoch": 7.2742651723408756, + "grad_norm": 0.30847251415252686, + "learning_rate": 1.4394871794871796e-06, + "loss": 0.0072, + "step": 17200 + }, + { + "epoch": 7.284838232184394, + "grad_norm": 0.17055633664131165, + "learning_rate": 1.4266666666666668e-06, + "loss": 0.0024, + "step": 17225 + }, + { + "epoch": 7.295411292027913, + "grad_norm": 0.13648909330368042, + "learning_rate": 1.4138461538461539e-06, + "loss": 0.0022, + "step": 17250 + }, + { + "epoch": 7.305984351871432, + "grad_norm": 0.17881402373313904, + "learning_rate": 1.4010256410256411e-06, + "loss": 0.0033, + "step": 17275 + }, + { + "epoch": 7.3165574117149506, + "grad_norm": 0.11867067217826843, + "learning_rate": 1.3882051282051284e-06, + "loss": 0.0034, + "step": 17300 + }, + { + "epoch": 7.327130471558469, + "grad_norm": 0.1998029202222824, + "learning_rate": 1.3753846153846154e-06, + "loss": 0.0026, + "step": 17325 + }, + { + "epoch": 7.337703531401988, + "grad_norm": 0.19812311232089996, + "learning_rate": 1.3625641025641027e-06, + "loss": 0.0022, + "step": 17350 + }, + { + "epoch": 7.348276591245506, + "grad_norm": 0.20121921598911285, + "learning_rate": 1.34974358974359e-06, + "loss": 0.0025, + "step": 17375 + }, + { + "epoch": 7.358849651089026, + "grad_norm": 0.11519061774015427, + "learning_rate": 1.336923076923077e-06, + "loss": 0.0038, + "step": 17400 + }, + { + "epoch": 7.369422710932544, + "grad_norm": 0.2508073151111603, + "learning_rate": 1.3241025641025642e-06, + "loss": 0.0033, + "step": 17425 + }, + { + "epoch": 7.379995770776063, + "grad_norm": 0.1964157670736313, + "learning_rate": 1.3112820512820514e-06, + "loss": 0.003, + "step": 17450 + }, + { + "epoch": 7.390568830619581, + "grad_norm": 0.1781347244977951, + "learning_rate": 1.2984615384615385e-06, + "loss": 0.0111, + "step": 17475 + }, + { + "epoch": 7.4011418904631, + "grad_norm": 2.567892551422119, + "learning_rate": 1.2856410256410257e-06, + "loss": 0.0045, + "step": 17500 + }, + { + "epoch": 7.411714950306619, + "grad_norm": 4.157364368438721, + "learning_rate": 1.272820512820513e-06, + "loss": 0.0084, + "step": 17525 + }, + { + "epoch": 7.422288010150138, + "grad_norm": 0.1634252667427063, + "learning_rate": 1.26e-06, + "loss": 0.0045, + "step": 17550 + }, + { + "epoch": 7.432861069993656, + "grad_norm": 0.5479081273078918, + "learning_rate": 1.2471794871794873e-06, + "loss": 0.0075, + "step": 17575 + }, + { + "epoch": 7.443434129837175, + "grad_norm": 0.1536071002483368, + "learning_rate": 1.2343589743589745e-06, + "loss": 0.0038, + "step": 17600 + }, + { + "epoch": 7.454007189680693, + "grad_norm": 2.710284948348999, + "learning_rate": 1.2215384615384618e-06, + "loss": 0.0055, + "step": 17625 + }, + { + "epoch": 7.464580249524213, + "grad_norm": 0.19599756598472595, + "learning_rate": 1.2087179487179488e-06, + "loss": 0.0043, + "step": 17650 + }, + { + "epoch": 7.475153309367731, + "grad_norm": 0.1853959560394287, + "learning_rate": 1.195897435897436e-06, + "loss": 0.0038, + "step": 17675 + }, + { + "epoch": 7.48572636921125, + "grad_norm": 0.223322793841362, + "learning_rate": 1.1830769230769233e-06, + "loss": 0.0073, + "step": 17700 + }, + { + "epoch": 7.496299429054768, + "grad_norm": 0.33646872639656067, + "learning_rate": 1.1702564102564103e-06, + "loss": 0.0044, + "step": 17725 + }, + { + "epoch": 7.506872488898287, + "grad_norm": 0.24336710572242737, + "learning_rate": 1.1574358974358976e-06, + "loss": 0.0034, + "step": 17750 + }, + { + "epoch": 7.517445548741806, + "grad_norm": 0.16662658751010895, + "learning_rate": 1.1446153846153848e-06, + "loss": 0.0041, + "step": 17775 + }, + { + "epoch": 7.528018608585325, + "grad_norm": 0.1463777869939804, + "learning_rate": 1.1317948717948719e-06, + "loss": 0.0128, + "step": 17800 + }, + { + "epoch": 7.538591668428843, + "grad_norm": 0.26682642102241516, + "learning_rate": 1.118974358974359e-06, + "loss": 0.0035, + "step": 17825 + }, + { + "epoch": 7.549164728272362, + "grad_norm": 0.2606651782989502, + "learning_rate": 1.1061538461538463e-06, + "loss": 0.0038, + "step": 17850 + }, + { + "epoch": 7.55973778811588, + "grad_norm": 0.24456697702407837, + "learning_rate": 1.0933333333333334e-06, + "loss": 0.0069, + "step": 17875 + }, + { + "epoch": 7.5703108479594, + "grad_norm": 0.9612045884132385, + "learning_rate": 1.0805128205128206e-06, + "loss": 0.0047, + "step": 17900 + }, + { + "epoch": 7.580883907802918, + "grad_norm": 0.20409923791885376, + "learning_rate": 1.0676923076923079e-06, + "loss": 0.0019, + "step": 17925 + }, + { + "epoch": 7.591456967646437, + "grad_norm": 0.14264066517353058, + "learning_rate": 1.054871794871795e-06, + "loss": 0.0033, + "step": 17950 + }, + { + "epoch": 7.602030027489955, + "grad_norm": 0.43361735343933105, + "learning_rate": 1.0420512820512822e-06, + "loss": 0.0022, + "step": 17975 + }, + { + "epoch": 7.612603087333475, + "grad_norm": 0.25520941615104675, + "learning_rate": 1.0292307692307694e-06, + "loss": 0.0024, + "step": 18000 + }, + { + "epoch": 7.612603087333475, + "eval_loss": 0.14294278621673584, + "eval_runtime": 482.1709, + "eval_samples_per_second": 7.551, + "eval_steps_per_second": 0.946, + "eval_wer": 0.11782449301591721, + "step": 18000 + }, + { + "epoch": 7.623176147176993, + "grad_norm": 0.2945314943790436, + "learning_rate": 1.0164102564102564e-06, + "loss": 0.0025, + "step": 18025 + }, + { + "epoch": 7.633749207020512, + "grad_norm": 0.1885748952627182, + "learning_rate": 1.0035897435897437e-06, + "loss": 0.0031, + "step": 18050 + }, + { + "epoch": 7.64432226686403, + "grad_norm": 0.16705763339996338, + "learning_rate": 9.90769230769231e-07, + "loss": 0.0081, + "step": 18075 + }, + { + "epoch": 7.654895326707549, + "grad_norm": 0.1523977816104889, + "learning_rate": 9.77948717948718e-07, + "loss": 0.0028, + "step": 18100 + }, + { + "epoch": 7.665468386551068, + "grad_norm": 0.2194071263074875, + "learning_rate": 9.651282051282052e-07, + "loss": 0.0027, + "step": 18125 + }, + { + "epoch": 7.676041446394587, + "grad_norm": 0.23766735196113586, + "learning_rate": 9.523076923076924e-07, + "loss": 0.0031, + "step": 18150 + }, + { + "epoch": 7.686614506238105, + "grad_norm": 0.43536919355392456, + "learning_rate": 9.394871794871796e-07, + "loss": 0.0059, + "step": 18175 + }, + { + "epoch": 7.697187566081624, + "grad_norm": 0.20056919753551483, + "learning_rate": 9.266666666666667e-07, + "loss": 0.0027, + "step": 18200 + }, + { + "epoch": 7.707760625925143, + "grad_norm": 0.267135888338089, + "learning_rate": 9.138461538461539e-07, + "loss": 0.0046, + "step": 18225 + }, + { + "epoch": 7.718333685768662, + "grad_norm": 0.12573710083961487, + "learning_rate": 9.010256410256411e-07, + "loss": 0.0043, + "step": 18250 + }, + { + "epoch": 7.72890674561218, + "grad_norm": 0.12320298701524734, + "learning_rate": 8.882051282051282e-07, + "loss": 0.0047, + "step": 18275 + }, + { + "epoch": 7.739479805455699, + "grad_norm": 0.18382315337657928, + "learning_rate": 8.753846153846154e-07, + "loss": 0.0029, + "step": 18300 + }, + { + "epoch": 7.750052865299217, + "grad_norm": 1.264244794845581, + "learning_rate": 8.625641025641027e-07, + "loss": 0.0029, + "step": 18325 + }, + { + "epoch": 7.760625925142737, + "grad_norm": 0.31755584478378296, + "learning_rate": 8.497435897435897e-07, + "loss": 0.0023, + "step": 18350 + }, + { + "epoch": 7.771198984986255, + "grad_norm": 0.14317964017391205, + "learning_rate": 8.36923076923077e-07, + "loss": 0.0037, + "step": 18375 + }, + { + "epoch": 7.781772044829774, + "grad_norm": 0.1884177327156067, + "learning_rate": 8.241025641025642e-07, + "loss": 0.0022, + "step": 18400 + }, + { + "epoch": 7.792345104673292, + "grad_norm": 0.24375373125076294, + "learning_rate": 8.112820512820512e-07, + "loss": 0.0048, + "step": 18425 + }, + { + "epoch": 7.802918164516811, + "grad_norm": 0.2112288922071457, + "learning_rate": 7.984615384615385e-07, + "loss": 0.0038, + "step": 18450 + }, + { + "epoch": 7.81349122436033, + "grad_norm": 0.21161673963069916, + "learning_rate": 7.856410256410257e-07, + "loss": 0.0036, + "step": 18475 + }, + { + "epoch": 7.824064284203849, + "grad_norm": 0.17189516127109528, + "learning_rate": 7.728205128205128e-07, + "loss": 0.0029, + "step": 18500 + }, + { + "epoch": 7.834637344047367, + "grad_norm": 0.28677284717559814, + "learning_rate": 7.6e-07, + "loss": 0.0069, + "step": 18525 + }, + { + "epoch": 7.845210403890886, + "grad_norm": 0.5086202025413513, + "learning_rate": 7.471794871794873e-07, + "loss": 0.0024, + "step": 18550 + }, + { + "epoch": 7.855783463734404, + "grad_norm": 0.20552664995193481, + "learning_rate": 7.343589743589743e-07, + "loss": 0.0064, + "step": 18575 + }, + { + "epoch": 7.866356523577924, + "grad_norm": 0.17549338936805725, + "learning_rate": 7.215384615384616e-07, + "loss": 0.0026, + "step": 18600 + }, + { + "epoch": 7.876929583421442, + "grad_norm": 0.28019997477531433, + "learning_rate": 7.087179487179488e-07, + "loss": 0.0037, + "step": 18625 + }, + { + "epoch": 7.887502643264961, + "grad_norm": 0.1256396323442459, + "learning_rate": 6.958974358974358e-07, + "loss": 0.0045, + "step": 18650 + }, + { + "epoch": 7.898075703108479, + "grad_norm": 0.3035779893398285, + "learning_rate": 6.830769230769231e-07, + "loss": 0.0047, + "step": 18675 + }, + { + "epoch": 7.908648762951998, + "grad_norm": 0.2889888882637024, + "learning_rate": 6.702564102564103e-07, + "loss": 0.0045, + "step": 18700 + }, + { + "epoch": 7.919221822795517, + "grad_norm": 0.17718684673309326, + "learning_rate": 6.574358974358976e-07, + "loss": 0.0028, + "step": 18725 + }, + { + "epoch": 7.929794882639036, + "grad_norm": 0.16636326909065247, + "learning_rate": 6.446153846153846e-07, + "loss": 0.0023, + "step": 18750 + }, + { + "epoch": 7.940367942482554, + "grad_norm": 0.21917343139648438, + "learning_rate": 6.317948717948719e-07, + "loss": 0.0066, + "step": 18775 + }, + { + "epoch": 7.950941002326073, + "grad_norm": 0.1804770529270172, + "learning_rate": 6.18974358974359e-07, + "loss": 0.0022, + "step": 18800 + }, + { + "epoch": 7.9615140621695915, + "grad_norm": 0.15297465026378632, + "learning_rate": 6.061538461538462e-07, + "loss": 0.0025, + "step": 18825 + }, + { + "epoch": 7.972087122013111, + "grad_norm": 2.2187414169311523, + "learning_rate": 5.933333333333334e-07, + "loss": 0.0036, + "step": 18850 + }, + { + "epoch": 7.982660181856629, + "grad_norm": 0.14911529421806335, + "learning_rate": 5.805128205128205e-07, + "loss": 0.0035, + "step": 18875 + }, + { + "epoch": 7.993233241700148, + "grad_norm": 0.3315187692642212, + "learning_rate": 5.676923076923077e-07, + "loss": 0.0031, + "step": 18900 + }, + { + "epoch": 8.003806301543667, + "grad_norm": 0.2590886950492859, + "learning_rate": 5.548717948717949e-07, + "loss": 0.0037, + "step": 18925 + }, + { + "epoch": 8.014379361387185, + "grad_norm": 0.15489330887794495, + "learning_rate": 5.420512820512821e-07, + "loss": 0.0018, + "step": 18950 + }, + { + "epoch": 8.024952421230704, + "grad_norm": 0.18797287344932556, + "learning_rate": 5.292307692307692e-07, + "loss": 0.0018, + "step": 18975 + }, + { + "epoch": 8.035525481074222, + "grad_norm": 0.1324182152748108, + "learning_rate": 5.164102564102565e-07, + "loss": 0.0021, + "step": 19000 + }, + { + "epoch": 8.035525481074222, + "eval_loss": 0.1434488594532013, + "eval_runtime": 479.0168, + "eval_samples_per_second": 7.601, + "eval_steps_per_second": 0.952, + "eval_wer": 0.1180101164787229, + "step": 19000 + }, + { + "epoch": 8.046098540917741, + "grad_norm": 0.15593498945236206, + "learning_rate": 5.035897435897436e-07, + "loss": 0.0018, + "step": 19025 + }, + { + "epoch": 8.05667160076126, + "grad_norm": 0.14166687428951263, + "learning_rate": 4.907692307692308e-07, + "loss": 0.0025, + "step": 19050 + }, + { + "epoch": 8.067244660604779, + "grad_norm": 0.06104918569326401, + "learning_rate": 4.77948717948718e-07, + "loss": 0.0018, + "step": 19075 + }, + { + "epoch": 8.077817720448298, + "grad_norm": 0.11749964207410812, + "learning_rate": 4.6512820512820514e-07, + "loss": 0.0031, + "step": 19100 + }, + { + "epoch": 8.088390780291816, + "grad_norm": 0.09838801622390747, + "learning_rate": 4.523076923076924e-07, + "loss": 0.0031, + "step": 19125 + }, + { + "epoch": 8.098963840135335, + "grad_norm": 0.1402641385793686, + "learning_rate": 4.3948717948717953e-07, + "loss": 0.0027, + "step": 19150 + }, + { + "epoch": 8.109536899978854, + "grad_norm": 0.16856901347637177, + "learning_rate": 4.266666666666667e-07, + "loss": 0.0023, + "step": 19175 + }, + { + "epoch": 8.120109959822372, + "grad_norm": 0.12997014820575714, + "learning_rate": 4.138461538461539e-07, + "loss": 0.0058, + "step": 19200 + }, + { + "epoch": 8.130683019665891, + "grad_norm": 0.14169169962406158, + "learning_rate": 4.0102564102564107e-07, + "loss": 0.0036, + "step": 19225 + }, + { + "epoch": 8.14125607950941, + "grad_norm": 0.09970966726541519, + "learning_rate": 3.882051282051282e-07, + "loss": 0.0022, + "step": 19250 + }, + { + "epoch": 8.151829139352929, + "grad_norm": 1.0204663276672363, + "learning_rate": 3.7538461538461546e-07, + "loss": 0.0023, + "step": 19275 + }, + { + "epoch": 8.162402199196448, + "grad_norm": 0.24392949044704437, + "learning_rate": 3.625641025641026e-07, + "loss": 0.0029, + "step": 19300 + }, + { + "epoch": 8.172975259039966, + "grad_norm": 0.12695328891277313, + "learning_rate": 3.4974358974358974e-07, + "loss": 0.0035, + "step": 19325 + }, + { + "epoch": 8.183548318883485, + "grad_norm": 0.15973243117332458, + "learning_rate": 3.36923076923077e-07, + "loss": 0.0019, + "step": 19350 + }, + { + "epoch": 8.194121378727004, + "grad_norm": 0.2357715517282486, + "learning_rate": 3.2410256410256413e-07, + "loss": 0.0019, + "step": 19375 + }, + { + "epoch": 8.204694438570522, + "grad_norm": 0.15530213713645935, + "learning_rate": 3.112820512820513e-07, + "loss": 0.0019, + "step": 19400 + }, + { + "epoch": 8.215267498414041, + "grad_norm": 3.935415267944336, + "learning_rate": 2.9846153846153847e-07, + "loss": 0.0065, + "step": 19425 + }, + { + "epoch": 8.225840558257559, + "grad_norm": 3.079641580581665, + "learning_rate": 2.8564102564102566e-07, + "loss": 0.004, + "step": 19450 + }, + { + "epoch": 8.236413618101079, + "grad_norm": 0.1831885129213333, + "learning_rate": 2.7282051282051286e-07, + "loss": 0.0018, + "step": 19475 + }, + { + "epoch": 8.246986677944598, + "grad_norm": 0.11785794049501419, + "learning_rate": 2.6e-07, + "loss": 0.0018, + "step": 19500 + }, + { + "epoch": 8.257559737788116, + "grad_norm": 0.3607770800590515, + "learning_rate": 2.471794871794872e-07, + "loss": 0.0032, + "step": 19525 + }, + { + "epoch": 8.268132797631635, + "grad_norm": 0.2167283445596695, + "learning_rate": 2.343589743589744e-07, + "loss": 0.0019, + "step": 19550 + }, + { + "epoch": 8.278705857475153, + "grad_norm": 0.1869659721851349, + "learning_rate": 2.2153846153846153e-07, + "loss": 0.0021, + "step": 19575 + }, + { + "epoch": 8.289278917318672, + "grad_norm": 0.16754354536533356, + "learning_rate": 2.0871794871794873e-07, + "loss": 0.002, + "step": 19600 + }, + { + "epoch": 8.299851977162191, + "grad_norm": 0.8075239658355713, + "learning_rate": 1.9589743589743592e-07, + "loss": 0.003, + "step": 19625 + }, + { + "epoch": 8.31042503700571, + "grad_norm": 2.266139507293701, + "learning_rate": 1.8307692307692306e-07, + "loss": 0.0065, + "step": 19650 + }, + { + "epoch": 8.320998096849229, + "grad_norm": 0.1474863439798355, + "learning_rate": 1.7025641025641026e-07, + "loss": 0.0056, + "step": 19675 + }, + { + "epoch": 8.331571156692746, + "grad_norm": 0.1871362030506134, + "learning_rate": 1.5743589743589745e-07, + "loss": 0.0047, + "step": 19700 + }, + { + "epoch": 8.342144216536266, + "grad_norm": 0.14866340160369873, + "learning_rate": 1.4461538461538462e-07, + "loss": 0.0037, + "step": 19725 + }, + { + "epoch": 8.352717276379785, + "grad_norm": 0.15600642561912537, + "learning_rate": 1.317948717948718e-07, + "loss": 0.0038, + "step": 19750 + }, + { + "epoch": 8.363290336223303, + "grad_norm": 0.18436697125434875, + "learning_rate": 1.1897435897435898e-07, + "loss": 0.0024, + "step": 19775 + }, + { + "epoch": 8.373863396066822, + "grad_norm": 0.12687981128692627, + "learning_rate": 1.0615384615384615e-07, + "loss": 0.0018, + "step": 19800 + }, + { + "epoch": 8.38443645591034, + "grad_norm": 0.11940109729766846, + "learning_rate": 9.333333333333335e-08, + "loss": 0.0025, + "step": 19825 + }, + { + "epoch": 8.39500951575386, + "grad_norm": 0.16202621161937714, + "learning_rate": 8.051282051282052e-08, + "loss": 0.0028, + "step": 19850 + }, + { + "epoch": 8.405582575597379, + "grad_norm": 0.09489905834197998, + "learning_rate": 6.76923076923077e-08, + "loss": 0.0053, + "step": 19875 + }, + { + "epoch": 8.416155635440896, + "grad_norm": 1.8258126974105835, + "learning_rate": 5.4871794871794874e-08, + "loss": 0.0029, + "step": 19900 + }, + { + "epoch": 8.426728695284416, + "grad_norm": 0.18347540497779846, + "learning_rate": 4.2051282051282056e-08, + "loss": 0.002, + "step": 19925 + }, + { + "epoch": 8.437301755127933, + "grad_norm": 0.19024056196212769, + "learning_rate": 2.9230769230769234e-08, + "loss": 0.0029, + "step": 19950 + }, + { + "epoch": 8.447874814971453, + "grad_norm": 0.21589778363704681, + "learning_rate": 1.641025641025641e-08, + "loss": 0.0031, + "step": 19975 + }, + { + "epoch": 8.458447874814972, + "grad_norm": 0.12975341081619263, + "learning_rate": 3.5897435897435903e-09, + "loss": 0.0018, + "step": 20000 + }, + { + "epoch": 8.458447874814972, + "eval_loss": 0.1440751701593399, + "eval_runtime": 481.615, + "eval_samples_per_second": 7.56, + "eval_steps_per_second": 0.947, + "eval_wer": 0.1183813634043343, + "step": 20000 + }, + { + "epoch": 8.458447874814972, + "step": 20000, + "total_flos": 2.07526043713536e+19, + "train_loss": 0.09297830064073205, + "train_runtime": 50820.1517, + "train_samples_per_second": 6.297, + "train_steps_per_second": 0.394 + } + ], + "logging_steps": 25, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.07526043713536e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}