diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,37913 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6187745924676786, + "eval_steps": 500, + "global_step": 5400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00029979389169945664, + "grad_norm": 4.617544174194336, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.26, + "step": 1 + }, + { + "epoch": 0.0005995877833989133, + "grad_norm": 5.574990272521973, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.2193, + "step": 2 + }, + { + "epoch": 0.0008993816750983699, + "grad_norm": 2.685803174972534, + "learning_rate": 8.999999999999999e-05, + "loss": 1.2098, + "step": 3 + }, + { + "epoch": 0.0011991755667978266, + "grad_norm": 1.2137216329574585, + "learning_rate": 0.00011999999999999999, + "loss": 1.0505, + "step": 4 + }, + { + "epoch": 0.001498969458497283, + "grad_norm": 1.1230342388153076, + "learning_rate": 0.00015, + "loss": 0.8918, + "step": 5 + }, + { + "epoch": 0.0017987633501967398, + "grad_norm": 0.9145472049713135, + "learning_rate": 0.00017999999999999998, + "loss": 0.8306, + "step": 6 + }, + { + "epoch": 0.0020985572418961962, + "grad_norm": 1.5094902515411377, + "learning_rate": 0.00020999999999999998, + "loss": 0.788, + "step": 7 + }, + { + "epoch": 0.002398351133595653, + "grad_norm": 0.5805755257606506, + "learning_rate": 0.00023999999999999998, + "loss": 0.8026, + "step": 8 + }, + { + "epoch": 0.0026981450252951096, + "grad_norm": 0.338878333568573, + "learning_rate": 0.00027, + "loss": 0.723, + "step": 9 + }, + { + "epoch": 0.002997938916994566, + "grad_norm": 0.34122711420059204, + "learning_rate": 0.0003, + "loss": 0.705, + "step": 10 + }, + { + "epoch": 0.003297732808694023, + "grad_norm": 0.3165666162967682, + "learning_rate": 0.0002999549549549549, + "loss": 0.7282, + "step": 11 + }, + { + "epoch": 0.0035975267003934795, + "grad_norm": 0.3415158689022064, + "learning_rate": 0.0002999099099099099, + "loss": 0.6833, + "step": 12 + }, + { + "epoch": 0.003897320592092936, + "grad_norm": 0.3431508243083954, + "learning_rate": 0.00029986486486486484, + "loss": 0.713, + "step": 13 + }, + { + "epoch": 0.0041971144837923925, + "grad_norm": 0.3494497239589691, + "learning_rate": 0.0002998198198198198, + "loss": 0.6822, + "step": 14 + }, + { + "epoch": 0.004496908375491849, + "grad_norm": 0.29291579127311707, + "learning_rate": 0.00029977477477477477, + "loss": 0.6348, + "step": 15 + }, + { + "epoch": 0.004796702267191306, + "grad_norm": 0.3061606287956238, + "learning_rate": 0.0002997297297297297, + "loss": 0.6584, + "step": 16 + }, + { + "epoch": 0.005096496158890762, + "grad_norm": 0.32851532101631165, + "learning_rate": 0.00029968468468468464, + "loss": 0.6792, + "step": 17 + }, + { + "epoch": 0.005396290050590219, + "grad_norm": 0.3087822198867798, + "learning_rate": 0.00029963963963963963, + "loss": 0.686, + "step": 18 + }, + { + "epoch": 0.005696083942289676, + "grad_norm": 0.2965148687362671, + "learning_rate": 0.00029959459459459457, + "loss": 0.6549, + "step": 19 + }, + { + "epoch": 0.005995877833989132, + "grad_norm": 0.32235461473464966, + "learning_rate": 0.0002995495495495495, + "loss": 0.6653, + "step": 20 + }, + { + "epoch": 0.006295671725688589, + "grad_norm": 0.29656529426574707, + "learning_rate": 0.0002995045045045045, + "loss": 0.6423, + "step": 21 + }, + { + "epoch": 0.006595465617388046, + "grad_norm": 0.3357860743999481, + "learning_rate": 0.00029945945945945943, + "loss": 0.6957, + "step": 22 + }, + { + "epoch": 0.006895259509087502, + "grad_norm": 0.28682708740234375, + "learning_rate": 0.00029941441441441437, + "loss": 0.6475, + "step": 23 + }, + { + "epoch": 0.007195053400786959, + "grad_norm": 0.29643768072128296, + "learning_rate": 0.0002993693693693693, + "loss": 0.6651, + "step": 24 + }, + { + "epoch": 0.007494847292486416, + "grad_norm": 0.2794937789440155, + "learning_rate": 0.0002993243243243243, + "loss": 0.6423, + "step": 25 + }, + { + "epoch": 0.007794641184185872, + "grad_norm": 0.3023228943347931, + "learning_rate": 0.00029927927927927923, + "loss": 0.6728, + "step": 26 + }, + { + "epoch": 0.008094435075885328, + "grad_norm": 0.27937522530555725, + "learning_rate": 0.00029923423423423417, + "loss": 0.6538, + "step": 27 + }, + { + "epoch": 0.008394228967584785, + "grad_norm": 0.2845768332481384, + "learning_rate": 0.00029918918918918916, + "loss": 0.6588, + "step": 28 + }, + { + "epoch": 0.008694022859284242, + "grad_norm": 0.2637081742286682, + "learning_rate": 0.0002991441441441441, + "loss": 0.6253, + "step": 29 + }, + { + "epoch": 0.008993816750983699, + "grad_norm": 0.25597694516181946, + "learning_rate": 0.00029909909909909903, + "loss": 0.5973, + "step": 30 + }, + { + "epoch": 0.009293610642683156, + "grad_norm": 0.28932246565818787, + "learning_rate": 0.000299054054054054, + "loss": 0.6607, + "step": 31 + }, + { + "epoch": 0.009593404534382613, + "grad_norm": 0.28410351276397705, + "learning_rate": 0.00029900900900900896, + "loss": 0.6496, + "step": 32 + }, + { + "epoch": 0.009893198426082068, + "grad_norm": 0.2708141803741455, + "learning_rate": 0.00029896396396396395, + "loss": 0.6161, + "step": 33 + }, + { + "epoch": 0.010192992317781525, + "grad_norm": 0.27171120047569275, + "learning_rate": 0.0002989189189189189, + "loss": 0.6179, + "step": 34 + }, + { + "epoch": 0.010492786209480982, + "grad_norm": 0.2806681990623474, + "learning_rate": 0.0002988738738738738, + "loss": 0.6111, + "step": 35 + }, + { + "epoch": 0.010792580101180439, + "grad_norm": 0.36722084879875183, + "learning_rate": 0.0002988288288288288, + "loss": 0.5778, + "step": 36 + }, + { + "epoch": 0.011092373992879895, + "grad_norm": 0.26182547211647034, + "learning_rate": 0.00029878378378378375, + "loss": 0.5941, + "step": 37 + }, + { + "epoch": 0.011392167884579352, + "grad_norm": 0.26753902435302734, + "learning_rate": 0.0002987387387387387, + "loss": 0.6019, + "step": 38 + }, + { + "epoch": 0.011691961776278808, + "grad_norm": 0.28038090467453003, + "learning_rate": 0.0002986936936936937, + "loss": 0.6379, + "step": 39 + }, + { + "epoch": 0.011991755667978264, + "grad_norm": 0.29290881752967834, + "learning_rate": 0.0002986486486486486, + "loss": 0.6743, + "step": 40 + }, + { + "epoch": 0.012291549559677721, + "grad_norm": 0.28465205430984497, + "learning_rate": 0.00029860360360360356, + "loss": 0.6214, + "step": 41 + }, + { + "epoch": 0.012591343451377178, + "grad_norm": 0.26730677485466003, + "learning_rate": 0.00029855855855855855, + "loss": 0.6085, + "step": 42 + }, + { + "epoch": 0.012891137343076635, + "grad_norm": 0.2801668643951416, + "learning_rate": 0.0002985135135135135, + "loss": 0.6541, + "step": 43 + }, + { + "epoch": 0.013190931234776092, + "grad_norm": 0.2741893529891968, + "learning_rate": 0.0002984684684684684, + "loss": 0.6102, + "step": 44 + }, + { + "epoch": 0.013490725126475547, + "grad_norm": 0.26284873485565186, + "learning_rate": 0.0002984234234234234, + "loss": 0.5762, + "step": 45 + }, + { + "epoch": 0.013790519018175004, + "grad_norm": 0.26464149355888367, + "learning_rate": 0.00029837837837837835, + "loss": 0.6238, + "step": 46 + }, + { + "epoch": 0.014090312909874461, + "grad_norm": 0.2674684226512909, + "learning_rate": 0.00029833333333333334, + "loss": 0.6342, + "step": 47 + }, + { + "epoch": 0.014390106801573918, + "grad_norm": 0.2591281533241272, + "learning_rate": 0.0002982882882882883, + "loss": 0.5993, + "step": 48 + }, + { + "epoch": 0.014689900693273375, + "grad_norm": 0.29611843824386597, + "learning_rate": 0.0002982432432432432, + "loss": 0.6376, + "step": 49 + }, + { + "epoch": 0.014989694584972832, + "grad_norm": 0.26478683948516846, + "learning_rate": 0.0002981981981981982, + "loss": 0.5935, + "step": 50 + }, + { + "epoch": 0.015289488476672287, + "grad_norm": 0.25142884254455566, + "learning_rate": 0.00029815315315315314, + "loss": 0.5522, + "step": 51 + }, + { + "epoch": 0.015589282368371744, + "grad_norm": 0.26863306760787964, + "learning_rate": 0.0002981081081081081, + "loss": 0.5808, + "step": 52 + }, + { + "epoch": 0.015889076260071203, + "grad_norm": 0.26126888394355774, + "learning_rate": 0.00029806306306306307, + "loss": 0.6101, + "step": 53 + }, + { + "epoch": 0.016188870151770656, + "grad_norm": 0.24878369271755219, + "learning_rate": 0.000298018018018018, + "loss": 0.5787, + "step": 54 + }, + { + "epoch": 0.016488664043470113, + "grad_norm": 0.2513170838356018, + "learning_rate": 0.00029797297297297294, + "loss": 0.5743, + "step": 55 + }, + { + "epoch": 0.01678845793516957, + "grad_norm": 0.26510271430015564, + "learning_rate": 0.00029792792792792793, + "loss": 0.6023, + "step": 56 + }, + { + "epoch": 0.017088251826869027, + "grad_norm": 0.2809905409812927, + "learning_rate": 0.00029788288288288287, + "loss": 0.6336, + "step": 57 + }, + { + "epoch": 0.017388045718568484, + "grad_norm": 0.250021755695343, + "learning_rate": 0.0002978378378378378, + "loss": 0.5385, + "step": 58 + }, + { + "epoch": 0.01768783961026794, + "grad_norm": 0.248112291097641, + "learning_rate": 0.0002977927927927928, + "loss": 0.5633, + "step": 59 + }, + { + "epoch": 0.017987633501967398, + "grad_norm": 0.2756728529930115, + "learning_rate": 0.00029774774774774773, + "loss": 0.5988, + "step": 60 + }, + { + "epoch": 0.018287427393666855, + "grad_norm": 0.26302388310432434, + "learning_rate": 0.00029770270270270267, + "loss": 0.5738, + "step": 61 + }, + { + "epoch": 0.01858722128536631, + "grad_norm": 0.2736607789993286, + "learning_rate": 0.00029765765765765766, + "loss": 0.5908, + "step": 62 + }, + { + "epoch": 0.01888701517706577, + "grad_norm": 0.26531675457954407, + "learning_rate": 0.0002976126126126126, + "loss": 0.6112, + "step": 63 + }, + { + "epoch": 0.019186809068765225, + "grad_norm": 0.252244234085083, + "learning_rate": 0.00029756756756756753, + "loss": 0.5921, + "step": 64 + }, + { + "epoch": 0.019486602960464682, + "grad_norm": 0.26548662781715393, + "learning_rate": 0.0002975225225225225, + "loss": 0.604, + "step": 65 + }, + { + "epoch": 0.019786396852164136, + "grad_norm": 0.2575795352458954, + "learning_rate": 0.00029747747747747746, + "loss": 0.564, + "step": 66 + }, + { + "epoch": 0.020086190743863593, + "grad_norm": 0.25535911321640015, + "learning_rate": 0.0002974324324324324, + "loss": 0.5962, + "step": 67 + }, + { + "epoch": 0.02038598463556305, + "grad_norm": 0.26686182618141174, + "learning_rate": 0.0002973873873873874, + "loss": 0.6268, + "step": 68 + }, + { + "epoch": 0.020685778527262506, + "grad_norm": 0.25553348660469055, + "learning_rate": 0.00029734234234234233, + "loss": 0.5583, + "step": 69 + }, + { + "epoch": 0.020985572418961963, + "grad_norm": 0.2537127435207367, + "learning_rate": 0.00029729729729729726, + "loss": 0.5734, + "step": 70 + }, + { + "epoch": 0.02128536631066142, + "grad_norm": 0.27586349844932556, + "learning_rate": 0.0002972522522522522, + "loss": 0.5667, + "step": 71 + }, + { + "epoch": 0.021585160202360877, + "grad_norm": 0.2614293694496155, + "learning_rate": 0.0002972072072072072, + "loss": 0.5831, + "step": 72 + }, + { + "epoch": 0.021884954094060334, + "grad_norm": 0.263944536447525, + "learning_rate": 0.00029716216216216213, + "loss": 0.5862, + "step": 73 + }, + { + "epoch": 0.02218474798575979, + "grad_norm": 0.24802720546722412, + "learning_rate": 0.00029711711711711707, + "loss": 0.5788, + "step": 74 + }, + { + "epoch": 0.022484541877459248, + "grad_norm": 0.2746400833129883, + "learning_rate": 0.00029707207207207206, + "loss": 0.591, + "step": 75 + }, + { + "epoch": 0.022784335769158705, + "grad_norm": 0.2553657591342926, + "learning_rate": 0.000297027027027027, + "loss": 0.5735, + "step": 76 + }, + { + "epoch": 0.02308412966085816, + "grad_norm": 0.2704835534095764, + "learning_rate": 0.00029698198198198193, + "loss": 0.5809, + "step": 77 + }, + { + "epoch": 0.023383923552557615, + "grad_norm": 0.2556537985801697, + "learning_rate": 0.0002969369369369369, + "loss": 0.5496, + "step": 78 + }, + { + "epoch": 0.023683717444257072, + "grad_norm": 0.2523523271083832, + "learning_rate": 0.00029689189189189186, + "loss": 0.5758, + "step": 79 + }, + { + "epoch": 0.02398351133595653, + "grad_norm": 0.25214141607284546, + "learning_rate": 0.0002968468468468468, + "loss": 0.5844, + "step": 80 + }, + { + "epoch": 0.024283305227655986, + "grad_norm": 0.2590884268283844, + "learning_rate": 0.0002968018018018018, + "loss": 0.5862, + "step": 81 + }, + { + "epoch": 0.024583099119355443, + "grad_norm": 0.24399450421333313, + "learning_rate": 0.0002967567567567567, + "loss": 0.5685, + "step": 82 + }, + { + "epoch": 0.0248828930110549, + "grad_norm": 0.24457746744155884, + "learning_rate": 0.00029671171171171166, + "loss": 0.5684, + "step": 83 + }, + { + "epoch": 0.025182686902754357, + "grad_norm": 0.23214662075042725, + "learning_rate": 0.00029666666666666665, + "loss": 0.5682, + "step": 84 + }, + { + "epoch": 0.025482480794453814, + "grad_norm": 0.24058258533477783, + "learning_rate": 0.0002966216216216216, + "loss": 0.5688, + "step": 85 + }, + { + "epoch": 0.02578227468615327, + "grad_norm": 0.2338346391916275, + "learning_rate": 0.0002965765765765765, + "loss": 0.5485, + "step": 86 + }, + { + "epoch": 0.026082068577852727, + "grad_norm": 0.2553529143333435, + "learning_rate": 0.0002965315315315315, + "loss": 0.5985, + "step": 87 + }, + { + "epoch": 0.026381862469552184, + "grad_norm": 0.2518393099308014, + "learning_rate": 0.00029648648648648645, + "loss": 0.6079, + "step": 88 + }, + { + "epoch": 0.02668165636125164, + "grad_norm": 0.23418371379375458, + "learning_rate": 0.0002964414414414414, + "loss": 0.5942, + "step": 89 + }, + { + "epoch": 0.026981450252951095, + "grad_norm": 0.2454022616147995, + "learning_rate": 0.0002963963963963964, + "loss": 0.5519, + "step": 90 + }, + { + "epoch": 0.02728124414465055, + "grad_norm": 0.25474220514297485, + "learning_rate": 0.0002963513513513513, + "loss": 0.5771, + "step": 91 + }, + { + "epoch": 0.02758103803635001, + "grad_norm": 0.2332638055086136, + "learning_rate": 0.00029630630630630625, + "loss": 0.5477, + "step": 92 + }, + { + "epoch": 0.027880831928049465, + "grad_norm": 0.23931057751178741, + "learning_rate": 0.00029626126126126124, + "loss": 0.5391, + "step": 93 + }, + { + "epoch": 0.028180625819748922, + "grad_norm": 0.23103167116641998, + "learning_rate": 0.0002962162162162162, + "loss": 0.5622, + "step": 94 + }, + { + "epoch": 0.02848041971144838, + "grad_norm": 0.2356046438217163, + "learning_rate": 0.0002961711711711711, + "loss": 0.5729, + "step": 95 + }, + { + "epoch": 0.028780213603147836, + "grad_norm": 0.26015767455101013, + "learning_rate": 0.0002961261261261261, + "loss": 0.5878, + "step": 96 + }, + { + "epoch": 0.029080007494847293, + "grad_norm": 0.2299613654613495, + "learning_rate": 0.00029608108108108104, + "loss": 0.5385, + "step": 97 + }, + { + "epoch": 0.02937980138654675, + "grad_norm": 0.24516573548316956, + "learning_rate": 0.000296036036036036, + "loss": 0.5688, + "step": 98 + }, + { + "epoch": 0.029679595278246207, + "grad_norm": 0.25019732117652893, + "learning_rate": 0.00029599099099099097, + "loss": 0.5562, + "step": 99 + }, + { + "epoch": 0.029979389169945664, + "grad_norm": 0.23934195935726166, + "learning_rate": 0.0002959459459459459, + "loss": 0.5572, + "step": 100 + }, + { + "epoch": 0.030279183061645117, + "grad_norm": 0.23651225864887238, + "learning_rate": 0.00029590090090090085, + "loss": 0.5517, + "step": 101 + }, + { + "epoch": 0.030578976953344574, + "grad_norm": 0.2545163631439209, + "learning_rate": 0.00029585585585585584, + "loss": 0.5553, + "step": 102 + }, + { + "epoch": 0.03087877084504403, + "grad_norm": 0.2259800285100937, + "learning_rate": 0.0002958108108108108, + "loss": 0.5216, + "step": 103 + }, + { + "epoch": 0.031178564736743488, + "grad_norm": 0.25137075781822205, + "learning_rate": 0.00029576576576576576, + "loss": 0.559, + "step": 104 + }, + { + "epoch": 0.03147835862844295, + "grad_norm": 0.2526615262031555, + "learning_rate": 0.0002957207207207207, + "loss": 0.6025, + "step": 105 + }, + { + "epoch": 0.031778152520142405, + "grad_norm": 0.2406662553548813, + "learning_rate": 0.00029567567567567564, + "loss": 0.562, + "step": 106 + }, + { + "epoch": 0.032077946411841855, + "grad_norm": 0.23598824441432953, + "learning_rate": 0.00029563063063063063, + "loss": 0.5749, + "step": 107 + }, + { + "epoch": 0.03237774030354131, + "grad_norm": 0.24079738557338715, + "learning_rate": 0.00029558558558558557, + "loss": 0.5697, + "step": 108 + }, + { + "epoch": 0.03267753419524077, + "grad_norm": 0.25064727663993835, + "learning_rate": 0.0002955405405405405, + "loss": 0.5821, + "step": 109 + }, + { + "epoch": 0.032977328086940226, + "grad_norm": 0.22557033598423004, + "learning_rate": 0.0002954954954954955, + "loss": 0.5367, + "step": 110 + }, + { + "epoch": 0.03327712197863968, + "grad_norm": 0.22542916238307953, + "learning_rate": 0.00029545045045045043, + "loss": 0.55, + "step": 111 + }, + { + "epoch": 0.03357691587033914, + "grad_norm": 0.24270494282245636, + "learning_rate": 0.00029540540540540537, + "loss": 0.5773, + "step": 112 + }, + { + "epoch": 0.0338767097620386, + "grad_norm": 0.23397719860076904, + "learning_rate": 0.00029536036036036036, + "loss": 0.5516, + "step": 113 + }, + { + "epoch": 0.034176503653738054, + "grad_norm": 0.2322990894317627, + "learning_rate": 0.0002953153153153153, + "loss": 0.5307, + "step": 114 + }, + { + "epoch": 0.03447629754543751, + "grad_norm": 0.2167540043592453, + "learning_rate": 0.0002952702702702703, + "loss": 0.5197, + "step": 115 + }, + { + "epoch": 0.03477609143713697, + "grad_norm": 0.2705434262752533, + "learning_rate": 0.0002952252252252252, + "loss": 0.564, + "step": 116 + }, + { + "epoch": 0.035075885328836424, + "grad_norm": 0.23195774853229523, + "learning_rate": 0.00029518018018018016, + "loss": 0.5397, + "step": 117 + }, + { + "epoch": 0.03537567922053588, + "grad_norm": 0.22944559156894684, + "learning_rate": 0.00029513513513513515, + "loss": 0.5197, + "step": 118 + }, + { + "epoch": 0.03567547311223534, + "grad_norm": 0.25140368938446045, + "learning_rate": 0.0002950900900900901, + "loss": 0.5661, + "step": 119 + }, + { + "epoch": 0.035975267003934795, + "grad_norm": 0.23137278854846954, + "learning_rate": 0.000295045045045045, + "loss": 0.5284, + "step": 120 + }, + { + "epoch": 0.03627506089563425, + "grad_norm": 0.2358487993478775, + "learning_rate": 0.00029499999999999996, + "loss": 0.551, + "step": 121 + }, + { + "epoch": 0.03657485478733371, + "grad_norm": 0.23212139308452606, + "learning_rate": 0.00029495495495495495, + "loss": 0.5624, + "step": 122 + }, + { + "epoch": 0.036874648679033166, + "grad_norm": 0.2504674196243286, + "learning_rate": 0.0002949099099099099, + "loss": 0.5649, + "step": 123 + }, + { + "epoch": 0.03717444257073262, + "grad_norm": 0.24574224650859833, + "learning_rate": 0.0002948648648648648, + "loss": 0.5415, + "step": 124 + }, + { + "epoch": 0.03747423646243208, + "grad_norm": 0.23007921874523163, + "learning_rate": 0.0002948198198198198, + "loss": 0.5624, + "step": 125 + }, + { + "epoch": 0.03777403035413154, + "grad_norm": 0.2503686547279358, + "learning_rate": 0.00029477477477477475, + "loss": 0.5484, + "step": 126 + }, + { + "epoch": 0.038073824245830994, + "grad_norm": 0.2467029094696045, + "learning_rate": 0.0002947297297297297, + "loss": 0.5442, + "step": 127 + }, + { + "epoch": 0.03837361813753045, + "grad_norm": 0.23382776975631714, + "learning_rate": 0.0002946846846846847, + "loss": 0.5485, + "step": 128 + }, + { + "epoch": 0.03867341202922991, + "grad_norm": 0.25080442428588867, + "learning_rate": 0.0002946396396396396, + "loss": 0.5726, + "step": 129 + }, + { + "epoch": 0.038973205920929364, + "grad_norm": 0.2410043627023697, + "learning_rate": 0.00029459459459459455, + "loss": 0.5475, + "step": 130 + }, + { + "epoch": 0.039272999812628814, + "grad_norm": 0.24639956653118134, + "learning_rate": 0.00029454954954954955, + "loss": 0.5461, + "step": 131 + }, + { + "epoch": 0.03957279370432827, + "grad_norm": 0.23122674226760864, + "learning_rate": 0.0002945045045045045, + "loss": 0.5605, + "step": 132 + }, + { + "epoch": 0.03987258759602773, + "grad_norm": 0.22207331657409668, + "learning_rate": 0.0002944594594594594, + "loss": 0.5045, + "step": 133 + }, + { + "epoch": 0.040172381487727185, + "grad_norm": 0.2344479262828827, + "learning_rate": 0.0002944144144144144, + "loss": 0.5724, + "step": 134 + }, + { + "epoch": 0.04047217537942664, + "grad_norm": 0.2410653978586197, + "learning_rate": 0.00029436936936936935, + "loss": 0.5266, + "step": 135 + }, + { + "epoch": 0.0407719692711261, + "grad_norm": 0.23028722405433655, + "learning_rate": 0.0002943243243243243, + "loss": 0.545, + "step": 136 + }, + { + "epoch": 0.041071763162825556, + "grad_norm": 0.23669356107711792, + "learning_rate": 0.0002942792792792793, + "loss": 0.5429, + "step": 137 + }, + { + "epoch": 0.04137155705452501, + "grad_norm": 0.24232889711856842, + "learning_rate": 0.0002942342342342342, + "loss": 0.5815, + "step": 138 + }, + { + "epoch": 0.04167135094622447, + "grad_norm": 0.22076158225536346, + "learning_rate": 0.00029418918918918915, + "loss": 0.5028, + "step": 139 + }, + { + "epoch": 0.04197114483792393, + "grad_norm": 0.23322072625160217, + "learning_rate": 0.00029414414414414414, + "loss": 0.5293, + "step": 140 + }, + { + "epoch": 0.042270938729623384, + "grad_norm": 0.24910598993301392, + "learning_rate": 0.0002940990990990991, + "loss": 0.5403, + "step": 141 + }, + { + "epoch": 0.04257073262132284, + "grad_norm": 0.24588559567928314, + "learning_rate": 0.000294054054054054, + "loss": 0.538, + "step": 142 + }, + { + "epoch": 0.0428705265130223, + "grad_norm": 0.20689445734024048, + "learning_rate": 0.00029400900900900895, + "loss": 0.4921, + "step": 143 + }, + { + "epoch": 0.043170320404721754, + "grad_norm": 0.2518969178199768, + "learning_rate": 0.00029396396396396394, + "loss": 0.5625, + "step": 144 + }, + { + "epoch": 0.04347011429642121, + "grad_norm": 0.25027644634246826, + "learning_rate": 0.0002939189189189189, + "loss": 0.5823, + "step": 145 + }, + { + "epoch": 0.04376990818812067, + "grad_norm": 0.24390611052513123, + "learning_rate": 0.0002938738738738738, + "loss": 0.5205, + "step": 146 + }, + { + "epoch": 0.044069702079820125, + "grad_norm": 0.24781368672847748, + "learning_rate": 0.0002938288288288288, + "loss": 0.5559, + "step": 147 + }, + { + "epoch": 0.04436949597151958, + "grad_norm": 0.23696455359458923, + "learning_rate": 0.00029378378378378374, + "loss": 0.5443, + "step": 148 + }, + { + "epoch": 0.04466928986321904, + "grad_norm": 0.23888202011585236, + "learning_rate": 0.0002937387387387387, + "loss": 0.5397, + "step": 149 + }, + { + "epoch": 0.044969083754918496, + "grad_norm": 0.22896665334701538, + "learning_rate": 0.00029369369369369367, + "loss": 0.5462, + "step": 150 + }, + { + "epoch": 0.04526887764661795, + "grad_norm": 0.23967792093753815, + "learning_rate": 0.0002936486486486486, + "loss": 0.5653, + "step": 151 + }, + { + "epoch": 0.04556867153831741, + "grad_norm": 0.2244715392589569, + "learning_rate": 0.00029360360360360354, + "loss": 0.5251, + "step": 152 + }, + { + "epoch": 0.045868465430016866, + "grad_norm": 0.2287164330482483, + "learning_rate": 0.00029355855855855853, + "loss": 0.5313, + "step": 153 + }, + { + "epoch": 0.04616825932171632, + "grad_norm": 0.2419881671667099, + "learning_rate": 0.00029351351351351347, + "loss": 0.5397, + "step": 154 + }, + { + "epoch": 0.04646805321341577, + "grad_norm": 0.2414129674434662, + "learning_rate": 0.0002934684684684684, + "loss": 0.574, + "step": 155 + }, + { + "epoch": 0.04676784710511523, + "grad_norm": 0.237702414393425, + "learning_rate": 0.0002934234234234234, + "loss": 0.5377, + "step": 156 + }, + { + "epoch": 0.04706764099681469, + "grad_norm": 0.24712331593036652, + "learning_rate": 0.00029337837837837833, + "loss": 0.5722, + "step": 157 + }, + { + "epoch": 0.047367434888514144, + "grad_norm": 0.23250596225261688, + "learning_rate": 0.00029333333333333327, + "loss": 0.5663, + "step": 158 + }, + { + "epoch": 0.0476672287802136, + "grad_norm": 0.22616314888000488, + "learning_rate": 0.00029328828828828826, + "loss": 0.4984, + "step": 159 + }, + { + "epoch": 0.04796702267191306, + "grad_norm": 0.23835696280002594, + "learning_rate": 0.0002932432432432432, + "loss": 0.5512, + "step": 160 + }, + { + "epoch": 0.048266816563612515, + "grad_norm": 0.23458854854106903, + "learning_rate": 0.0002931981981981982, + "loss": 0.5472, + "step": 161 + }, + { + "epoch": 0.04856661045531197, + "grad_norm": 0.2530427873134613, + "learning_rate": 0.0002931531531531531, + "loss": 0.5807, + "step": 162 + }, + { + "epoch": 0.04886640434701143, + "grad_norm": 0.23911581933498383, + "learning_rate": 0.00029310810810810806, + "loss": 0.5236, + "step": 163 + }, + { + "epoch": 0.049166198238710886, + "grad_norm": 0.24235881865024567, + "learning_rate": 0.00029306306306306305, + "loss": 0.5509, + "step": 164 + }, + { + "epoch": 0.04946599213041034, + "grad_norm": 0.23077982664108276, + "learning_rate": 0.000293018018018018, + "loss": 0.5265, + "step": 165 + }, + { + "epoch": 0.0497657860221098, + "grad_norm": 0.23159649968147278, + "learning_rate": 0.00029297297297297293, + "loss": 0.5521, + "step": 166 + }, + { + "epoch": 0.050065579913809256, + "grad_norm": 0.23901638388633728, + "learning_rate": 0.0002929279279279279, + "loss": 0.517, + "step": 167 + }, + { + "epoch": 0.05036537380550871, + "grad_norm": 0.2441125512123108, + "learning_rate": 0.00029288288288288286, + "loss": 0.5376, + "step": 168 + }, + { + "epoch": 0.05066516769720817, + "grad_norm": 0.2408357411623001, + "learning_rate": 0.0002928378378378378, + "loss": 0.5119, + "step": 169 + }, + { + "epoch": 0.05096496158890763, + "grad_norm": 0.24620820581912994, + "learning_rate": 0.0002927927927927928, + "loss": 0.5454, + "step": 170 + }, + { + "epoch": 0.051264755480607084, + "grad_norm": 0.23092059791088104, + "learning_rate": 0.0002927477477477477, + "loss": 0.5464, + "step": 171 + }, + { + "epoch": 0.05156454937230654, + "grad_norm": 0.23076176643371582, + "learning_rate": 0.0002927027027027027, + "loss": 0.5153, + "step": 172 + }, + { + "epoch": 0.051864343264006, + "grad_norm": 0.2518848478794098, + "learning_rate": 0.00029265765765765765, + "loss": 0.5416, + "step": 173 + }, + { + "epoch": 0.052164137155705455, + "grad_norm": 0.24004964530467987, + "learning_rate": 0.0002926126126126126, + "loss": 0.5625, + "step": 174 + }, + { + "epoch": 0.05246393104740491, + "grad_norm": 0.26169759035110474, + "learning_rate": 0.0002925675675675676, + "loss": 0.5322, + "step": 175 + }, + { + "epoch": 0.05276372493910437, + "grad_norm": 0.23733891546726227, + "learning_rate": 0.0002925225225225225, + "loss": 0.5526, + "step": 176 + }, + { + "epoch": 0.053063518830803826, + "grad_norm": 0.22720226645469666, + "learning_rate": 0.00029247747747747745, + "loss": 0.5108, + "step": 177 + }, + { + "epoch": 0.05336331272250328, + "grad_norm": 0.2425220012664795, + "learning_rate": 0.00029243243243243244, + "loss": 0.5337, + "step": 178 + }, + { + "epoch": 0.05366310661420273, + "grad_norm": 0.24859829246997833, + "learning_rate": 0.0002923873873873874, + "loss": 0.5239, + "step": 179 + }, + { + "epoch": 0.05396290050590219, + "grad_norm": 0.23471564054489136, + "learning_rate": 0.0002923423423423423, + "loss": 0.5372, + "step": 180 + }, + { + "epoch": 0.054262694397601646, + "grad_norm": 0.23243340849876404, + "learning_rate": 0.0002922972972972973, + "loss": 0.5283, + "step": 181 + }, + { + "epoch": 0.0545624882893011, + "grad_norm": 0.240774467587471, + "learning_rate": 0.00029225225225225224, + "loss": 0.523, + "step": 182 + }, + { + "epoch": 0.05486228218100056, + "grad_norm": 0.2518199384212494, + "learning_rate": 0.0002922072072072072, + "loss": 0.56, + "step": 183 + }, + { + "epoch": 0.05516207607270002, + "grad_norm": 0.22320473194122314, + "learning_rate": 0.00029216216216216217, + "loss": 0.5442, + "step": 184 + }, + { + "epoch": 0.055461869964399474, + "grad_norm": 0.24681055545806885, + "learning_rate": 0.0002921171171171171, + "loss": 0.5596, + "step": 185 + }, + { + "epoch": 0.05576166385609893, + "grad_norm": 0.23595312237739563, + "learning_rate": 0.00029207207207207204, + "loss": 0.5343, + "step": 186 + }, + { + "epoch": 0.05606145774779839, + "grad_norm": 0.2307872772216797, + "learning_rate": 0.00029202702702702703, + "loss": 0.5206, + "step": 187 + }, + { + "epoch": 0.056361251639497845, + "grad_norm": 0.24639058113098145, + "learning_rate": 0.00029198198198198197, + "loss": 0.5449, + "step": 188 + }, + { + "epoch": 0.0566610455311973, + "grad_norm": 0.24432207643985748, + "learning_rate": 0.0002919369369369369, + "loss": 0.5455, + "step": 189 + }, + { + "epoch": 0.05696083942289676, + "grad_norm": 0.23387478291988373, + "learning_rate": 0.0002918918918918919, + "loss": 0.5402, + "step": 190 + }, + { + "epoch": 0.057260633314596215, + "grad_norm": 0.2183016538619995, + "learning_rate": 0.00029184684684684684, + "loss": 0.4989, + "step": 191 + }, + { + "epoch": 0.05756042720629567, + "grad_norm": 0.23916368186473846, + "learning_rate": 0.00029180180180180177, + "loss": 0.5397, + "step": 192 + }, + { + "epoch": 0.05786022109799513, + "grad_norm": 0.24624724686145782, + "learning_rate": 0.0002917567567567567, + "loss": 0.5248, + "step": 193 + }, + { + "epoch": 0.058160014989694586, + "grad_norm": 0.2571977376937866, + "learning_rate": 0.0002917117117117117, + "loss": 0.587, + "step": 194 + }, + { + "epoch": 0.05845980888139404, + "grad_norm": 0.22828777134418488, + "learning_rate": 0.00029166666666666664, + "loss": 0.5424, + "step": 195 + }, + { + "epoch": 0.0587596027730935, + "grad_norm": 0.23892144858837128, + "learning_rate": 0.0002916216216216216, + "loss": 0.5228, + "step": 196 + }, + { + "epoch": 0.05905939666479296, + "grad_norm": 0.22904744744300842, + "learning_rate": 0.00029157657657657656, + "loss": 0.5052, + "step": 197 + }, + { + "epoch": 0.059359190556492414, + "grad_norm": 0.2399347573518753, + "learning_rate": 0.0002915315315315315, + "loss": 0.541, + "step": 198 + }, + { + "epoch": 0.05965898444819187, + "grad_norm": 0.2253408432006836, + "learning_rate": 0.00029148648648648644, + "loss": 0.5204, + "step": 199 + }, + { + "epoch": 0.05995877833989133, + "grad_norm": 0.23211434483528137, + "learning_rate": 0.00029144144144144143, + "loss": 0.5248, + "step": 200 + }, + { + "epoch": 0.060258572231590785, + "grad_norm": 0.24016128480434418, + "learning_rate": 0.00029139639639639637, + "loss": 0.5386, + "step": 201 + }, + { + "epoch": 0.060558366123290235, + "grad_norm": 0.2374415397644043, + "learning_rate": 0.0002913513513513513, + "loss": 0.4904, + "step": 202 + }, + { + "epoch": 0.06085816001498969, + "grad_norm": 0.257735013961792, + "learning_rate": 0.0002913063063063063, + "loss": 0.5253, + "step": 203 + }, + { + "epoch": 0.06115795390668915, + "grad_norm": 0.23416665196418762, + "learning_rate": 0.00029126126126126123, + "loss": 0.5164, + "step": 204 + }, + { + "epoch": 0.061457747798388605, + "grad_norm": 0.24382436275482178, + "learning_rate": 0.00029121621621621617, + "loss": 0.5507, + "step": 205 + }, + { + "epoch": 0.06175754169008806, + "grad_norm": 0.2488170564174652, + "learning_rate": 0.00029117117117117116, + "loss": 0.543, + "step": 206 + }, + { + "epoch": 0.06205733558178752, + "grad_norm": 0.24262337386608124, + "learning_rate": 0.0002911261261261261, + "loss": 0.5533, + "step": 207 + }, + { + "epoch": 0.062357129473486976, + "grad_norm": 0.2239450216293335, + "learning_rate": 0.00029108108108108103, + "loss": 0.4955, + "step": 208 + }, + { + "epoch": 0.06265692336518644, + "grad_norm": 0.22644869983196259, + "learning_rate": 0.000291036036036036, + "loss": 0.5362, + "step": 209 + }, + { + "epoch": 0.0629567172568859, + "grad_norm": 0.23194913566112518, + "learning_rate": 0.00029099099099099096, + "loss": 0.5226, + "step": 210 + }, + { + "epoch": 0.06325651114858535, + "grad_norm": 0.22114375233650208, + "learning_rate": 0.0002909459459459459, + "loss": 0.472, + "step": 211 + }, + { + "epoch": 0.06355630504028481, + "grad_norm": 0.24242356419563293, + "learning_rate": 0.0002909009009009009, + "loss": 0.5041, + "step": 212 + }, + { + "epoch": 0.06385609893198427, + "grad_norm": 0.24195095896720886, + "learning_rate": 0.0002908558558558558, + "loss": 0.511, + "step": 213 + }, + { + "epoch": 0.06415589282368371, + "grad_norm": 0.23720628023147583, + "learning_rate": 0.00029081081081081076, + "loss": 0.5257, + "step": 214 + }, + { + "epoch": 0.06445568671538317, + "grad_norm": 0.22656656801700592, + "learning_rate": 0.00029076576576576575, + "loss": 0.5336, + "step": 215 + }, + { + "epoch": 0.06475548060708262, + "grad_norm": 0.23326708376407623, + "learning_rate": 0.0002907207207207207, + "loss": 0.5184, + "step": 216 + }, + { + "epoch": 0.06505527449878208, + "grad_norm": 0.2327839583158493, + "learning_rate": 0.0002906756756756756, + "loss": 0.5434, + "step": 217 + }, + { + "epoch": 0.06535506839048154, + "grad_norm": 0.24181734025478363, + "learning_rate": 0.0002906306306306306, + "loss": 0.534, + "step": 218 + }, + { + "epoch": 0.065654862282181, + "grad_norm": 0.23325061798095703, + "learning_rate": 0.00029058558558558555, + "loss": 0.5438, + "step": 219 + }, + { + "epoch": 0.06595465617388045, + "grad_norm": 0.2559988498687744, + "learning_rate": 0.0002905405405405405, + "loss": 0.5229, + "step": 220 + }, + { + "epoch": 0.06625445006557991, + "grad_norm": 0.2432825118303299, + "learning_rate": 0.0002904954954954955, + "loss": 0.5223, + "step": 221 + }, + { + "epoch": 0.06655424395727937, + "grad_norm": 0.23795264959335327, + "learning_rate": 0.0002904504504504504, + "loss": 0.5199, + "step": 222 + }, + { + "epoch": 0.06685403784897882, + "grad_norm": 0.22999484837055206, + "learning_rate": 0.00029040540540540535, + "loss": 0.506, + "step": 223 + }, + { + "epoch": 0.06715383174067828, + "grad_norm": 0.22168515622615814, + "learning_rate": 0.00029036036036036034, + "loss": 0.4873, + "step": 224 + }, + { + "epoch": 0.06745362563237774, + "grad_norm": 0.23448102176189423, + "learning_rate": 0.0002903153153153153, + "loss": 0.5533, + "step": 225 + }, + { + "epoch": 0.0677534195240772, + "grad_norm": 0.23822058737277985, + "learning_rate": 0.0002902702702702702, + "loss": 0.5221, + "step": 226 + }, + { + "epoch": 0.06805321341577665, + "grad_norm": 0.22952431440353394, + "learning_rate": 0.0002902252252252252, + "loss": 0.5402, + "step": 227 + }, + { + "epoch": 0.06835300730747611, + "grad_norm": 0.2197679728269577, + "learning_rate": 0.00029018018018018015, + "loss": 0.5115, + "step": 228 + }, + { + "epoch": 0.06865280119917556, + "grad_norm": 0.23495204746723175, + "learning_rate": 0.00029013513513513514, + "loss": 0.4963, + "step": 229 + }, + { + "epoch": 0.06895259509087502, + "grad_norm": 0.2233276218175888, + "learning_rate": 0.0002900900900900901, + "loss": 0.5257, + "step": 230 + }, + { + "epoch": 0.06925238898257448, + "grad_norm": 0.23294100165367126, + "learning_rate": 0.000290045045045045, + "loss": 0.5231, + "step": 231 + }, + { + "epoch": 0.06955218287427394, + "grad_norm": 0.24349254369735718, + "learning_rate": 0.00029, + "loss": 0.5198, + "step": 232 + }, + { + "epoch": 0.06985197676597339, + "grad_norm": 0.22396647930145264, + "learning_rate": 0.00028995495495495494, + "loss": 0.5032, + "step": 233 + }, + { + "epoch": 0.07015177065767285, + "grad_norm": 0.23737500607967377, + "learning_rate": 0.0002899099099099099, + "loss": 0.5071, + "step": 234 + }, + { + "epoch": 0.0704515645493723, + "grad_norm": 0.22798973321914673, + "learning_rate": 0.00028986486486486487, + "loss": 0.4844, + "step": 235 + }, + { + "epoch": 0.07075135844107176, + "grad_norm": 0.24095383286476135, + "learning_rate": 0.0002898198198198198, + "loss": 0.5351, + "step": 236 + }, + { + "epoch": 0.07105115233277122, + "grad_norm": 0.23701000213623047, + "learning_rate": 0.00028977477477477474, + "loss": 0.5051, + "step": 237 + }, + { + "epoch": 0.07135094622447068, + "grad_norm": 0.23588530719280243, + "learning_rate": 0.00028972972972972973, + "loss": 0.4992, + "step": 238 + }, + { + "epoch": 0.07165074011617013, + "grad_norm": 0.27736473083496094, + "learning_rate": 0.00028968468468468467, + "loss": 0.5399, + "step": 239 + }, + { + "epoch": 0.07195053400786959, + "grad_norm": 0.2486957311630249, + "learning_rate": 0.0002896396396396396, + "loss": 0.543, + "step": 240 + }, + { + "epoch": 0.07225032789956905, + "grad_norm": 0.23474164307117462, + "learning_rate": 0.0002895945945945946, + "loss": 0.5275, + "step": 241 + }, + { + "epoch": 0.0725501217912685, + "grad_norm": 0.26560667157173157, + "learning_rate": 0.00028954954954954953, + "loss": 0.5576, + "step": 242 + }, + { + "epoch": 0.07284991568296796, + "grad_norm": 0.2260173261165619, + "learning_rate": 0.00028950450450450447, + "loss": 0.5046, + "step": 243 + }, + { + "epoch": 0.07314970957466742, + "grad_norm": 0.24497725069522858, + "learning_rate": 0.00028945945945945946, + "loss": 0.5093, + "step": 244 + }, + { + "epoch": 0.07344950346636687, + "grad_norm": 0.24871525168418884, + "learning_rate": 0.0002894144144144144, + "loss": 0.5353, + "step": 245 + }, + { + "epoch": 0.07374929735806633, + "grad_norm": 0.2592950761318207, + "learning_rate": 0.00028936936936936933, + "loss": 0.553, + "step": 246 + }, + { + "epoch": 0.07404909124976579, + "grad_norm": 0.23617064952850342, + "learning_rate": 0.0002893243243243243, + "loss": 0.5276, + "step": 247 + }, + { + "epoch": 0.07434888514146525, + "grad_norm": 0.23108692467212677, + "learning_rate": 0.00028927927927927926, + "loss": 0.5027, + "step": 248 + }, + { + "epoch": 0.0746486790331647, + "grad_norm": 0.2418566793203354, + "learning_rate": 0.0002892342342342342, + "loss": 0.5142, + "step": 249 + }, + { + "epoch": 0.07494847292486416, + "grad_norm": 0.2367243766784668, + "learning_rate": 0.0002891891891891892, + "loss": 0.5062, + "step": 250 + }, + { + "epoch": 0.07524826681656362, + "grad_norm": 0.2405940294265747, + "learning_rate": 0.0002891441441441441, + "loss": 0.4964, + "step": 251 + }, + { + "epoch": 0.07554806070826307, + "grad_norm": 0.22956568002700806, + "learning_rate": 0.00028909909909909906, + "loss": 0.5081, + "step": 252 + }, + { + "epoch": 0.07584785459996253, + "grad_norm": 0.24594880640506744, + "learning_rate": 0.00028905405405405405, + "loss": 0.5522, + "step": 253 + }, + { + "epoch": 0.07614764849166199, + "grad_norm": 0.25554656982421875, + "learning_rate": 0.000289009009009009, + "loss": 0.5579, + "step": 254 + }, + { + "epoch": 0.07644744238336144, + "grad_norm": 0.24296589195728302, + "learning_rate": 0.0002889639639639639, + "loss": 0.515, + "step": 255 + }, + { + "epoch": 0.0767472362750609, + "grad_norm": 0.2128724306821823, + "learning_rate": 0.0002889189189189189, + "loss": 0.5046, + "step": 256 + }, + { + "epoch": 0.07704703016676036, + "grad_norm": 0.2304687201976776, + "learning_rate": 0.00028887387387387385, + "loss": 0.5275, + "step": 257 + }, + { + "epoch": 0.07734682405845981, + "grad_norm": 0.23740506172180176, + "learning_rate": 0.0002888288288288288, + "loss": 0.5308, + "step": 258 + }, + { + "epoch": 0.07764661795015927, + "grad_norm": 0.2270033061504364, + "learning_rate": 0.0002887837837837838, + "loss": 0.5232, + "step": 259 + }, + { + "epoch": 0.07794641184185873, + "grad_norm": 0.24655793607234955, + "learning_rate": 0.0002887387387387387, + "loss": 0.5087, + "step": 260 + }, + { + "epoch": 0.07824620573355819, + "grad_norm": 0.25199684500694275, + "learning_rate": 0.00028869369369369366, + "loss": 0.5616, + "step": 261 + }, + { + "epoch": 0.07854599962525763, + "grad_norm": 0.2466064840555191, + "learning_rate": 0.00028864864864864865, + "loss": 0.5343, + "step": 262 + }, + { + "epoch": 0.07884579351695709, + "grad_norm": 0.2401546686887741, + "learning_rate": 0.0002886036036036036, + "loss": 0.5326, + "step": 263 + }, + { + "epoch": 0.07914558740865654, + "grad_norm": 0.2478310465812683, + "learning_rate": 0.0002885585585585585, + "loss": 0.5377, + "step": 264 + }, + { + "epoch": 0.079445381300356, + "grad_norm": 0.25078094005584717, + "learning_rate": 0.00028851351351351346, + "loss": 0.5065, + "step": 265 + }, + { + "epoch": 0.07974517519205546, + "grad_norm": 0.24395832419395447, + "learning_rate": 0.00028846846846846845, + "loss": 0.5066, + "step": 266 + }, + { + "epoch": 0.08004496908375491, + "grad_norm": 0.23809655010700226, + "learning_rate": 0.0002884234234234234, + "loss": 0.5032, + "step": 267 + }, + { + "epoch": 0.08034476297545437, + "grad_norm": 0.25203442573547363, + "learning_rate": 0.0002883783783783783, + "loss": 0.5539, + "step": 268 + }, + { + "epoch": 0.08064455686715383, + "grad_norm": 0.22731390595436096, + "learning_rate": 0.0002883333333333333, + "loss": 0.5056, + "step": 269 + }, + { + "epoch": 0.08094435075885328, + "grad_norm": 0.26038557291030884, + "learning_rate": 0.00028828828828828825, + "loss": 0.5554, + "step": 270 + }, + { + "epoch": 0.08124414465055274, + "grad_norm": 0.22735048830509186, + "learning_rate": 0.0002882432432432432, + "loss": 0.513, + "step": 271 + }, + { + "epoch": 0.0815439385422522, + "grad_norm": 0.2459019273519516, + "learning_rate": 0.0002881981981981982, + "loss": 0.5074, + "step": 272 + }, + { + "epoch": 0.08184373243395165, + "grad_norm": 0.23949427902698517, + "learning_rate": 0.0002881531531531531, + "loss": 0.5289, + "step": 273 + }, + { + "epoch": 0.08214352632565111, + "grad_norm": 0.22845999896526337, + "learning_rate": 0.00028810810810810805, + "loss": 0.5046, + "step": 274 + }, + { + "epoch": 0.08244332021735057, + "grad_norm": 0.23616977035999298, + "learning_rate": 0.00028806306306306304, + "loss": 0.4766, + "step": 275 + }, + { + "epoch": 0.08274311410905003, + "grad_norm": 0.24239955842494965, + "learning_rate": 0.000288018018018018, + "loss": 0.5177, + "step": 276 + }, + { + "epoch": 0.08304290800074948, + "grad_norm": 0.22668063640594482, + "learning_rate": 0.0002879729729729729, + "loss": 0.5343, + "step": 277 + }, + { + "epoch": 0.08334270189244894, + "grad_norm": 0.23789212107658386, + "learning_rate": 0.0002879279279279279, + "loss": 0.5325, + "step": 278 + }, + { + "epoch": 0.0836424957841484, + "grad_norm": 0.23370423913002014, + "learning_rate": 0.00028788288288288284, + "loss": 0.5054, + "step": 279 + }, + { + "epoch": 0.08394228967584785, + "grad_norm": 0.2391810417175293, + "learning_rate": 0.0002878378378378378, + "loss": 0.5128, + "step": 280 + }, + { + "epoch": 0.08424208356754731, + "grad_norm": 0.2511520981788635, + "learning_rate": 0.00028779279279279277, + "loss": 0.5205, + "step": 281 + }, + { + "epoch": 0.08454187745924677, + "grad_norm": 0.25624001026153564, + "learning_rate": 0.0002877477477477477, + "loss": 0.5341, + "step": 282 + }, + { + "epoch": 0.08484167135094622, + "grad_norm": 0.22454805672168732, + "learning_rate": 0.00028770270270270264, + "loss": 0.4776, + "step": 283 + }, + { + "epoch": 0.08514146524264568, + "grad_norm": 0.24822303652763367, + "learning_rate": 0.00028765765765765764, + "loss": 0.5545, + "step": 284 + }, + { + "epoch": 0.08544125913434514, + "grad_norm": 0.24899134039878845, + "learning_rate": 0.00028761261261261257, + "loss": 0.5081, + "step": 285 + }, + { + "epoch": 0.0857410530260446, + "grad_norm": 0.22409197688102722, + "learning_rate": 0.00028756756756756756, + "loss": 0.4964, + "step": 286 + }, + { + "epoch": 0.08604084691774405, + "grad_norm": 0.24713397026062012, + "learning_rate": 0.0002875225225225225, + "loss": 0.5604, + "step": 287 + }, + { + "epoch": 0.08634064080944351, + "grad_norm": 0.2386777251958847, + "learning_rate": 0.00028747747747747744, + "loss": 0.5158, + "step": 288 + }, + { + "epoch": 0.08664043470114297, + "grad_norm": 0.2695513963699341, + "learning_rate": 0.00028743243243243243, + "loss": 0.5289, + "step": 289 + }, + { + "epoch": 0.08694022859284242, + "grad_norm": 0.23554596304893494, + "learning_rate": 0.00028738738738738736, + "loss": 0.5002, + "step": 290 + }, + { + "epoch": 0.08724002248454188, + "grad_norm": 0.2348342090845108, + "learning_rate": 0.0002873423423423423, + "loss": 0.5026, + "step": 291 + }, + { + "epoch": 0.08753981637624134, + "grad_norm": 0.24448086321353912, + "learning_rate": 0.0002872972972972973, + "loss": 0.508, + "step": 292 + }, + { + "epoch": 0.0878396102679408, + "grad_norm": 0.22410009801387787, + "learning_rate": 0.00028725225225225223, + "loss": 0.4932, + "step": 293 + }, + { + "epoch": 0.08813940415964025, + "grad_norm": 0.25097909569740295, + "learning_rate": 0.00028720720720720717, + "loss": 0.5196, + "step": 294 + }, + { + "epoch": 0.0884391980513397, + "grad_norm": 0.23564878106117249, + "learning_rate": 0.00028716216216216216, + "loss": 0.4871, + "step": 295 + }, + { + "epoch": 0.08873899194303916, + "grad_norm": 0.22870436310768127, + "learning_rate": 0.0002871171171171171, + "loss": 0.4717, + "step": 296 + }, + { + "epoch": 0.08903878583473862, + "grad_norm": 0.24509447813034058, + "learning_rate": 0.00028707207207207203, + "loss": 0.5417, + "step": 297 + }, + { + "epoch": 0.08933857972643808, + "grad_norm": 0.22554557025432587, + "learning_rate": 0.000287027027027027, + "loss": 0.5081, + "step": 298 + }, + { + "epoch": 0.08963837361813753, + "grad_norm": 0.2863612473011017, + "learning_rate": 0.00028698198198198196, + "loss": 0.5453, + "step": 299 + }, + { + "epoch": 0.08993816750983699, + "grad_norm": 0.23862136900424957, + "learning_rate": 0.00028693693693693695, + "loss": 0.4938, + "step": 300 + }, + { + "epoch": 0.09023796140153645, + "grad_norm": 0.2628205120563507, + "learning_rate": 0.0002868918918918919, + "loss": 0.5392, + "step": 301 + }, + { + "epoch": 0.0905377552932359, + "grad_norm": 0.23156946897506714, + "learning_rate": 0.0002868468468468468, + "loss": 0.4983, + "step": 302 + }, + { + "epoch": 0.09083754918493536, + "grad_norm": 0.27579790353775024, + "learning_rate": 0.0002868018018018018, + "loss": 0.5343, + "step": 303 + }, + { + "epoch": 0.09113734307663482, + "grad_norm": 0.22691994905471802, + "learning_rate": 0.00028675675675675675, + "loss": 0.4828, + "step": 304 + }, + { + "epoch": 0.09143713696833428, + "grad_norm": 0.23544538021087646, + "learning_rate": 0.0002867117117117117, + "loss": 0.5143, + "step": 305 + }, + { + "epoch": 0.09173693086003373, + "grad_norm": 0.22597934305667877, + "learning_rate": 0.0002866666666666667, + "loss": 0.5059, + "step": 306 + }, + { + "epoch": 0.09203672475173319, + "grad_norm": 0.23851321637630463, + "learning_rate": 0.0002866216216216216, + "loss": 0.5127, + "step": 307 + }, + { + "epoch": 0.09233651864343265, + "grad_norm": 0.2260097861289978, + "learning_rate": 0.00028657657657657655, + "loss": 0.4901, + "step": 308 + }, + { + "epoch": 0.09263631253513209, + "grad_norm": 0.2568291127681732, + "learning_rate": 0.00028653153153153154, + "loss": 0.556, + "step": 309 + }, + { + "epoch": 0.09293610642683155, + "grad_norm": 0.23510834574699402, + "learning_rate": 0.0002864864864864865, + "loss": 0.5254, + "step": 310 + }, + { + "epoch": 0.093235900318531, + "grad_norm": 0.24556247889995575, + "learning_rate": 0.0002864414414414414, + "loss": 0.5292, + "step": 311 + }, + { + "epoch": 0.09353569421023046, + "grad_norm": 0.2357538938522339, + "learning_rate": 0.00028639639639639635, + "loss": 0.5321, + "step": 312 + }, + { + "epoch": 0.09383548810192992, + "grad_norm": 0.25484514236450195, + "learning_rate": 0.00028635135135135134, + "loss": 0.5449, + "step": 313 + }, + { + "epoch": 0.09413528199362937, + "grad_norm": 0.243759885430336, + "learning_rate": 0.0002863063063063063, + "loss": 0.4766, + "step": 314 + }, + { + "epoch": 0.09443507588532883, + "grad_norm": 0.232033833861351, + "learning_rate": 0.0002862612612612612, + "loss": 0.481, + "step": 315 + }, + { + "epoch": 0.09473486977702829, + "grad_norm": 0.24136248230934143, + "learning_rate": 0.0002862162162162162, + "loss": 0.541, + "step": 316 + }, + { + "epoch": 0.09503466366872775, + "grad_norm": 0.23708300292491913, + "learning_rate": 0.00028617117117117114, + "loss": 0.5298, + "step": 317 + }, + { + "epoch": 0.0953344575604272, + "grad_norm": 0.22408358752727509, + "learning_rate": 0.0002861261261261261, + "loss": 0.4818, + "step": 318 + }, + { + "epoch": 0.09563425145212666, + "grad_norm": 0.22969000041484833, + "learning_rate": 0.00028608108108108107, + "loss": 0.4668, + "step": 319 + }, + { + "epoch": 0.09593404534382612, + "grad_norm": 0.233534574508667, + "learning_rate": 0.000286036036036036, + "loss": 0.506, + "step": 320 + }, + { + "epoch": 0.09623383923552557, + "grad_norm": 0.2442328929901123, + "learning_rate": 0.00028599099099099095, + "loss": 0.5208, + "step": 321 + }, + { + "epoch": 0.09653363312722503, + "grad_norm": 0.22215022146701813, + "learning_rate": 0.00028594594594594594, + "loss": 0.4963, + "step": 322 + }, + { + "epoch": 0.09683342701892449, + "grad_norm": 0.23993152379989624, + "learning_rate": 0.0002859009009009009, + "loss": 0.4945, + "step": 323 + }, + { + "epoch": 0.09713322091062394, + "grad_norm": 0.22987253963947296, + "learning_rate": 0.0002858558558558558, + "loss": 0.4895, + "step": 324 + }, + { + "epoch": 0.0974330148023234, + "grad_norm": 0.25025674700737, + "learning_rate": 0.0002858108108108108, + "loss": 0.5053, + "step": 325 + }, + { + "epoch": 0.09773280869402286, + "grad_norm": 0.21882835030555725, + "learning_rate": 0.00028576576576576574, + "loss": 0.4844, + "step": 326 + }, + { + "epoch": 0.09803260258572231, + "grad_norm": 0.23361052572727203, + "learning_rate": 0.0002857207207207207, + "loss": 0.5159, + "step": 327 + }, + { + "epoch": 0.09833239647742177, + "grad_norm": 0.23713140189647675, + "learning_rate": 0.00028567567567567567, + "loss": 0.5419, + "step": 328 + }, + { + "epoch": 0.09863219036912123, + "grad_norm": 0.2439422458410263, + "learning_rate": 0.0002856306306306306, + "loss": 0.5068, + "step": 329 + }, + { + "epoch": 0.09893198426082069, + "grad_norm": 0.23442302644252777, + "learning_rate": 0.00028558558558558554, + "loss": 0.5041, + "step": 330 + }, + { + "epoch": 0.09923177815252014, + "grad_norm": 0.2295604944229126, + "learning_rate": 0.00028554054054054053, + "loss": 0.4838, + "step": 331 + }, + { + "epoch": 0.0995315720442196, + "grad_norm": 0.2328769713640213, + "learning_rate": 0.00028549549549549547, + "loss": 0.5118, + "step": 332 + }, + { + "epoch": 0.09983136593591906, + "grad_norm": 0.21605005860328674, + "learning_rate": 0.0002854504504504504, + "loss": 0.4786, + "step": 333 + }, + { + "epoch": 0.10013115982761851, + "grad_norm": 0.22740024328231812, + "learning_rate": 0.0002854054054054054, + "loss": 0.5116, + "step": 334 + }, + { + "epoch": 0.10043095371931797, + "grad_norm": 0.24437595903873444, + "learning_rate": 0.00028536036036036033, + "loss": 0.5467, + "step": 335 + }, + { + "epoch": 0.10073074761101743, + "grad_norm": 0.2281450480222702, + "learning_rate": 0.00028531531531531527, + "loss": 0.5269, + "step": 336 + }, + { + "epoch": 0.10103054150271688, + "grad_norm": 0.2343592643737793, + "learning_rate": 0.0002852702702702702, + "loss": 0.5176, + "step": 337 + }, + { + "epoch": 0.10133033539441634, + "grad_norm": 0.24275358021259308, + "learning_rate": 0.0002852252252252252, + "loss": 0.5103, + "step": 338 + }, + { + "epoch": 0.1016301292861158, + "grad_norm": 0.2498926818370819, + "learning_rate": 0.00028518018018018013, + "loss": 0.5282, + "step": 339 + }, + { + "epoch": 0.10192992317781525, + "grad_norm": 0.23029617965221405, + "learning_rate": 0.00028513513513513507, + "loss": 0.512, + "step": 340 + }, + { + "epoch": 0.10222971706951471, + "grad_norm": 0.2608179450035095, + "learning_rate": 0.00028509009009009006, + "loss": 0.5419, + "step": 341 + }, + { + "epoch": 0.10252951096121417, + "grad_norm": 0.24309523403644562, + "learning_rate": 0.000285045045045045, + "loss": 0.5178, + "step": 342 + }, + { + "epoch": 0.10282930485291362, + "grad_norm": 0.22357851266860962, + "learning_rate": 0.000285, + "loss": 0.487, + "step": 343 + }, + { + "epoch": 0.10312909874461308, + "grad_norm": 0.2575152814388275, + "learning_rate": 0.0002849549549549549, + "loss": 0.5163, + "step": 344 + }, + { + "epoch": 0.10342889263631254, + "grad_norm": 0.25661131739616394, + "learning_rate": 0.00028490990990990986, + "loss": 0.5229, + "step": 345 + }, + { + "epoch": 0.103728686528012, + "grad_norm": 0.25328096747398376, + "learning_rate": 0.00028486486486486485, + "loss": 0.5456, + "step": 346 + }, + { + "epoch": 0.10402848041971145, + "grad_norm": 0.2357887625694275, + "learning_rate": 0.0002848198198198198, + "loss": 0.4824, + "step": 347 + }, + { + "epoch": 0.10432827431141091, + "grad_norm": 0.2275196760892868, + "learning_rate": 0.0002847747747747747, + "loss": 0.4899, + "step": 348 + }, + { + "epoch": 0.10462806820311037, + "grad_norm": 0.2616022229194641, + "learning_rate": 0.0002847297297297297, + "loss": 0.543, + "step": 349 + }, + { + "epoch": 0.10492786209480982, + "grad_norm": 0.23371827602386475, + "learning_rate": 0.00028468468468468465, + "loss": 0.4876, + "step": 350 + }, + { + "epoch": 0.10522765598650928, + "grad_norm": 0.21652457118034363, + "learning_rate": 0.0002846396396396396, + "loss": 0.5051, + "step": 351 + }, + { + "epoch": 0.10552744987820874, + "grad_norm": 0.2464732825756073, + "learning_rate": 0.0002845945945945946, + "loss": 0.4993, + "step": 352 + }, + { + "epoch": 0.1058272437699082, + "grad_norm": 0.24194246530532837, + "learning_rate": 0.0002845495495495495, + "loss": 0.492, + "step": 353 + }, + { + "epoch": 0.10612703766160765, + "grad_norm": 0.21986526250839233, + "learning_rate": 0.00028450450450450446, + "loss": 0.4668, + "step": 354 + }, + { + "epoch": 0.10642683155330711, + "grad_norm": 0.2386123389005661, + "learning_rate": 0.00028445945945945945, + "loss": 0.4823, + "step": 355 + }, + { + "epoch": 0.10672662544500656, + "grad_norm": 0.2368634194135666, + "learning_rate": 0.0002844144144144144, + "loss": 0.5119, + "step": 356 + }, + { + "epoch": 0.10702641933670601, + "grad_norm": 0.24802769720554352, + "learning_rate": 0.0002843693693693694, + "loss": 0.5298, + "step": 357 + }, + { + "epoch": 0.10732621322840546, + "grad_norm": 0.25534310936927795, + "learning_rate": 0.0002843243243243243, + "loss": 0.5107, + "step": 358 + }, + { + "epoch": 0.10762600712010492, + "grad_norm": 0.24845390021800995, + "learning_rate": 0.00028427927927927925, + "loss": 0.539, + "step": 359 + }, + { + "epoch": 0.10792580101180438, + "grad_norm": 0.22670955955982208, + "learning_rate": 0.00028423423423423424, + "loss": 0.5013, + "step": 360 + }, + { + "epoch": 0.10822559490350384, + "grad_norm": 0.21695859730243683, + "learning_rate": 0.0002841891891891892, + "loss": 0.468, + "step": 361 + }, + { + "epoch": 0.10852538879520329, + "grad_norm": 0.24691784381866455, + "learning_rate": 0.0002841441441441441, + "loss": 0.5011, + "step": 362 + }, + { + "epoch": 0.10882518268690275, + "grad_norm": 0.23244348168373108, + "learning_rate": 0.0002840990990990991, + "loss": 0.481, + "step": 363 + }, + { + "epoch": 0.1091249765786022, + "grad_norm": 0.23756597936153412, + "learning_rate": 0.00028405405405405404, + "loss": 0.4869, + "step": 364 + }, + { + "epoch": 0.10942477047030166, + "grad_norm": 0.26336169242858887, + "learning_rate": 0.000284009009009009, + "loss": 0.5216, + "step": 365 + }, + { + "epoch": 0.10972456436200112, + "grad_norm": 0.23734593391418457, + "learning_rate": 0.00028396396396396397, + "loss": 0.502, + "step": 366 + }, + { + "epoch": 0.11002435825370058, + "grad_norm": 0.2188960462808609, + "learning_rate": 0.0002839189189189189, + "loss": 0.4784, + "step": 367 + }, + { + "epoch": 0.11032415214540003, + "grad_norm": 0.26715338230133057, + "learning_rate": 0.00028387387387387384, + "loss": 0.4997, + "step": 368 + }, + { + "epoch": 0.11062394603709949, + "grad_norm": 0.23621448874473572, + "learning_rate": 0.00028382882882882883, + "loss": 0.4819, + "step": 369 + }, + { + "epoch": 0.11092373992879895, + "grad_norm": 0.23166733980178833, + "learning_rate": 0.00028378378378378377, + "loss": 0.4949, + "step": 370 + }, + { + "epoch": 0.1112235338204984, + "grad_norm": 0.22275973856449127, + "learning_rate": 0.0002837387387387387, + "loss": 0.4893, + "step": 371 + }, + { + "epoch": 0.11152332771219786, + "grad_norm": 0.22097566723823547, + "learning_rate": 0.0002836936936936937, + "loss": 0.5011, + "step": 372 + }, + { + "epoch": 0.11182312160389732, + "grad_norm": 0.25464650988578796, + "learning_rate": 0.00028364864864864863, + "loss": 0.5369, + "step": 373 + }, + { + "epoch": 0.11212291549559678, + "grad_norm": 0.2584247589111328, + "learning_rate": 0.00028360360360360357, + "loss": 0.5271, + "step": 374 + }, + { + "epoch": 0.11242270938729623, + "grad_norm": 0.2229800671339035, + "learning_rate": 0.00028355855855855856, + "loss": 0.4919, + "step": 375 + }, + { + "epoch": 0.11272250327899569, + "grad_norm": 0.25196340680122375, + "learning_rate": 0.0002835135135135135, + "loss": 0.4935, + "step": 376 + }, + { + "epoch": 0.11302229717069515, + "grad_norm": 0.23945283889770508, + "learning_rate": 0.00028346846846846843, + "loss": 0.5158, + "step": 377 + }, + { + "epoch": 0.1133220910623946, + "grad_norm": 0.22441525757312775, + "learning_rate": 0.0002834234234234234, + "loss": 0.5106, + "step": 378 + }, + { + "epoch": 0.11362188495409406, + "grad_norm": 0.24109874665737152, + "learning_rate": 0.00028337837837837836, + "loss": 0.4646, + "step": 379 + }, + { + "epoch": 0.11392167884579352, + "grad_norm": 0.23156128823757172, + "learning_rate": 0.0002833333333333333, + "loss": 0.4752, + "step": 380 + }, + { + "epoch": 0.11422147273749297, + "grad_norm": 0.24328507483005524, + "learning_rate": 0.0002832882882882883, + "loss": 0.5225, + "step": 381 + }, + { + "epoch": 0.11452126662919243, + "grad_norm": 0.2364337146282196, + "learning_rate": 0.00028324324324324323, + "loss": 0.4965, + "step": 382 + }, + { + "epoch": 0.11482106052089189, + "grad_norm": 0.24130620062351227, + "learning_rate": 0.00028319819819819816, + "loss": 0.5058, + "step": 383 + }, + { + "epoch": 0.11512085441259134, + "grad_norm": 0.2482600063085556, + "learning_rate": 0.00028315315315315315, + "loss": 0.5142, + "step": 384 + }, + { + "epoch": 0.1154206483042908, + "grad_norm": 0.2426941990852356, + "learning_rate": 0.0002831081081081081, + "loss": 0.5185, + "step": 385 + }, + { + "epoch": 0.11572044219599026, + "grad_norm": 0.24431252479553223, + "learning_rate": 0.00028306306306306303, + "loss": 0.4946, + "step": 386 + }, + { + "epoch": 0.11602023608768972, + "grad_norm": 0.22974024713039398, + "learning_rate": 0.00028301801801801797, + "loss": 0.4845, + "step": 387 + }, + { + "epoch": 0.11632002997938917, + "grad_norm": 0.26395878195762634, + "learning_rate": 0.00028297297297297296, + "loss": 0.512, + "step": 388 + }, + { + "epoch": 0.11661982387108863, + "grad_norm": 0.2300662398338318, + "learning_rate": 0.0002829279279279279, + "loss": 0.5161, + "step": 389 + }, + { + "epoch": 0.11691961776278809, + "grad_norm": 0.23376531898975372, + "learning_rate": 0.00028288288288288283, + "loss": 0.5165, + "step": 390 + }, + { + "epoch": 0.11721941165448754, + "grad_norm": 0.23775231838226318, + "learning_rate": 0.0002828378378378378, + "loss": 0.5028, + "step": 391 + }, + { + "epoch": 0.117519205546187, + "grad_norm": 0.22368961572647095, + "learning_rate": 0.00028279279279279276, + "loss": 0.4724, + "step": 392 + }, + { + "epoch": 0.11781899943788646, + "grad_norm": 0.25789904594421387, + "learning_rate": 0.0002827477477477477, + "loss": 0.5545, + "step": 393 + }, + { + "epoch": 0.11811879332958591, + "grad_norm": 0.22045502066612244, + "learning_rate": 0.0002827027027027027, + "loss": 0.4739, + "step": 394 + }, + { + "epoch": 0.11841858722128537, + "grad_norm": 0.24297311902046204, + "learning_rate": 0.0002826576576576576, + "loss": 0.5284, + "step": 395 + }, + { + "epoch": 0.11871838111298483, + "grad_norm": 0.22944821417331696, + "learning_rate": 0.00028261261261261256, + "loss": 0.5071, + "step": 396 + }, + { + "epoch": 0.11901817500468428, + "grad_norm": 0.22093501687049866, + "learning_rate": 0.00028256756756756755, + "loss": 0.5016, + "step": 397 + }, + { + "epoch": 0.11931796889638374, + "grad_norm": 0.24359877407550812, + "learning_rate": 0.0002825225225225225, + "loss": 0.494, + "step": 398 + }, + { + "epoch": 0.1196177627880832, + "grad_norm": 0.23019848763942719, + "learning_rate": 0.0002824774774774774, + "loss": 0.4774, + "step": 399 + }, + { + "epoch": 0.11991755667978266, + "grad_norm": 0.22894737124443054, + "learning_rate": 0.0002824324324324324, + "loss": 0.4804, + "step": 400 + }, + { + "epoch": 0.12021735057148211, + "grad_norm": 0.23049385845661163, + "learning_rate": 0.00028238738738738735, + "loss": 0.4816, + "step": 401 + }, + { + "epoch": 0.12051714446318157, + "grad_norm": 0.21988703310489655, + "learning_rate": 0.0002823423423423423, + "loss": 0.479, + "step": 402 + }, + { + "epoch": 0.12081693835488103, + "grad_norm": 0.2389378845691681, + "learning_rate": 0.0002822972972972973, + "loss": 0.4993, + "step": 403 + }, + { + "epoch": 0.12111673224658047, + "grad_norm": 0.2271224707365036, + "learning_rate": 0.0002822522522522522, + "loss": 0.479, + "step": 404 + }, + { + "epoch": 0.12141652613827993, + "grad_norm": 0.253499835729599, + "learning_rate": 0.00028220720720720715, + "loss": 0.5289, + "step": 405 + }, + { + "epoch": 0.12171632002997938, + "grad_norm": 0.2213011533021927, + "learning_rate": 0.00028216216216216214, + "loss": 0.4759, + "step": 406 + }, + { + "epoch": 0.12201611392167884, + "grad_norm": 0.2327415943145752, + "learning_rate": 0.0002821171171171171, + "loss": 0.4822, + "step": 407 + }, + { + "epoch": 0.1223159078133783, + "grad_norm": 0.2592346668243408, + "learning_rate": 0.000282072072072072, + "loss": 0.5331, + "step": 408 + }, + { + "epoch": 0.12261570170507775, + "grad_norm": 0.23039428889751434, + "learning_rate": 0.000282027027027027, + "loss": 0.4882, + "step": 409 + }, + { + "epoch": 0.12291549559677721, + "grad_norm": 0.2345624566078186, + "learning_rate": 0.00028198198198198194, + "loss": 0.4859, + "step": 410 + }, + { + "epoch": 0.12321528948847667, + "grad_norm": 0.22759179770946503, + "learning_rate": 0.0002819369369369369, + "loss": 0.484, + "step": 411 + }, + { + "epoch": 0.12351508338017612, + "grad_norm": 0.24670295417308807, + "learning_rate": 0.00028189189189189187, + "loss": 0.5047, + "step": 412 + }, + { + "epoch": 0.12381487727187558, + "grad_norm": 0.2306670844554901, + "learning_rate": 0.0002818468468468468, + "loss": 0.5041, + "step": 413 + }, + { + "epoch": 0.12411467116357504, + "grad_norm": 0.2440173178911209, + "learning_rate": 0.0002818018018018018, + "loss": 0.4942, + "step": 414 + }, + { + "epoch": 0.1244144650552745, + "grad_norm": 0.2239232212305069, + "learning_rate": 0.00028175675675675674, + "loss": 0.4677, + "step": 415 + }, + { + "epoch": 0.12471425894697395, + "grad_norm": 0.22838331758975983, + "learning_rate": 0.0002817117117117117, + "loss": 0.5061, + "step": 416 + }, + { + "epoch": 0.12501405283867342, + "grad_norm": 0.23874101042747498, + "learning_rate": 0.00028166666666666666, + "loss": 0.5261, + "step": 417 + }, + { + "epoch": 0.12531384673037288, + "grad_norm": 0.2522803544998169, + "learning_rate": 0.0002816216216216216, + "loss": 0.5525, + "step": 418 + }, + { + "epoch": 0.12561364062207234, + "grad_norm": 0.2339329719543457, + "learning_rate": 0.00028157657657657654, + "loss": 0.5157, + "step": 419 + }, + { + "epoch": 0.1259134345137718, + "grad_norm": 0.23074860870838165, + "learning_rate": 0.00028153153153153153, + "loss": 0.5062, + "step": 420 + }, + { + "epoch": 0.12621322840547125, + "grad_norm": 0.22877013683319092, + "learning_rate": 0.00028148648648648647, + "loss": 0.4827, + "step": 421 + }, + { + "epoch": 0.1265130222971707, + "grad_norm": 0.2384471893310547, + "learning_rate": 0.0002814414414414414, + "loss": 0.5035, + "step": 422 + }, + { + "epoch": 0.12681281618887016, + "grad_norm": 0.24479855597019196, + "learning_rate": 0.0002813963963963964, + "loss": 0.4645, + "step": 423 + }, + { + "epoch": 0.12711261008056962, + "grad_norm": 0.23424111306667328, + "learning_rate": 0.00028135135135135133, + "loss": 0.4927, + "step": 424 + }, + { + "epoch": 0.12741240397226908, + "grad_norm": 0.24012430012226105, + "learning_rate": 0.0002813063063063063, + "loss": 0.4904, + "step": 425 + }, + { + "epoch": 0.12771219786396854, + "grad_norm": 0.24606235325336456, + "learning_rate": 0.00028126126126126126, + "loss": 0.5143, + "step": 426 + }, + { + "epoch": 0.128011991755668, + "grad_norm": 0.24893403053283691, + "learning_rate": 0.0002812162162162162, + "loss": 0.5232, + "step": 427 + }, + { + "epoch": 0.12831178564736742, + "grad_norm": 0.2424110770225525, + "learning_rate": 0.0002811711711711712, + "loss": 0.5232, + "step": 428 + }, + { + "epoch": 0.12861157953906688, + "grad_norm": 0.2312486171722412, + "learning_rate": 0.0002811261261261261, + "loss": 0.4645, + "step": 429 + }, + { + "epoch": 0.12891137343076634, + "grad_norm": 0.22639836370944977, + "learning_rate": 0.00028108108108108106, + "loss": 0.4642, + "step": 430 + }, + { + "epoch": 0.1292111673224658, + "grad_norm": 0.23626941442489624, + "learning_rate": 0.00028103603603603605, + "loss": 0.5038, + "step": 431 + }, + { + "epoch": 0.12951096121416525, + "grad_norm": 0.2625383138656616, + "learning_rate": 0.000280990990990991, + "loss": 0.4867, + "step": 432 + }, + { + "epoch": 0.1298107551058647, + "grad_norm": 0.24292655289173126, + "learning_rate": 0.0002809459459459459, + "loss": 0.5081, + "step": 433 + }, + { + "epoch": 0.13011054899756416, + "grad_norm": 0.23609769344329834, + "learning_rate": 0.00028090090090090086, + "loss": 0.4832, + "step": 434 + }, + { + "epoch": 0.13041034288926362, + "grad_norm": 0.22934478521347046, + "learning_rate": 0.00028085585585585585, + "loss": 0.4872, + "step": 435 + }, + { + "epoch": 0.13071013678096308, + "grad_norm": 0.22949008643627167, + "learning_rate": 0.0002808108108108108, + "loss": 0.5129, + "step": 436 + }, + { + "epoch": 0.13100993067266253, + "grad_norm": 0.2302381694316864, + "learning_rate": 0.0002807657657657657, + "loss": 0.4793, + "step": 437 + }, + { + "epoch": 0.131309724564362, + "grad_norm": 0.23368242383003235, + "learning_rate": 0.0002807207207207207, + "loss": 0.4989, + "step": 438 + }, + { + "epoch": 0.13160951845606145, + "grad_norm": 0.21572020649909973, + "learning_rate": 0.00028067567567567565, + "loss": 0.4546, + "step": 439 + }, + { + "epoch": 0.1319093123477609, + "grad_norm": 0.2268449366092682, + "learning_rate": 0.0002806306306306306, + "loss": 0.4954, + "step": 440 + }, + { + "epoch": 0.13220910623946036, + "grad_norm": 0.23617544770240784, + "learning_rate": 0.0002805855855855856, + "loss": 0.4865, + "step": 441 + }, + { + "epoch": 0.13250890013115982, + "grad_norm": 0.24015142023563385, + "learning_rate": 0.0002805405405405405, + "loss": 0.5211, + "step": 442 + }, + { + "epoch": 0.13280869402285927, + "grad_norm": 0.21798421442508698, + "learning_rate": 0.00028049549549549545, + "loss": 0.4482, + "step": 443 + }, + { + "epoch": 0.13310848791455873, + "grad_norm": 0.23476584255695343, + "learning_rate": 0.00028045045045045045, + "loss": 0.4871, + "step": 444 + }, + { + "epoch": 0.1334082818062582, + "grad_norm": 0.2404216080904007, + "learning_rate": 0.0002804054054054054, + "loss": 0.488, + "step": 445 + }, + { + "epoch": 0.13370807569795765, + "grad_norm": 0.25073060393333435, + "learning_rate": 0.0002803603603603603, + "loss": 0.5194, + "step": 446 + }, + { + "epoch": 0.1340078695896571, + "grad_norm": 0.24332286417484283, + "learning_rate": 0.0002803153153153153, + "loss": 0.5347, + "step": 447 + }, + { + "epoch": 0.13430766348135656, + "grad_norm": 0.2420525699853897, + "learning_rate": 0.00028027027027027025, + "loss": 0.5109, + "step": 448 + }, + { + "epoch": 0.13460745737305602, + "grad_norm": 0.22389326989650726, + "learning_rate": 0.0002802252252252252, + "loss": 0.5075, + "step": 449 + }, + { + "epoch": 0.13490725126475547, + "grad_norm": 0.23522739112377167, + "learning_rate": 0.0002801801801801802, + "loss": 0.4794, + "step": 450 + }, + { + "epoch": 0.13520704515645493, + "grad_norm": 0.221591517329216, + "learning_rate": 0.0002801351351351351, + "loss": 0.4821, + "step": 451 + }, + { + "epoch": 0.1355068390481544, + "grad_norm": 0.24136802554130554, + "learning_rate": 0.00028009009009009005, + "loss": 0.4949, + "step": 452 + }, + { + "epoch": 0.13580663293985384, + "grad_norm": 0.23207104206085205, + "learning_rate": 0.00028004504504504504, + "loss": 0.4803, + "step": 453 + }, + { + "epoch": 0.1361064268315533, + "grad_norm": 0.25318193435668945, + "learning_rate": 0.00028, + "loss": 0.5345, + "step": 454 + }, + { + "epoch": 0.13640622072325276, + "grad_norm": 0.25274619460105896, + "learning_rate": 0.0002799549549549549, + "loss": 0.5159, + "step": 455 + }, + { + "epoch": 0.13670601461495221, + "grad_norm": 0.22540399432182312, + "learning_rate": 0.0002799099099099099, + "loss": 0.4772, + "step": 456 + }, + { + "epoch": 0.13700580850665167, + "grad_norm": 0.2346925288438797, + "learning_rate": 0.00027986486486486484, + "loss": 0.4916, + "step": 457 + }, + { + "epoch": 0.13730560239835113, + "grad_norm": 0.2226891815662384, + "learning_rate": 0.0002798198198198198, + "loss": 0.4651, + "step": 458 + }, + { + "epoch": 0.13760539629005059, + "grad_norm": 0.25540515780448914, + "learning_rate": 0.0002797747747747747, + "loss": 0.5316, + "step": 459 + }, + { + "epoch": 0.13790519018175004, + "grad_norm": 0.22934426367282867, + "learning_rate": 0.0002797297297297297, + "loss": 0.477, + "step": 460 + }, + { + "epoch": 0.1382049840734495, + "grad_norm": 0.22268570959568024, + "learning_rate": 0.00027968468468468464, + "loss": 0.4611, + "step": 461 + }, + { + "epoch": 0.13850477796514896, + "grad_norm": 0.23548570275306702, + "learning_rate": 0.0002796396396396396, + "loss": 0.5012, + "step": 462 + }, + { + "epoch": 0.1388045718568484, + "grad_norm": 0.22782792150974274, + "learning_rate": 0.00027959459459459457, + "loss": 0.4878, + "step": 463 + }, + { + "epoch": 0.13910436574854787, + "grad_norm": 0.24569828808307648, + "learning_rate": 0.0002795495495495495, + "loss": 0.4882, + "step": 464 + }, + { + "epoch": 0.13940415964024733, + "grad_norm": 0.23523476719856262, + "learning_rate": 0.00027950450450450444, + "loss": 0.498, + "step": 465 + }, + { + "epoch": 0.13970395353194678, + "grad_norm": 0.24249842762947083, + "learning_rate": 0.00027945945945945943, + "loss": 0.5156, + "step": 466 + }, + { + "epoch": 0.14000374742364624, + "grad_norm": 0.22582505643367767, + "learning_rate": 0.00027941441441441437, + "loss": 0.4428, + "step": 467 + }, + { + "epoch": 0.1403035413153457, + "grad_norm": 0.2527635395526886, + "learning_rate": 0.0002793693693693693, + "loss": 0.507, + "step": 468 + }, + { + "epoch": 0.14060333520704515, + "grad_norm": 0.2490163892507553, + "learning_rate": 0.0002793243243243243, + "loss": 0.5119, + "step": 469 + }, + { + "epoch": 0.1409031290987446, + "grad_norm": 0.26502713561058044, + "learning_rate": 0.00027927927927927923, + "loss": 0.4994, + "step": 470 + }, + { + "epoch": 0.14120292299044407, + "grad_norm": 0.25225281715393066, + "learning_rate": 0.0002792342342342342, + "loss": 0.5029, + "step": 471 + }, + { + "epoch": 0.14150271688214353, + "grad_norm": 0.235763818025589, + "learning_rate": 0.00027918918918918916, + "loss": 0.5154, + "step": 472 + }, + { + "epoch": 0.14180251077384298, + "grad_norm": 0.24403534829616547, + "learning_rate": 0.0002791441441441441, + "loss": 0.5241, + "step": 473 + }, + { + "epoch": 0.14210230466554244, + "grad_norm": 0.24656488001346588, + "learning_rate": 0.0002790990990990991, + "loss": 0.4767, + "step": 474 + }, + { + "epoch": 0.1424020985572419, + "grad_norm": 0.2506386935710907, + "learning_rate": 0.000279054054054054, + "loss": 0.5066, + "step": 475 + }, + { + "epoch": 0.14270189244894135, + "grad_norm": 0.24634157121181488, + "learning_rate": 0.00027900900900900896, + "loss": 0.4815, + "step": 476 + }, + { + "epoch": 0.1430016863406408, + "grad_norm": 0.23619256913661957, + "learning_rate": 0.00027896396396396395, + "loss": 0.4929, + "step": 477 + }, + { + "epoch": 0.14330148023234027, + "grad_norm": 0.23421134054660797, + "learning_rate": 0.0002789189189189189, + "loss": 0.4841, + "step": 478 + }, + { + "epoch": 0.14360127412403972, + "grad_norm": 0.2287687510251999, + "learning_rate": 0.00027887387387387383, + "loss": 0.4976, + "step": 479 + }, + { + "epoch": 0.14390106801573918, + "grad_norm": 0.2362293004989624, + "learning_rate": 0.0002788288288288288, + "loss": 0.5054, + "step": 480 + }, + { + "epoch": 0.14420086190743864, + "grad_norm": 0.23907198011875153, + "learning_rate": 0.00027878378378378376, + "loss": 0.4879, + "step": 481 + }, + { + "epoch": 0.1445006557991381, + "grad_norm": 0.21802479028701782, + "learning_rate": 0.00027873873873873875, + "loss": 0.4807, + "step": 482 + }, + { + "epoch": 0.14480044969083755, + "grad_norm": 0.2445833832025528, + "learning_rate": 0.0002786936936936937, + "loss": 0.519, + "step": 483 + }, + { + "epoch": 0.145100243582537, + "grad_norm": 0.24606822431087494, + "learning_rate": 0.0002786486486486486, + "loss": 0.4956, + "step": 484 + }, + { + "epoch": 0.14540003747423647, + "grad_norm": 0.24663852155208588, + "learning_rate": 0.0002786036036036036, + "loss": 0.5029, + "step": 485 + }, + { + "epoch": 0.14569983136593592, + "grad_norm": 0.22668293118476868, + "learning_rate": 0.00027855855855855855, + "loss": 0.4987, + "step": 486 + }, + { + "epoch": 0.14599962525763538, + "grad_norm": 0.2292596995830536, + "learning_rate": 0.0002785135135135135, + "loss": 0.4808, + "step": 487 + }, + { + "epoch": 0.14629941914933484, + "grad_norm": 0.2296249270439148, + "learning_rate": 0.0002784684684684685, + "loss": 0.4733, + "step": 488 + }, + { + "epoch": 0.1465992130410343, + "grad_norm": 0.2463514357805252, + "learning_rate": 0.0002784234234234234, + "loss": 0.4856, + "step": 489 + }, + { + "epoch": 0.14689900693273375, + "grad_norm": 0.2578481435775757, + "learning_rate": 0.00027837837837837835, + "loss": 0.52, + "step": 490 + }, + { + "epoch": 0.1471988008244332, + "grad_norm": 0.22988936305046082, + "learning_rate": 0.00027833333333333334, + "loss": 0.469, + "step": 491 + }, + { + "epoch": 0.14749859471613266, + "grad_norm": 0.24489805102348328, + "learning_rate": 0.0002782882882882883, + "loss": 0.5108, + "step": 492 + }, + { + "epoch": 0.14779838860783212, + "grad_norm": 0.24594642221927643, + "learning_rate": 0.0002782432432432432, + "loss": 0.4977, + "step": 493 + }, + { + "epoch": 0.14809818249953158, + "grad_norm": 0.23341059684753418, + "learning_rate": 0.0002781981981981982, + "loss": 0.4784, + "step": 494 + }, + { + "epoch": 0.14839797639123103, + "grad_norm": 0.24211278557777405, + "learning_rate": 0.00027815315315315314, + "loss": 0.4966, + "step": 495 + }, + { + "epoch": 0.1486977702829305, + "grad_norm": 0.24049176275730133, + "learning_rate": 0.0002781081081081081, + "loss": 0.4807, + "step": 496 + }, + { + "epoch": 0.14899756417462995, + "grad_norm": 0.2326640784740448, + "learning_rate": 0.00027806306306306307, + "loss": 0.4571, + "step": 497 + }, + { + "epoch": 0.1492973580663294, + "grad_norm": 0.23826268315315247, + "learning_rate": 0.000278018018018018, + "loss": 0.4656, + "step": 498 + }, + { + "epoch": 0.14959715195802886, + "grad_norm": 0.2514077425003052, + "learning_rate": 0.00027797297297297294, + "loss": 0.5057, + "step": 499 + }, + { + "epoch": 0.14989694584972832, + "grad_norm": 0.22455474734306335, + "learning_rate": 0.00027792792792792793, + "loss": 0.4754, + "step": 500 + }, + { + "epoch": 0.14989694584972832, + "eval_loss": 0.49213707447052, + "eval_runtime": 564.4362, + "eval_samples_per_second": 3.825, + "eval_steps_per_second": 0.478, + "step": 500 + }, + { + "epoch": 0.15019673974142778, + "grad_norm": 0.24720892310142517, + "learning_rate": 0.00027788288288288287, + "loss": 0.5022, + "step": 501 + }, + { + "epoch": 0.15049653363312723, + "grad_norm": 0.24081699550151825, + "learning_rate": 0.0002778378378378378, + "loss": 0.4928, + "step": 502 + }, + { + "epoch": 0.1507963275248267, + "grad_norm": 0.24245139956474304, + "learning_rate": 0.0002777927927927928, + "loss": 0.5005, + "step": 503 + }, + { + "epoch": 0.15109612141652615, + "grad_norm": 0.23554831743240356, + "learning_rate": 0.00027774774774774774, + "loss": 0.4914, + "step": 504 + }, + { + "epoch": 0.1513959153082256, + "grad_norm": 0.2291078418493271, + "learning_rate": 0.00027770270270270267, + "loss": 0.4807, + "step": 505 + }, + { + "epoch": 0.15169570919992506, + "grad_norm": 0.23393088579177856, + "learning_rate": 0.0002776576576576576, + "loss": 0.496, + "step": 506 + }, + { + "epoch": 0.15199550309162452, + "grad_norm": 0.24490569531917572, + "learning_rate": 0.0002776126126126126, + "loss": 0.5, + "step": 507 + }, + { + "epoch": 0.15229529698332397, + "grad_norm": 0.21680289506912231, + "learning_rate": 0.00027756756756756754, + "loss": 0.4549, + "step": 508 + }, + { + "epoch": 0.15259509087502343, + "grad_norm": 0.22479134798049927, + "learning_rate": 0.0002775225225225225, + "loss": 0.471, + "step": 509 + }, + { + "epoch": 0.1528948847667229, + "grad_norm": 0.2381131947040558, + "learning_rate": 0.00027747747747747746, + "loss": 0.4949, + "step": 510 + }, + { + "epoch": 0.15319467865842235, + "grad_norm": 0.22718404233455658, + "learning_rate": 0.0002774324324324324, + "loss": 0.4883, + "step": 511 + }, + { + "epoch": 0.1534944725501218, + "grad_norm": 0.2359694391489029, + "learning_rate": 0.00027738738738738734, + "loss": 0.5052, + "step": 512 + }, + { + "epoch": 0.15379426644182126, + "grad_norm": 0.229795902967453, + "learning_rate": 0.00027734234234234233, + "loss": 0.4814, + "step": 513 + }, + { + "epoch": 0.15409406033352072, + "grad_norm": 0.2197142243385315, + "learning_rate": 0.00027729729729729727, + "loss": 0.4617, + "step": 514 + }, + { + "epoch": 0.15439385422522017, + "grad_norm": 0.23570996522903442, + "learning_rate": 0.0002772522522522522, + "loss": 0.4731, + "step": 515 + }, + { + "epoch": 0.15469364811691963, + "grad_norm": 0.23566411435604095, + "learning_rate": 0.0002772072072072072, + "loss": 0.4921, + "step": 516 + }, + { + "epoch": 0.1549934420086191, + "grad_norm": 0.21966999769210815, + "learning_rate": 0.00027716216216216213, + "loss": 0.4683, + "step": 517 + }, + { + "epoch": 0.15529323590031854, + "grad_norm": 0.2531338036060333, + "learning_rate": 0.00027711711711711707, + "loss": 0.5254, + "step": 518 + }, + { + "epoch": 0.155593029792018, + "grad_norm": 0.2375670224428177, + "learning_rate": 0.00027707207207207206, + "loss": 0.4988, + "step": 519 + }, + { + "epoch": 0.15589282368371746, + "grad_norm": 0.2455272376537323, + "learning_rate": 0.000277027027027027, + "loss": 0.501, + "step": 520 + }, + { + "epoch": 0.15619261757541691, + "grad_norm": 0.21289831399917603, + "learning_rate": 0.00027698198198198193, + "loss": 0.4575, + "step": 521 + }, + { + "epoch": 0.15649241146711637, + "grad_norm": 0.2653936743736267, + "learning_rate": 0.0002769369369369369, + "loss": 0.5251, + "step": 522 + }, + { + "epoch": 0.1567922053588158, + "grad_norm": 0.23822923004627228, + "learning_rate": 0.00027689189189189186, + "loss": 0.5095, + "step": 523 + }, + { + "epoch": 0.15709199925051526, + "grad_norm": 0.25067201256752014, + "learning_rate": 0.0002768468468468468, + "loss": 0.4841, + "step": 524 + }, + { + "epoch": 0.15739179314221471, + "grad_norm": 0.2340254783630371, + "learning_rate": 0.0002768018018018018, + "loss": 0.4959, + "step": 525 + }, + { + "epoch": 0.15769158703391417, + "grad_norm": 0.2431899458169937, + "learning_rate": 0.0002767567567567567, + "loss": 0.5035, + "step": 526 + }, + { + "epoch": 0.15799138092561363, + "grad_norm": 0.22817112505435944, + "learning_rate": 0.00027671171171171166, + "loss": 0.49, + "step": 527 + }, + { + "epoch": 0.15829117481731309, + "grad_norm": 0.21927404403686523, + "learning_rate": 0.00027666666666666665, + "loss": 0.4785, + "step": 528 + }, + { + "epoch": 0.15859096870901254, + "grad_norm": 0.2402762919664383, + "learning_rate": 0.0002766216216216216, + "loss": 0.4799, + "step": 529 + }, + { + "epoch": 0.158890762600712, + "grad_norm": 0.2559228241443634, + "learning_rate": 0.0002765765765765765, + "loss": 0.5065, + "step": 530 + }, + { + "epoch": 0.15919055649241146, + "grad_norm": 0.22883668541908264, + "learning_rate": 0.0002765315315315315, + "loss": 0.4797, + "step": 531 + }, + { + "epoch": 0.1594903503841109, + "grad_norm": 0.24328212440013885, + "learning_rate": 0.00027648648648648645, + "loss": 0.4796, + "step": 532 + }, + { + "epoch": 0.15979014427581037, + "grad_norm": 0.2543148398399353, + "learning_rate": 0.0002764414414414414, + "loss": 0.4785, + "step": 533 + }, + { + "epoch": 0.16008993816750983, + "grad_norm": 0.24784719944000244, + "learning_rate": 0.0002763963963963964, + "loss": 0.4981, + "step": 534 + }, + { + "epoch": 0.16038973205920928, + "grad_norm": 0.24210456013679504, + "learning_rate": 0.0002763513513513513, + "loss": 0.4939, + "step": 535 + }, + { + "epoch": 0.16068952595090874, + "grad_norm": 0.22924496233463287, + "learning_rate": 0.00027630630630630625, + "loss": 0.4524, + "step": 536 + }, + { + "epoch": 0.1609893198426082, + "grad_norm": 0.270022451877594, + "learning_rate": 0.00027626126126126124, + "loss": 0.5184, + "step": 537 + }, + { + "epoch": 0.16128911373430765, + "grad_norm": 0.2689591646194458, + "learning_rate": 0.0002762162162162162, + "loss": 0.4966, + "step": 538 + }, + { + "epoch": 0.1615889076260071, + "grad_norm": 0.23465842008590698, + "learning_rate": 0.00027617117117117117, + "loss": 0.4865, + "step": 539 + }, + { + "epoch": 0.16188870151770657, + "grad_norm": 0.23281508684158325, + "learning_rate": 0.0002761261261261261, + "loss": 0.4859, + "step": 540 + }, + { + "epoch": 0.16218849540940602, + "grad_norm": 0.25370529294013977, + "learning_rate": 0.00027608108108108105, + "loss": 0.4609, + "step": 541 + }, + { + "epoch": 0.16248828930110548, + "grad_norm": 0.2646511495113373, + "learning_rate": 0.00027603603603603604, + "loss": 0.4967, + "step": 542 + }, + { + "epoch": 0.16278808319280494, + "grad_norm": 0.22836188971996307, + "learning_rate": 0.000275990990990991, + "loss": 0.4982, + "step": 543 + }, + { + "epoch": 0.1630878770845044, + "grad_norm": 0.22948142886161804, + "learning_rate": 0.0002759459459459459, + "loss": 0.471, + "step": 544 + }, + { + "epoch": 0.16338767097620385, + "grad_norm": 0.2623734474182129, + "learning_rate": 0.0002759009009009009, + "loss": 0.4938, + "step": 545 + }, + { + "epoch": 0.1636874648679033, + "grad_norm": 0.2337695211172104, + "learning_rate": 0.00027585585585585584, + "loss": 0.4584, + "step": 546 + }, + { + "epoch": 0.16398725875960277, + "grad_norm": 0.2507021129131317, + "learning_rate": 0.0002758108108108108, + "loss": 0.5002, + "step": 547 + }, + { + "epoch": 0.16428705265130222, + "grad_norm": 0.23930178582668304, + "learning_rate": 0.00027576576576576577, + "loss": 0.4724, + "step": 548 + }, + { + "epoch": 0.16458684654300168, + "grad_norm": 0.24984320998191833, + "learning_rate": 0.0002757207207207207, + "loss": 0.4768, + "step": 549 + }, + { + "epoch": 0.16488664043470114, + "grad_norm": 0.2434365600347519, + "learning_rate": 0.00027567567567567564, + "loss": 0.4667, + "step": 550 + }, + { + "epoch": 0.1651864343264006, + "grad_norm": 0.22952896356582642, + "learning_rate": 0.00027563063063063063, + "loss": 0.4624, + "step": 551 + }, + { + "epoch": 0.16548622821810005, + "grad_norm": 0.2372165471315384, + "learning_rate": 0.00027558558558558557, + "loss": 0.477, + "step": 552 + }, + { + "epoch": 0.1657860221097995, + "grad_norm": 0.24741259217262268, + "learning_rate": 0.0002755405405405405, + "loss": 0.5077, + "step": 553 + }, + { + "epoch": 0.16608581600149896, + "grad_norm": 0.2387109249830246, + "learning_rate": 0.0002754954954954955, + "loss": 0.4817, + "step": 554 + }, + { + "epoch": 0.16638560989319842, + "grad_norm": 0.24962367117404938, + "learning_rate": 0.00027545045045045043, + "loss": 0.4775, + "step": 555 + }, + { + "epoch": 0.16668540378489788, + "grad_norm": 0.2375505119562149, + "learning_rate": 0.00027540540540540537, + "loss": 0.4843, + "step": 556 + }, + { + "epoch": 0.16698519767659734, + "grad_norm": 0.24189910292625427, + "learning_rate": 0.00027536036036036036, + "loss": 0.4952, + "step": 557 + }, + { + "epoch": 0.1672849915682968, + "grad_norm": 0.2314407229423523, + "learning_rate": 0.0002753153153153153, + "loss": 0.4676, + "step": 558 + }, + { + "epoch": 0.16758478545999625, + "grad_norm": 0.24112465977668762, + "learning_rate": 0.00027527027027027023, + "loss": 0.479, + "step": 559 + }, + { + "epoch": 0.1678845793516957, + "grad_norm": 0.22687260806560516, + "learning_rate": 0.0002752252252252252, + "loss": 0.4651, + "step": 560 + }, + { + "epoch": 0.16818437324339516, + "grad_norm": 0.23146574199199677, + "learning_rate": 0.00027518018018018016, + "loss": 0.4724, + "step": 561 + }, + { + "epoch": 0.16848416713509462, + "grad_norm": 0.23164650797843933, + "learning_rate": 0.0002751351351351351, + "loss": 0.4595, + "step": 562 + }, + { + "epoch": 0.16878396102679408, + "grad_norm": 0.2290349006652832, + "learning_rate": 0.0002750900900900901, + "loss": 0.469, + "step": 563 + }, + { + "epoch": 0.16908375491849353, + "grad_norm": 0.22324185073375702, + "learning_rate": 0.000275045045045045, + "loss": 0.4616, + "step": 564 + }, + { + "epoch": 0.169383548810193, + "grad_norm": 0.2320687472820282, + "learning_rate": 0.00027499999999999996, + "loss": 0.4706, + "step": 565 + }, + { + "epoch": 0.16968334270189245, + "grad_norm": 0.2461112141609192, + "learning_rate": 0.00027495495495495495, + "loss": 0.4886, + "step": 566 + }, + { + "epoch": 0.1699831365935919, + "grad_norm": 0.22541654109954834, + "learning_rate": 0.0002749099099099099, + "loss": 0.4676, + "step": 567 + }, + { + "epoch": 0.17028293048529136, + "grad_norm": 0.24664641916751862, + "learning_rate": 0.0002748648648648648, + "loss": 0.509, + "step": 568 + }, + { + "epoch": 0.17058272437699082, + "grad_norm": 0.23051698505878448, + "learning_rate": 0.0002748198198198198, + "loss": 0.4742, + "step": 569 + }, + { + "epoch": 0.17088251826869028, + "grad_norm": 0.21268148720264435, + "learning_rate": 0.00027477477477477475, + "loss": 0.4536, + "step": 570 + }, + { + "epoch": 0.17118231216038973, + "grad_norm": 0.25143638253211975, + "learning_rate": 0.0002747297297297297, + "loss": 0.5233, + "step": 571 + }, + { + "epoch": 0.1714821060520892, + "grad_norm": 0.21673695743083954, + "learning_rate": 0.0002746846846846847, + "loss": 0.445, + "step": 572 + }, + { + "epoch": 0.17178189994378865, + "grad_norm": 0.24307781457901, + "learning_rate": 0.0002746396396396396, + "loss": 0.493, + "step": 573 + }, + { + "epoch": 0.1720816938354881, + "grad_norm": 0.24256987869739532, + "learning_rate": 0.00027459459459459456, + "loss": 0.5249, + "step": 574 + }, + { + "epoch": 0.17238148772718756, + "grad_norm": 0.23426513373851776, + "learning_rate": 0.00027454954954954955, + "loss": 0.4956, + "step": 575 + }, + { + "epoch": 0.17268128161888702, + "grad_norm": 0.23137056827545166, + "learning_rate": 0.0002745045045045045, + "loss": 0.4909, + "step": 576 + }, + { + "epoch": 0.17298107551058647, + "grad_norm": 0.22946982085704803, + "learning_rate": 0.0002744594594594594, + "loss": 0.4733, + "step": 577 + }, + { + "epoch": 0.17328086940228593, + "grad_norm": 0.23843489587306976, + "learning_rate": 0.00027441441441441436, + "loss": 0.4944, + "step": 578 + }, + { + "epoch": 0.1735806632939854, + "grad_norm": 0.21571891009807587, + "learning_rate": 0.00027436936936936935, + "loss": 0.4488, + "step": 579 + }, + { + "epoch": 0.17388045718568484, + "grad_norm": 0.25007542967796326, + "learning_rate": 0.0002743243243243243, + "loss": 0.488, + "step": 580 + }, + { + "epoch": 0.1741802510773843, + "grad_norm": 0.24017852544784546, + "learning_rate": 0.0002742792792792792, + "loss": 0.4969, + "step": 581 + }, + { + "epoch": 0.17448004496908376, + "grad_norm": 0.23361638188362122, + "learning_rate": 0.0002742342342342342, + "loss": 0.5348, + "step": 582 + }, + { + "epoch": 0.17477983886078322, + "grad_norm": 0.22652795910835266, + "learning_rate": 0.00027418918918918915, + "loss": 0.4776, + "step": 583 + }, + { + "epoch": 0.17507963275248267, + "grad_norm": 0.23973309993743896, + "learning_rate": 0.0002741441441441441, + "loss": 0.4803, + "step": 584 + }, + { + "epoch": 0.17537942664418213, + "grad_norm": 0.23487752676010132, + "learning_rate": 0.0002740990990990991, + "loss": 0.482, + "step": 585 + }, + { + "epoch": 0.1756792205358816, + "grad_norm": 0.2464274764060974, + "learning_rate": 0.000274054054054054, + "loss": 0.5167, + "step": 586 + }, + { + "epoch": 0.17597901442758104, + "grad_norm": 0.2280922681093216, + "learning_rate": 0.00027400900900900895, + "loss": 0.4941, + "step": 587 + }, + { + "epoch": 0.1762788083192805, + "grad_norm": 0.24017813801765442, + "learning_rate": 0.00027396396396396394, + "loss": 0.4721, + "step": 588 + }, + { + "epoch": 0.17657860221097996, + "grad_norm": 0.24262118339538574, + "learning_rate": 0.0002739189189189189, + "loss": 0.5053, + "step": 589 + }, + { + "epoch": 0.1768783961026794, + "grad_norm": 0.24060070514678955, + "learning_rate": 0.0002738738738738738, + "loss": 0.4932, + "step": 590 + }, + { + "epoch": 0.17717818999437887, + "grad_norm": 0.2486894428730011, + "learning_rate": 0.0002738288288288288, + "loss": 0.4963, + "step": 591 + }, + { + "epoch": 0.17747798388607833, + "grad_norm": 0.22934255003929138, + "learning_rate": 0.00027378378378378374, + "loss": 0.4911, + "step": 592 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.23473136126995087, + "learning_rate": 0.0002737387387387387, + "loss": 0.4967, + "step": 593 + }, + { + "epoch": 0.17807757166947724, + "grad_norm": 0.24307146668434143, + "learning_rate": 0.00027369369369369367, + "loss": 0.5051, + "step": 594 + }, + { + "epoch": 0.1783773655611767, + "grad_norm": 0.25658494234085083, + "learning_rate": 0.0002736486486486486, + "loss": 0.4959, + "step": 595 + }, + { + "epoch": 0.17867715945287616, + "grad_norm": 0.23326924443244934, + "learning_rate": 0.0002736036036036036, + "loss": 0.5024, + "step": 596 + }, + { + "epoch": 0.1789769533445756, + "grad_norm": 0.2539668083190918, + "learning_rate": 0.00027355855855855854, + "loss": 0.5059, + "step": 597 + }, + { + "epoch": 0.17927674723627507, + "grad_norm": 0.24097499251365662, + "learning_rate": 0.00027351351351351347, + "loss": 0.4888, + "step": 598 + }, + { + "epoch": 0.17957654112797453, + "grad_norm": 0.24816173315048218, + "learning_rate": 0.00027346846846846846, + "loss": 0.4943, + "step": 599 + }, + { + "epoch": 0.17987633501967398, + "grad_norm": 0.25391021370887756, + "learning_rate": 0.0002734234234234234, + "loss": 0.4909, + "step": 600 + }, + { + "epoch": 0.18017612891137344, + "grad_norm": 0.25449302792549133, + "learning_rate": 0.00027337837837837834, + "loss": 0.499, + "step": 601 + }, + { + "epoch": 0.1804759228030729, + "grad_norm": 0.2581718862056732, + "learning_rate": 0.00027333333333333333, + "loss": 0.5093, + "step": 602 + }, + { + "epoch": 0.18077571669477235, + "grad_norm": 0.2509480118751526, + "learning_rate": 0.00027328828828828826, + "loss": 0.4974, + "step": 603 + }, + { + "epoch": 0.1810755105864718, + "grad_norm": 0.23391370475292206, + "learning_rate": 0.0002732432432432432, + "loss": 0.5176, + "step": 604 + }, + { + "epoch": 0.18137530447817127, + "grad_norm": 0.2365102618932724, + "learning_rate": 0.0002731981981981982, + "loss": 0.5021, + "step": 605 + }, + { + "epoch": 0.18167509836987072, + "grad_norm": 0.23687691986560822, + "learning_rate": 0.00027315315315315313, + "loss": 0.4728, + "step": 606 + }, + { + "epoch": 0.18197489226157018, + "grad_norm": 0.24404986202716827, + "learning_rate": 0.00027310810810810807, + "loss": 0.4842, + "step": 607 + }, + { + "epoch": 0.18227468615326964, + "grad_norm": 0.2473643720149994, + "learning_rate": 0.00027306306306306306, + "loss": 0.4823, + "step": 608 + }, + { + "epoch": 0.1825744800449691, + "grad_norm": 0.22777344286441803, + "learning_rate": 0.000273018018018018, + "loss": 0.4995, + "step": 609 + }, + { + "epoch": 0.18287427393666855, + "grad_norm": 0.22545696794986725, + "learning_rate": 0.000272972972972973, + "loss": 0.4634, + "step": 610 + }, + { + "epoch": 0.183174067828368, + "grad_norm": 0.2380336970090866, + "learning_rate": 0.0002729279279279279, + "loss": 0.4831, + "step": 611 + }, + { + "epoch": 0.18347386172006747, + "grad_norm": 0.22387194633483887, + "learning_rate": 0.00027288288288288286, + "loss": 0.4567, + "step": 612 + }, + { + "epoch": 0.18377365561176692, + "grad_norm": 0.2482718676328659, + "learning_rate": 0.00027283783783783785, + "loss": 0.4974, + "step": 613 + }, + { + "epoch": 0.18407344950346638, + "grad_norm": 0.2622338533401489, + "learning_rate": 0.0002727927927927928, + "loss": 0.5004, + "step": 614 + }, + { + "epoch": 0.18437324339516584, + "grad_norm": 0.22443141043186188, + "learning_rate": 0.0002727477477477477, + "loss": 0.4717, + "step": 615 + }, + { + "epoch": 0.1846730372868653, + "grad_norm": 0.23443573713302612, + "learning_rate": 0.0002727027027027027, + "loss": 0.5023, + "step": 616 + }, + { + "epoch": 0.18497283117856475, + "grad_norm": 0.2201087921857834, + "learning_rate": 0.00027265765765765765, + "loss": 0.4812, + "step": 617 + }, + { + "epoch": 0.18527262507026418, + "grad_norm": 0.24011440575122833, + "learning_rate": 0.0002726126126126126, + "loss": 0.4865, + "step": 618 + }, + { + "epoch": 0.18557241896196364, + "grad_norm": 0.2697189450263977, + "learning_rate": 0.0002725675675675676, + "loss": 0.5256, + "step": 619 + }, + { + "epoch": 0.1858722128536631, + "grad_norm": 0.22377456724643707, + "learning_rate": 0.0002725225225225225, + "loss": 0.4701, + "step": 620 + }, + { + "epoch": 0.18617200674536255, + "grad_norm": 0.2551979720592499, + "learning_rate": 0.00027247747747747745, + "loss": 0.4878, + "step": 621 + }, + { + "epoch": 0.186471800637062, + "grad_norm": 0.2368023544549942, + "learning_rate": 0.00027243243243243244, + "loss": 0.4768, + "step": 622 + }, + { + "epoch": 0.18677159452876146, + "grad_norm": 0.23817569017410278, + "learning_rate": 0.0002723873873873874, + "loss": 0.4914, + "step": 623 + }, + { + "epoch": 0.18707138842046092, + "grad_norm": 0.23484331369400024, + "learning_rate": 0.0002723423423423423, + "loss": 0.4896, + "step": 624 + }, + { + "epoch": 0.18737118231216038, + "grad_norm": 0.2473037838935852, + "learning_rate": 0.0002722972972972973, + "loss": 0.4841, + "step": 625 + }, + { + "epoch": 0.18767097620385984, + "grad_norm": 0.2387157678604126, + "learning_rate": 0.00027225225225225224, + "loss": 0.4913, + "step": 626 + }, + { + "epoch": 0.1879707700955593, + "grad_norm": 0.2485678642988205, + "learning_rate": 0.0002722072072072072, + "loss": 0.5155, + "step": 627 + }, + { + "epoch": 0.18827056398725875, + "grad_norm": 0.22908784449100494, + "learning_rate": 0.0002721621621621621, + "loss": 0.4874, + "step": 628 + }, + { + "epoch": 0.1885703578789582, + "grad_norm": 0.22057555615901947, + "learning_rate": 0.0002721171171171171, + "loss": 0.4846, + "step": 629 + }, + { + "epoch": 0.18887015177065766, + "grad_norm": 0.23972582817077637, + "learning_rate": 0.00027207207207207204, + "loss": 0.4785, + "step": 630 + }, + { + "epoch": 0.18916994566235712, + "grad_norm": 0.2453726977109909, + "learning_rate": 0.000272027027027027, + "loss": 0.4855, + "step": 631 + }, + { + "epoch": 0.18946973955405658, + "grad_norm": 0.23710183799266815, + "learning_rate": 0.00027198198198198197, + "loss": 0.4979, + "step": 632 + }, + { + "epoch": 0.18976953344575603, + "grad_norm": 0.24524274468421936, + "learning_rate": 0.0002719369369369369, + "loss": 0.4806, + "step": 633 + }, + { + "epoch": 0.1900693273374555, + "grad_norm": 0.22835515439510345, + "learning_rate": 0.00027189189189189185, + "loss": 0.4746, + "step": 634 + }, + { + "epoch": 0.19036912122915495, + "grad_norm": 0.23380154371261597, + "learning_rate": 0.00027184684684684684, + "loss": 0.4931, + "step": 635 + }, + { + "epoch": 0.1906689151208544, + "grad_norm": 0.22659477591514587, + "learning_rate": 0.0002718018018018018, + "loss": 0.4443, + "step": 636 + }, + { + "epoch": 0.19096870901255386, + "grad_norm": 0.22725367546081543, + "learning_rate": 0.0002717567567567567, + "loss": 0.4888, + "step": 637 + }, + { + "epoch": 0.19126850290425332, + "grad_norm": 0.233082115650177, + "learning_rate": 0.0002717117117117117, + "loss": 0.4752, + "step": 638 + }, + { + "epoch": 0.19156829679595277, + "grad_norm": 0.22560617327690125, + "learning_rate": 0.00027166666666666664, + "loss": 0.4585, + "step": 639 + }, + { + "epoch": 0.19186809068765223, + "grad_norm": 0.22963936626911163, + "learning_rate": 0.0002716216216216216, + "loss": 0.4774, + "step": 640 + }, + { + "epoch": 0.1921678845793517, + "grad_norm": 0.2543715238571167, + "learning_rate": 0.00027157657657657657, + "loss": 0.4896, + "step": 641 + }, + { + "epoch": 0.19246767847105115, + "grad_norm": 0.24594075977802277, + "learning_rate": 0.0002715315315315315, + "loss": 0.4753, + "step": 642 + }, + { + "epoch": 0.1927674723627506, + "grad_norm": 0.2333337366580963, + "learning_rate": 0.00027148648648648644, + "loss": 0.4759, + "step": 643 + }, + { + "epoch": 0.19306726625445006, + "grad_norm": 0.23100800812244415, + "learning_rate": 0.00027144144144144143, + "loss": 0.4536, + "step": 644 + }, + { + "epoch": 0.19336706014614952, + "grad_norm": 0.26426073908805847, + "learning_rate": 0.00027139639639639637, + "loss": 0.4882, + "step": 645 + }, + { + "epoch": 0.19366685403784897, + "grad_norm": 0.22670197486877441, + "learning_rate": 0.0002713513513513513, + "loss": 0.473, + "step": 646 + }, + { + "epoch": 0.19396664792954843, + "grad_norm": 0.23645341396331787, + "learning_rate": 0.0002713063063063063, + "loss": 0.4964, + "step": 647 + }, + { + "epoch": 0.1942664418212479, + "grad_norm": 0.2535463869571686, + "learning_rate": 0.00027126126126126123, + "loss": 0.5001, + "step": 648 + }, + { + "epoch": 0.19456623571294734, + "grad_norm": 0.25858014822006226, + "learning_rate": 0.00027121621621621617, + "loss": 0.5007, + "step": 649 + }, + { + "epoch": 0.1948660296046468, + "grad_norm": 0.2457359880208969, + "learning_rate": 0.0002711711711711711, + "loss": 0.4849, + "step": 650 + }, + { + "epoch": 0.19516582349634626, + "grad_norm": 0.25070154666900635, + "learning_rate": 0.0002711261261261261, + "loss": 0.4888, + "step": 651 + }, + { + "epoch": 0.19546561738804571, + "grad_norm": 0.22972378134727478, + "learning_rate": 0.00027108108108108103, + "loss": 0.4719, + "step": 652 + }, + { + "epoch": 0.19576541127974517, + "grad_norm": 0.23261764645576477, + "learning_rate": 0.000271036036036036, + "loss": 0.4633, + "step": 653 + }, + { + "epoch": 0.19606520517144463, + "grad_norm": 0.25028276443481445, + "learning_rate": 0.00027099099099099096, + "loss": 0.5209, + "step": 654 + }, + { + "epoch": 0.19636499906314409, + "grad_norm": 0.23486949503421783, + "learning_rate": 0.0002709459459459459, + "loss": 0.4757, + "step": 655 + }, + { + "epoch": 0.19666479295484354, + "grad_norm": 0.2393907606601715, + "learning_rate": 0.0002709009009009009, + "loss": 0.464, + "step": 656 + }, + { + "epoch": 0.196964586846543, + "grad_norm": 0.24317681789398193, + "learning_rate": 0.0002708558558558558, + "loss": 0.4843, + "step": 657 + }, + { + "epoch": 0.19726438073824246, + "grad_norm": 0.23742252588272095, + "learning_rate": 0.00027081081081081076, + "loss": 0.5025, + "step": 658 + }, + { + "epoch": 0.1975641746299419, + "grad_norm": 0.23426878452301025, + "learning_rate": 0.00027076576576576575, + "loss": 0.4776, + "step": 659 + }, + { + "epoch": 0.19786396852164137, + "grad_norm": 0.250949501991272, + "learning_rate": 0.0002707207207207207, + "loss": 0.4968, + "step": 660 + }, + { + "epoch": 0.19816376241334083, + "grad_norm": 0.23000746965408325, + "learning_rate": 0.0002706756756756756, + "loss": 0.4895, + "step": 661 + }, + { + "epoch": 0.19846355630504028, + "grad_norm": 0.26243215799331665, + "learning_rate": 0.0002706306306306306, + "loss": 0.5076, + "step": 662 + }, + { + "epoch": 0.19876335019673974, + "grad_norm": 0.22115159034729004, + "learning_rate": 0.00027058558558558555, + "loss": 0.4823, + "step": 663 + }, + { + "epoch": 0.1990631440884392, + "grad_norm": 0.22618655860424042, + "learning_rate": 0.0002705405405405405, + "loss": 0.4994, + "step": 664 + }, + { + "epoch": 0.19936293798013865, + "grad_norm": 0.22989815473556519, + "learning_rate": 0.0002704954954954955, + "loss": 0.4701, + "step": 665 + }, + { + "epoch": 0.1996627318718381, + "grad_norm": 0.24214977025985718, + "learning_rate": 0.0002704504504504504, + "loss": 0.4806, + "step": 666 + }, + { + "epoch": 0.19996252576353757, + "grad_norm": 0.21489010751247406, + "learning_rate": 0.0002704054054054054, + "loss": 0.4511, + "step": 667 + }, + { + "epoch": 0.20026231965523703, + "grad_norm": 0.2397059053182602, + "learning_rate": 0.00027036036036036035, + "loss": 0.4427, + "step": 668 + }, + { + "epoch": 0.20056211354693648, + "grad_norm": 0.2419203370809555, + "learning_rate": 0.0002703153153153153, + "loss": 0.4993, + "step": 669 + }, + { + "epoch": 0.20086190743863594, + "grad_norm": 0.24709810316562653, + "learning_rate": 0.0002702702702702703, + "loss": 0.5195, + "step": 670 + }, + { + "epoch": 0.2011617013303354, + "grad_norm": 0.25068792700767517, + "learning_rate": 0.0002702252252252252, + "loss": 0.508, + "step": 671 + }, + { + "epoch": 0.20146149522203485, + "grad_norm": 0.210756316781044, + "learning_rate": 0.00027018018018018015, + "loss": 0.4413, + "step": 672 + }, + { + "epoch": 0.2017612891137343, + "grad_norm": 0.2557854652404785, + "learning_rate": 0.00027013513513513514, + "loss": 0.4971, + "step": 673 + }, + { + "epoch": 0.20206108300543377, + "grad_norm": 0.23103776574134827, + "learning_rate": 0.0002700900900900901, + "loss": 0.4683, + "step": 674 + }, + { + "epoch": 0.20236087689713322, + "grad_norm": 0.21560733020305634, + "learning_rate": 0.000270045045045045, + "loss": 0.4443, + "step": 675 + }, + { + "epoch": 0.20266067078883268, + "grad_norm": 0.2477121204137802, + "learning_rate": 0.00027, + "loss": 0.5131, + "step": 676 + }, + { + "epoch": 0.20296046468053214, + "grad_norm": 0.24966078996658325, + "learning_rate": 0.00026995495495495494, + "loss": 0.4856, + "step": 677 + }, + { + "epoch": 0.2032602585722316, + "grad_norm": 0.23841539025306702, + "learning_rate": 0.0002699099099099099, + "loss": 0.4658, + "step": 678 + }, + { + "epoch": 0.20356005246393105, + "grad_norm": 0.2685762047767639, + "learning_rate": 0.00026986486486486487, + "loss": 0.5096, + "step": 679 + }, + { + "epoch": 0.2038598463556305, + "grad_norm": 0.25834083557128906, + "learning_rate": 0.0002698198198198198, + "loss": 0.5036, + "step": 680 + }, + { + "epoch": 0.20415964024732997, + "grad_norm": 0.2324528843164444, + "learning_rate": 0.00026977477477477474, + "loss": 0.4618, + "step": 681 + }, + { + "epoch": 0.20445943413902942, + "grad_norm": 0.22903920710086823, + "learning_rate": 0.00026972972972972973, + "loss": 0.4662, + "step": 682 + }, + { + "epoch": 0.20475922803072888, + "grad_norm": 0.24908147752285004, + "learning_rate": 0.00026968468468468467, + "loss": 0.4684, + "step": 683 + }, + { + "epoch": 0.20505902192242834, + "grad_norm": 0.2278299629688263, + "learning_rate": 0.0002696396396396396, + "loss": 0.4447, + "step": 684 + }, + { + "epoch": 0.2053588158141278, + "grad_norm": 0.2315731793642044, + "learning_rate": 0.0002695945945945946, + "loss": 0.4875, + "step": 685 + }, + { + "epoch": 0.20565860970582725, + "grad_norm": 0.23152673244476318, + "learning_rate": 0.00026954954954954953, + "loss": 0.4796, + "step": 686 + }, + { + "epoch": 0.2059584035975267, + "grad_norm": 0.23902982473373413, + "learning_rate": 0.00026950450450450447, + "loss": 0.5169, + "step": 687 + }, + { + "epoch": 0.20625819748922616, + "grad_norm": 0.23636193573474884, + "learning_rate": 0.00026945945945945946, + "loss": 0.493, + "step": 688 + }, + { + "epoch": 0.20655799138092562, + "grad_norm": 0.21632736921310425, + "learning_rate": 0.0002694144144144144, + "loss": 0.4594, + "step": 689 + }, + { + "epoch": 0.20685778527262508, + "grad_norm": 0.2258147895336151, + "learning_rate": 0.00026936936936936934, + "loss": 0.4625, + "step": 690 + }, + { + "epoch": 0.20715757916432453, + "grad_norm": 0.21552099287509918, + "learning_rate": 0.0002693243243243243, + "loss": 0.4481, + "step": 691 + }, + { + "epoch": 0.207457373056024, + "grad_norm": 0.23030760884284973, + "learning_rate": 0.00026927927927927926, + "loss": 0.4644, + "step": 692 + }, + { + "epoch": 0.20775716694772345, + "grad_norm": 0.23163190484046936, + "learning_rate": 0.0002692342342342342, + "loss": 0.4483, + "step": 693 + }, + { + "epoch": 0.2080569608394229, + "grad_norm": 0.2412249743938446, + "learning_rate": 0.0002691891891891892, + "loss": 0.4886, + "step": 694 + }, + { + "epoch": 0.20835675473112236, + "grad_norm": 0.23279330134391785, + "learning_rate": 0.00026914414414414413, + "loss": 0.4733, + "step": 695 + }, + { + "epoch": 0.20865654862282182, + "grad_norm": 0.2269987165927887, + "learning_rate": 0.00026909909909909906, + "loss": 0.4866, + "step": 696 + }, + { + "epoch": 0.20895634251452128, + "grad_norm": 0.23355835676193237, + "learning_rate": 0.00026905405405405406, + "loss": 0.4918, + "step": 697 + }, + { + "epoch": 0.20925613640622073, + "grad_norm": 0.26988187432289124, + "learning_rate": 0.000269009009009009, + "loss": 0.4953, + "step": 698 + }, + { + "epoch": 0.2095559302979202, + "grad_norm": 0.22978806495666504, + "learning_rate": 0.00026896396396396393, + "loss": 0.4622, + "step": 699 + }, + { + "epoch": 0.20985572418961965, + "grad_norm": 0.2823212146759033, + "learning_rate": 0.00026891891891891887, + "loss": 0.5105, + "step": 700 + }, + { + "epoch": 0.2101555180813191, + "grad_norm": 0.23818424344062805, + "learning_rate": 0.00026887387387387386, + "loss": 0.479, + "step": 701 + }, + { + "epoch": 0.21045531197301856, + "grad_norm": 0.23730318248271942, + "learning_rate": 0.0002688288288288288, + "loss": 0.4991, + "step": 702 + }, + { + "epoch": 0.21075510586471802, + "grad_norm": 0.23275551199913025, + "learning_rate": 0.00026878378378378373, + "loss": 0.466, + "step": 703 + }, + { + "epoch": 0.21105489975641747, + "grad_norm": 0.2296077013015747, + "learning_rate": 0.0002687387387387387, + "loss": 0.4964, + "step": 704 + }, + { + "epoch": 0.21135469364811693, + "grad_norm": 0.24341174960136414, + "learning_rate": 0.00026869369369369366, + "loss": 0.505, + "step": 705 + }, + { + "epoch": 0.2116544875398164, + "grad_norm": 0.22542104125022888, + "learning_rate": 0.0002686486486486486, + "loss": 0.4533, + "step": 706 + }, + { + "epoch": 0.21195428143151585, + "grad_norm": 0.22414691746234894, + "learning_rate": 0.0002686036036036036, + "loss": 0.4504, + "step": 707 + }, + { + "epoch": 0.2122540753232153, + "grad_norm": 0.2119213044643402, + "learning_rate": 0.0002685585585585585, + "loss": 0.4457, + "step": 708 + }, + { + "epoch": 0.21255386921491476, + "grad_norm": 0.21771720051765442, + "learning_rate": 0.00026851351351351346, + "loss": 0.4411, + "step": 709 + }, + { + "epoch": 0.21285366310661422, + "grad_norm": 0.2484733760356903, + "learning_rate": 0.00026846846846846845, + "loss": 0.5016, + "step": 710 + }, + { + "epoch": 0.21315345699831367, + "grad_norm": 0.23043473064899445, + "learning_rate": 0.0002684234234234234, + "loss": 0.4717, + "step": 711 + }, + { + "epoch": 0.21345325089001313, + "grad_norm": 0.2769162952899933, + "learning_rate": 0.0002683783783783783, + "loss": 0.5135, + "step": 712 + }, + { + "epoch": 0.21375304478171256, + "grad_norm": 0.22561033070087433, + "learning_rate": 0.0002683333333333333, + "loss": 0.4928, + "step": 713 + }, + { + "epoch": 0.21405283867341202, + "grad_norm": 0.2365075945854187, + "learning_rate": 0.00026828828828828825, + "loss": 0.4843, + "step": 714 + }, + { + "epoch": 0.21435263256511147, + "grad_norm": 0.23628349602222443, + "learning_rate": 0.0002682432432432432, + "loss": 0.4782, + "step": 715 + }, + { + "epoch": 0.21465242645681093, + "grad_norm": 0.22449685633182526, + "learning_rate": 0.0002681981981981982, + "loss": 0.4533, + "step": 716 + }, + { + "epoch": 0.2149522203485104, + "grad_norm": 0.23336151242256165, + "learning_rate": 0.0002681531531531531, + "loss": 0.4818, + "step": 717 + }, + { + "epoch": 0.21525201424020984, + "grad_norm": 0.2387206107378006, + "learning_rate": 0.00026810810810810805, + "loss": 0.4537, + "step": 718 + }, + { + "epoch": 0.2155518081319093, + "grad_norm": 0.23359011113643646, + "learning_rate": 0.00026806306306306304, + "loss": 0.4895, + "step": 719 + }, + { + "epoch": 0.21585160202360876, + "grad_norm": 0.240494042634964, + "learning_rate": 0.000268018018018018, + "loss": 0.4878, + "step": 720 + }, + { + "epoch": 0.21615139591530821, + "grad_norm": 0.23335424065589905, + "learning_rate": 0.0002679729729729729, + "loss": 0.4747, + "step": 721 + }, + { + "epoch": 0.21645118980700767, + "grad_norm": 0.2620643079280853, + "learning_rate": 0.0002679279279279279, + "loss": 0.4968, + "step": 722 + }, + { + "epoch": 0.21675098369870713, + "grad_norm": 0.2350034862756729, + "learning_rate": 0.00026788288288288284, + "loss": 0.4801, + "step": 723 + }, + { + "epoch": 0.21705077759040659, + "grad_norm": 0.2358752340078354, + "learning_rate": 0.00026783783783783784, + "loss": 0.5265, + "step": 724 + }, + { + "epoch": 0.21735057148210604, + "grad_norm": 0.2392471730709076, + "learning_rate": 0.00026779279279279277, + "loss": 0.4667, + "step": 725 + }, + { + "epoch": 0.2176503653738055, + "grad_norm": 0.23733973503112793, + "learning_rate": 0.0002677477477477477, + "loss": 0.4712, + "step": 726 + }, + { + "epoch": 0.21795015926550496, + "grad_norm": 0.2224283516407013, + "learning_rate": 0.0002677027027027027, + "loss": 0.4519, + "step": 727 + }, + { + "epoch": 0.2182499531572044, + "grad_norm": 0.22749215364456177, + "learning_rate": 0.00026765765765765764, + "loss": 0.4656, + "step": 728 + }, + { + "epoch": 0.21854974704890387, + "grad_norm": 0.29321786761283875, + "learning_rate": 0.0002676126126126126, + "loss": 0.5085, + "step": 729 + }, + { + "epoch": 0.21884954094060333, + "grad_norm": 0.23674741387367249, + "learning_rate": 0.00026756756756756756, + "loss": 0.4958, + "step": 730 + }, + { + "epoch": 0.21914933483230278, + "grad_norm": 0.21558566391468048, + "learning_rate": 0.0002675225225225225, + "loss": 0.4679, + "step": 731 + }, + { + "epoch": 0.21944912872400224, + "grad_norm": 0.2383924126625061, + "learning_rate": 0.00026747747747747744, + "loss": 0.4827, + "step": 732 + }, + { + "epoch": 0.2197489226157017, + "grad_norm": 0.23788924515247345, + "learning_rate": 0.00026743243243243243, + "loss": 0.486, + "step": 733 + }, + { + "epoch": 0.22004871650740115, + "grad_norm": 0.23550404608249664, + "learning_rate": 0.00026738738738738737, + "loss": 0.4603, + "step": 734 + }, + { + "epoch": 0.2203485103991006, + "grad_norm": 0.2342066466808319, + "learning_rate": 0.00026734234234234236, + "loss": 0.4591, + "step": 735 + }, + { + "epoch": 0.22064830429080007, + "grad_norm": 0.25759053230285645, + "learning_rate": 0.0002672972972972973, + "loss": 0.483, + "step": 736 + }, + { + "epoch": 0.22094809818249952, + "grad_norm": 0.22325725853443146, + "learning_rate": 0.00026725225225225223, + "loss": 0.4609, + "step": 737 + }, + { + "epoch": 0.22124789207419898, + "grad_norm": 0.22235055267810822, + "learning_rate": 0.0002672072072072072, + "loss": 0.4512, + "step": 738 + }, + { + "epoch": 0.22154768596589844, + "grad_norm": 0.23441246151924133, + "learning_rate": 0.00026716216216216216, + "loss": 0.4517, + "step": 739 + }, + { + "epoch": 0.2218474798575979, + "grad_norm": 0.2520740330219269, + "learning_rate": 0.0002671171171171171, + "loss": 0.4712, + "step": 740 + }, + { + "epoch": 0.22214727374929735, + "grad_norm": 0.22782452404499054, + "learning_rate": 0.0002670720720720721, + "loss": 0.4723, + "step": 741 + }, + { + "epoch": 0.2224470676409968, + "grad_norm": 0.2406499981880188, + "learning_rate": 0.000267027027027027, + "loss": 0.4909, + "step": 742 + }, + { + "epoch": 0.22274686153269627, + "grad_norm": 0.21733756363391876, + "learning_rate": 0.00026698198198198196, + "loss": 0.4402, + "step": 743 + }, + { + "epoch": 0.22304665542439572, + "grad_norm": 0.2329728901386261, + "learning_rate": 0.00026693693693693695, + "loss": 0.4659, + "step": 744 + }, + { + "epoch": 0.22334644931609518, + "grad_norm": 0.23359104990959167, + "learning_rate": 0.0002668918918918919, + "loss": 0.4848, + "step": 745 + }, + { + "epoch": 0.22364624320779464, + "grad_norm": 0.23723845183849335, + "learning_rate": 0.0002668468468468468, + "loss": 0.4674, + "step": 746 + }, + { + "epoch": 0.2239460370994941, + "grad_norm": 0.2128835916519165, + "learning_rate": 0.00026680180180180176, + "loss": 0.4617, + "step": 747 + }, + { + "epoch": 0.22424583099119355, + "grad_norm": 0.2343822568655014, + "learning_rate": 0.00026675675675675675, + "loss": 0.4624, + "step": 748 + }, + { + "epoch": 0.224545624882893, + "grad_norm": 0.24932916462421417, + "learning_rate": 0.0002667117117117117, + "loss": 0.476, + "step": 749 + }, + { + "epoch": 0.22484541877459246, + "grad_norm": 0.24181415140628815, + "learning_rate": 0.0002666666666666666, + "loss": 0.4681, + "step": 750 + }, + { + "epoch": 0.22514521266629192, + "grad_norm": 0.23665620386600494, + "learning_rate": 0.0002666216216216216, + "loss": 0.4858, + "step": 751 + }, + { + "epoch": 0.22544500655799138, + "grad_norm": 0.24904295802116394, + "learning_rate": 0.00026657657657657655, + "loss": 0.4957, + "step": 752 + }, + { + "epoch": 0.22574480044969084, + "grad_norm": 0.2285979986190796, + "learning_rate": 0.0002665315315315315, + "loss": 0.4515, + "step": 753 + }, + { + "epoch": 0.2260445943413903, + "grad_norm": 0.2505464553833008, + "learning_rate": 0.0002664864864864865, + "loss": 0.4884, + "step": 754 + }, + { + "epoch": 0.22634438823308975, + "grad_norm": 0.22328858077526093, + "learning_rate": 0.0002664414414414414, + "loss": 0.4463, + "step": 755 + }, + { + "epoch": 0.2266441821247892, + "grad_norm": 0.2543044984340668, + "learning_rate": 0.00026639639639639635, + "loss": 0.493, + "step": 756 + }, + { + "epoch": 0.22694397601648866, + "grad_norm": 0.2348204255104065, + "learning_rate": 0.00026635135135135135, + "loss": 0.4511, + "step": 757 + }, + { + "epoch": 0.22724376990818812, + "grad_norm": 0.25663718581199646, + "learning_rate": 0.0002663063063063063, + "loss": 0.5006, + "step": 758 + }, + { + "epoch": 0.22754356379988758, + "grad_norm": 0.24245639145374298, + "learning_rate": 0.0002662612612612612, + "loss": 0.4687, + "step": 759 + }, + { + "epoch": 0.22784335769158703, + "grad_norm": 0.2461511641740799, + "learning_rate": 0.0002662162162162162, + "loss": 0.5136, + "step": 760 + }, + { + "epoch": 0.2281431515832865, + "grad_norm": 0.22325679659843445, + "learning_rate": 0.00026617117117117115, + "loss": 0.472, + "step": 761 + }, + { + "epoch": 0.22844294547498595, + "grad_norm": 0.2652730345726013, + "learning_rate": 0.0002661261261261261, + "loss": 0.4732, + "step": 762 + }, + { + "epoch": 0.2287427393666854, + "grad_norm": 0.24870134890079498, + "learning_rate": 0.0002660810810810811, + "loss": 0.467, + "step": 763 + }, + { + "epoch": 0.22904253325838486, + "grad_norm": 0.23315280675888062, + "learning_rate": 0.000266036036036036, + "loss": 0.4994, + "step": 764 + }, + { + "epoch": 0.22934232715008432, + "grad_norm": 0.23781219124794006, + "learning_rate": 0.00026599099099099095, + "loss": 0.4923, + "step": 765 + }, + { + "epoch": 0.22964212104178378, + "grad_norm": 0.2272898256778717, + "learning_rate": 0.00026594594594594594, + "loss": 0.4683, + "step": 766 + }, + { + "epoch": 0.22994191493348323, + "grad_norm": 0.2321631759405136, + "learning_rate": 0.0002659009009009009, + "loss": 0.5018, + "step": 767 + }, + { + "epoch": 0.2302417088251827, + "grad_norm": 0.22698219120502472, + "learning_rate": 0.0002658558558558558, + "loss": 0.479, + "step": 768 + }, + { + "epoch": 0.23054150271688215, + "grad_norm": 0.23421627283096313, + "learning_rate": 0.0002658108108108108, + "loss": 0.4613, + "step": 769 + }, + { + "epoch": 0.2308412966085816, + "grad_norm": 0.21950644254684448, + "learning_rate": 0.00026576576576576574, + "loss": 0.4497, + "step": 770 + }, + { + "epoch": 0.23114109050028106, + "grad_norm": 0.2207535058259964, + "learning_rate": 0.0002657207207207207, + "loss": 0.4777, + "step": 771 + }, + { + "epoch": 0.23144088439198052, + "grad_norm": 0.22216112911701202, + "learning_rate": 0.0002656756756756756, + "loss": 0.4682, + "step": 772 + }, + { + "epoch": 0.23174067828367997, + "grad_norm": 0.2619054317474365, + "learning_rate": 0.0002656306306306306, + "loss": 0.4791, + "step": 773 + }, + { + "epoch": 0.23204047217537943, + "grad_norm": 0.2443225234746933, + "learning_rate": 0.00026558558558558554, + "loss": 0.4777, + "step": 774 + }, + { + "epoch": 0.2323402660670789, + "grad_norm": 0.21427664160728455, + "learning_rate": 0.0002655405405405405, + "loss": 0.4407, + "step": 775 + }, + { + "epoch": 0.23264005995877834, + "grad_norm": 0.21477638185024261, + "learning_rate": 0.00026549549549549547, + "loss": 0.4333, + "step": 776 + }, + { + "epoch": 0.2329398538504778, + "grad_norm": 0.23390546441078186, + "learning_rate": 0.0002654504504504504, + "loss": 0.4839, + "step": 777 + }, + { + "epoch": 0.23323964774217726, + "grad_norm": 0.2529938220977783, + "learning_rate": 0.00026540540540540534, + "loss": 0.4701, + "step": 778 + }, + { + "epoch": 0.23353944163387672, + "grad_norm": 0.24290771782398224, + "learning_rate": 0.00026536036036036033, + "loss": 0.492, + "step": 779 + }, + { + "epoch": 0.23383923552557617, + "grad_norm": 0.2573592960834503, + "learning_rate": 0.00026531531531531527, + "loss": 0.5105, + "step": 780 + }, + { + "epoch": 0.23413902941727563, + "grad_norm": 0.25404611229896545, + "learning_rate": 0.00026527027027027026, + "loss": 0.5054, + "step": 781 + }, + { + "epoch": 0.2344388233089751, + "grad_norm": 0.2394997775554657, + "learning_rate": 0.0002652252252252252, + "loss": 0.4786, + "step": 782 + }, + { + "epoch": 0.23473861720067454, + "grad_norm": 0.2353266179561615, + "learning_rate": 0.00026518018018018013, + "loss": 0.4613, + "step": 783 + }, + { + "epoch": 0.235038411092374, + "grad_norm": 0.22989432513713837, + "learning_rate": 0.0002651351351351351, + "loss": 0.4505, + "step": 784 + }, + { + "epoch": 0.23533820498407346, + "grad_norm": 0.21917951107025146, + "learning_rate": 0.00026509009009009006, + "loss": 0.438, + "step": 785 + }, + { + "epoch": 0.2356379988757729, + "grad_norm": 0.23011858761310577, + "learning_rate": 0.000265045045045045, + "loss": 0.4477, + "step": 786 + }, + { + "epoch": 0.23593779276747237, + "grad_norm": 0.22732798755168915, + "learning_rate": 0.000265, + "loss": 0.454, + "step": 787 + }, + { + "epoch": 0.23623758665917183, + "grad_norm": 0.22975054383277893, + "learning_rate": 0.0002649549549549549, + "loss": 0.4419, + "step": 788 + }, + { + "epoch": 0.23653738055087128, + "grad_norm": 0.25520968437194824, + "learning_rate": 0.00026490990990990986, + "loss": 0.4987, + "step": 789 + }, + { + "epoch": 0.23683717444257074, + "grad_norm": 0.2375541776418686, + "learning_rate": 0.00026486486486486485, + "loss": 0.473, + "step": 790 + }, + { + "epoch": 0.2371369683342702, + "grad_norm": 0.2304588407278061, + "learning_rate": 0.0002648198198198198, + "loss": 0.4734, + "step": 791 + }, + { + "epoch": 0.23743676222596966, + "grad_norm": 0.22878654301166534, + "learning_rate": 0.0002647747747747748, + "loss": 0.4684, + "step": 792 + }, + { + "epoch": 0.2377365561176691, + "grad_norm": 0.25825339555740356, + "learning_rate": 0.0002647297297297297, + "loss": 0.5006, + "step": 793 + }, + { + "epoch": 0.23803635000936857, + "grad_norm": 0.2332850843667984, + "learning_rate": 0.00026468468468468466, + "loss": 0.4603, + "step": 794 + }, + { + "epoch": 0.23833614390106803, + "grad_norm": 0.23115694522857666, + "learning_rate": 0.00026463963963963965, + "loss": 0.4879, + "step": 795 + }, + { + "epoch": 0.23863593779276748, + "grad_norm": 0.2409309297800064, + "learning_rate": 0.0002645945945945946, + "loss": 0.4662, + "step": 796 + }, + { + "epoch": 0.23893573168446694, + "grad_norm": 0.23094283044338226, + "learning_rate": 0.0002645495495495495, + "loss": 0.4477, + "step": 797 + }, + { + "epoch": 0.2392355255761664, + "grad_norm": 0.2324245125055313, + "learning_rate": 0.0002645045045045045, + "loss": 0.4539, + "step": 798 + }, + { + "epoch": 0.23953531946786585, + "grad_norm": 0.23273488879203796, + "learning_rate": 0.00026445945945945945, + "loss": 0.4926, + "step": 799 + }, + { + "epoch": 0.2398351133595653, + "grad_norm": 0.2254081666469574, + "learning_rate": 0.0002644144144144144, + "loss": 0.4487, + "step": 800 + }, + { + "epoch": 0.24013490725126477, + "grad_norm": 0.22009852528572083, + "learning_rate": 0.0002643693693693694, + "loss": 0.4373, + "step": 801 + }, + { + "epoch": 0.24043470114296422, + "grad_norm": 0.24840936064720154, + "learning_rate": 0.0002643243243243243, + "loss": 0.4803, + "step": 802 + }, + { + "epoch": 0.24073449503466368, + "grad_norm": 0.2305980920791626, + "learning_rate": 0.00026427927927927925, + "loss": 0.4727, + "step": 803 + }, + { + "epoch": 0.24103428892636314, + "grad_norm": 0.23277850449085236, + "learning_rate": 0.00026423423423423424, + "loss": 0.4775, + "step": 804 + }, + { + "epoch": 0.2413340828180626, + "grad_norm": 0.24016259610652924, + "learning_rate": 0.0002641891891891892, + "loss": 0.5002, + "step": 805 + }, + { + "epoch": 0.24163387670976205, + "grad_norm": 0.239017054438591, + "learning_rate": 0.0002641441441441441, + "loss": 0.4586, + "step": 806 + }, + { + "epoch": 0.2419336706014615, + "grad_norm": 0.23575717210769653, + "learning_rate": 0.0002640990990990991, + "loss": 0.481, + "step": 807 + }, + { + "epoch": 0.24223346449316094, + "grad_norm": 0.23028531670570374, + "learning_rate": 0.00026405405405405404, + "loss": 0.4546, + "step": 808 + }, + { + "epoch": 0.2425332583848604, + "grad_norm": 0.23798401653766632, + "learning_rate": 0.000264009009009009, + "loss": 0.4806, + "step": 809 + }, + { + "epoch": 0.24283305227655985, + "grad_norm": 0.23191827535629272, + "learning_rate": 0.00026396396396396397, + "loss": 0.4509, + "step": 810 + }, + { + "epoch": 0.2431328461682593, + "grad_norm": 0.2182149440050125, + "learning_rate": 0.0002639189189189189, + "loss": 0.4448, + "step": 811 + }, + { + "epoch": 0.24343264005995877, + "grad_norm": 0.2463945746421814, + "learning_rate": 0.00026387387387387384, + "loss": 0.5088, + "step": 812 + }, + { + "epoch": 0.24373243395165822, + "grad_norm": 0.2388424575328827, + "learning_rate": 0.00026382882882882883, + "loss": 0.4931, + "step": 813 + }, + { + "epoch": 0.24403222784335768, + "grad_norm": 0.2515762746334076, + "learning_rate": 0.00026378378378378377, + "loss": 0.4742, + "step": 814 + }, + { + "epoch": 0.24433202173505714, + "grad_norm": 0.23625001311302185, + "learning_rate": 0.0002637387387387387, + "loss": 0.4937, + "step": 815 + }, + { + "epoch": 0.2446318156267566, + "grad_norm": 0.2393738478422165, + "learning_rate": 0.0002636936936936937, + "loss": 0.4588, + "step": 816 + }, + { + "epoch": 0.24493160951845605, + "grad_norm": 0.23316219449043274, + "learning_rate": 0.00026364864864864864, + "loss": 0.4577, + "step": 817 + }, + { + "epoch": 0.2452314034101555, + "grad_norm": 0.2306746244430542, + "learning_rate": 0.00026360360360360357, + "loss": 0.4454, + "step": 818 + }, + { + "epoch": 0.24553119730185496, + "grad_norm": 0.26293689012527466, + "learning_rate": 0.0002635585585585585, + "loss": 0.4723, + "step": 819 + }, + { + "epoch": 0.24583099119355442, + "grad_norm": 0.23483715951442719, + "learning_rate": 0.0002635135135135135, + "loss": 0.4704, + "step": 820 + }, + { + "epoch": 0.24613078508525388, + "grad_norm": 0.2556680738925934, + "learning_rate": 0.00026346846846846844, + "loss": 0.5077, + "step": 821 + }, + { + "epoch": 0.24643057897695334, + "grad_norm": 0.25275811553001404, + "learning_rate": 0.0002634234234234234, + "loss": 0.483, + "step": 822 + }, + { + "epoch": 0.2467303728686528, + "grad_norm": 0.22292107343673706, + "learning_rate": 0.00026337837837837836, + "loss": 0.4684, + "step": 823 + }, + { + "epoch": 0.24703016676035225, + "grad_norm": 0.23125959932804108, + "learning_rate": 0.0002633333333333333, + "loss": 0.4618, + "step": 824 + }, + { + "epoch": 0.2473299606520517, + "grad_norm": 0.2515474259853363, + "learning_rate": 0.00026328828828828824, + "loss": 0.5111, + "step": 825 + }, + { + "epoch": 0.24762975454375116, + "grad_norm": 0.23193036019802094, + "learning_rate": 0.00026324324324324323, + "loss": 0.451, + "step": 826 + }, + { + "epoch": 0.24792954843545062, + "grad_norm": 0.22238105535507202, + "learning_rate": 0.00026319819819819817, + "loss": 0.4472, + "step": 827 + }, + { + "epoch": 0.24822934232715008, + "grad_norm": 0.23125764727592468, + "learning_rate": 0.0002631531531531531, + "loss": 0.4711, + "step": 828 + }, + { + "epoch": 0.24852913621884953, + "grad_norm": 0.23620037734508514, + "learning_rate": 0.0002631081081081081, + "loss": 0.4723, + "step": 829 + }, + { + "epoch": 0.248828930110549, + "grad_norm": 0.22470439970493317, + "learning_rate": 0.00026306306306306303, + "loss": 0.4814, + "step": 830 + }, + { + "epoch": 0.24912872400224845, + "grad_norm": 0.23267348110675812, + "learning_rate": 0.00026301801801801797, + "loss": 0.4669, + "step": 831 + }, + { + "epoch": 0.2494285178939479, + "grad_norm": 0.23558740317821503, + "learning_rate": 0.00026297297297297296, + "loss": 0.4407, + "step": 832 + }, + { + "epoch": 0.24972831178564736, + "grad_norm": 0.2202112227678299, + "learning_rate": 0.0002629279279279279, + "loss": 0.4546, + "step": 833 + }, + { + "epoch": 0.25002810567734685, + "grad_norm": 0.2349451333284378, + "learning_rate": 0.00026288288288288283, + "loss": 0.4741, + "step": 834 + }, + { + "epoch": 0.2503278995690463, + "grad_norm": 0.2210862636566162, + "learning_rate": 0.0002628378378378378, + "loss": 0.4624, + "step": 835 + }, + { + "epoch": 0.25062769346074576, + "grad_norm": 0.25249290466308594, + "learning_rate": 0.00026279279279279276, + "loss": 0.5213, + "step": 836 + }, + { + "epoch": 0.2509274873524452, + "grad_norm": 0.2458237111568451, + "learning_rate": 0.0002627477477477477, + "loss": 0.4937, + "step": 837 + }, + { + "epoch": 0.2512272812441447, + "grad_norm": 0.22827856242656708, + "learning_rate": 0.0002627027027027027, + "loss": 0.4286, + "step": 838 + }, + { + "epoch": 0.2515270751358441, + "grad_norm": 0.22871458530426025, + "learning_rate": 0.0002626576576576576, + "loss": 0.4181, + "step": 839 + }, + { + "epoch": 0.2518268690275436, + "grad_norm": 0.24196332693099976, + "learning_rate": 0.00026261261261261256, + "loss": 0.4704, + "step": 840 + }, + { + "epoch": 0.252126662919243, + "grad_norm": 0.24222321808338165, + "learning_rate": 0.00026256756756756755, + "loss": 0.474, + "step": 841 + }, + { + "epoch": 0.2524264568109425, + "grad_norm": 0.2258533090353012, + "learning_rate": 0.0002625225225225225, + "loss": 0.4644, + "step": 842 + }, + { + "epoch": 0.25272625070264193, + "grad_norm": 0.2234419882297516, + "learning_rate": 0.0002624774774774774, + "loss": 0.4506, + "step": 843 + }, + { + "epoch": 0.2530260445943414, + "grad_norm": 0.24231363832950592, + "learning_rate": 0.0002624324324324324, + "loss": 0.4975, + "step": 844 + }, + { + "epoch": 0.25332583848604084, + "grad_norm": 0.2430192083120346, + "learning_rate": 0.00026238738738738735, + "loss": 0.462, + "step": 845 + }, + { + "epoch": 0.25362563237774033, + "grad_norm": 0.23717942833900452, + "learning_rate": 0.0002623423423423423, + "loss": 0.48, + "step": 846 + }, + { + "epoch": 0.25392542626943976, + "grad_norm": 0.23983192443847656, + "learning_rate": 0.0002622972972972973, + "loss": 0.491, + "step": 847 + }, + { + "epoch": 0.25422522016113924, + "grad_norm": 0.24544605612754822, + "learning_rate": 0.0002622522522522522, + "loss": 0.4525, + "step": 848 + }, + { + "epoch": 0.25452501405283867, + "grad_norm": 0.25106650590896606, + "learning_rate": 0.0002622072072072072, + "loss": 0.4716, + "step": 849 + }, + { + "epoch": 0.25482480794453816, + "grad_norm": 0.2644721269607544, + "learning_rate": 0.00026216216216216215, + "loss": 0.4888, + "step": 850 + }, + { + "epoch": 0.2551246018362376, + "grad_norm": 0.2338344007730484, + "learning_rate": 0.0002621171171171171, + "loss": 0.4779, + "step": 851 + }, + { + "epoch": 0.25542439572793707, + "grad_norm": 0.2368081659078598, + "learning_rate": 0.00026207207207207207, + "loss": 0.458, + "step": 852 + }, + { + "epoch": 0.2557241896196365, + "grad_norm": 0.2628321051597595, + "learning_rate": 0.000262027027027027, + "loss": 0.4786, + "step": 853 + }, + { + "epoch": 0.256023983511336, + "grad_norm": 0.23109877109527588, + "learning_rate": 0.00026198198198198195, + "loss": 0.4396, + "step": 854 + }, + { + "epoch": 0.2563237774030354, + "grad_norm": 0.23273521661758423, + "learning_rate": 0.00026193693693693694, + "loss": 0.4579, + "step": 855 + }, + { + "epoch": 0.25662357129473484, + "grad_norm": 0.229389026761055, + "learning_rate": 0.0002618918918918919, + "loss": 0.4515, + "step": 856 + }, + { + "epoch": 0.2569233651864343, + "grad_norm": 0.24866041541099548, + "learning_rate": 0.0002618468468468468, + "loss": 0.4402, + "step": 857 + }, + { + "epoch": 0.25722315907813376, + "grad_norm": 0.24374257028102875, + "learning_rate": 0.0002618018018018018, + "loss": 0.5025, + "step": 858 + }, + { + "epoch": 0.25752295296983324, + "grad_norm": 0.2753133177757263, + "learning_rate": 0.00026175675675675674, + "loss": 0.5058, + "step": 859 + }, + { + "epoch": 0.25782274686153267, + "grad_norm": 0.236386239528656, + "learning_rate": 0.0002617117117117117, + "loss": 0.4741, + "step": 860 + }, + { + "epoch": 0.25812254075323215, + "grad_norm": 0.21907605230808258, + "learning_rate": 0.00026166666666666667, + "loss": 0.463, + "step": 861 + }, + { + "epoch": 0.2584223346449316, + "grad_norm": 0.25744542479515076, + "learning_rate": 0.0002616216216216216, + "loss": 0.4585, + "step": 862 + }, + { + "epoch": 0.25872212853663107, + "grad_norm": 0.25060373544692993, + "learning_rate": 0.0002615765765765766, + "loss": 0.476, + "step": 863 + }, + { + "epoch": 0.2590219224283305, + "grad_norm": 0.21545180678367615, + "learning_rate": 0.00026153153153153153, + "loss": 0.4379, + "step": 864 + }, + { + "epoch": 0.25932171632003, + "grad_norm": 0.2536545991897583, + "learning_rate": 0.00026148648648648647, + "loss": 0.4965, + "step": 865 + }, + { + "epoch": 0.2596215102117294, + "grad_norm": 0.22960424423217773, + "learning_rate": 0.00026144144144144146, + "loss": 0.4441, + "step": 866 + }, + { + "epoch": 0.2599213041034289, + "grad_norm": 0.22601282596588135, + "learning_rate": 0.0002613963963963964, + "loss": 0.4462, + "step": 867 + }, + { + "epoch": 0.2602210979951283, + "grad_norm": 0.23997683823108673, + "learning_rate": 0.00026135135135135133, + "loss": 0.4496, + "step": 868 + }, + { + "epoch": 0.2605208918868278, + "grad_norm": 0.24241064488887787, + "learning_rate": 0.00026130630630630627, + "loss": 0.4538, + "step": 869 + }, + { + "epoch": 0.26082068577852724, + "grad_norm": 0.2302720993757248, + "learning_rate": 0.00026126126126126126, + "loss": 0.4547, + "step": 870 + }, + { + "epoch": 0.2611204796702267, + "grad_norm": 0.2532520294189453, + "learning_rate": 0.0002612162162162162, + "loss": 0.4919, + "step": 871 + }, + { + "epoch": 0.26142027356192615, + "grad_norm": 0.2543450891971588, + "learning_rate": 0.00026117117117117113, + "loss": 0.4638, + "step": 872 + }, + { + "epoch": 0.26172006745362564, + "grad_norm": 0.24088874459266663, + "learning_rate": 0.0002611261261261261, + "loss": 0.4603, + "step": 873 + }, + { + "epoch": 0.26201986134532507, + "grad_norm": 0.22305645048618317, + "learning_rate": 0.00026108108108108106, + "loss": 0.4394, + "step": 874 + }, + { + "epoch": 0.26231965523702455, + "grad_norm": 0.261001318693161, + "learning_rate": 0.000261036036036036, + "loss": 0.5244, + "step": 875 + }, + { + "epoch": 0.262619449128724, + "grad_norm": 0.2550908029079437, + "learning_rate": 0.000260990990990991, + "loss": 0.4716, + "step": 876 + }, + { + "epoch": 0.26291924302042347, + "grad_norm": 0.2264460027217865, + "learning_rate": 0.0002609459459459459, + "loss": 0.4527, + "step": 877 + }, + { + "epoch": 0.2632190369121229, + "grad_norm": 0.2598486542701721, + "learning_rate": 0.00026090090090090086, + "loss": 0.51, + "step": 878 + }, + { + "epoch": 0.2635188308038224, + "grad_norm": 0.2528247833251953, + "learning_rate": 0.00026085585585585585, + "loss": 0.4619, + "step": 879 + }, + { + "epoch": 0.2638186246955218, + "grad_norm": 0.22703434526920319, + "learning_rate": 0.0002608108108108108, + "loss": 0.4519, + "step": 880 + }, + { + "epoch": 0.2641184185872213, + "grad_norm": 0.24291987717151642, + "learning_rate": 0.0002607657657657657, + "loss": 0.4588, + "step": 881 + }, + { + "epoch": 0.2644182124789207, + "grad_norm": 0.265899121761322, + "learning_rate": 0.0002607207207207207, + "loss": 0.4741, + "step": 882 + }, + { + "epoch": 0.2647180063706202, + "grad_norm": 0.24852798879146576, + "learning_rate": 0.00026067567567567565, + "loss": 0.4772, + "step": 883 + }, + { + "epoch": 0.26501780026231964, + "grad_norm": 0.24373799562454224, + "learning_rate": 0.0002606306306306306, + "loss": 0.475, + "step": 884 + }, + { + "epoch": 0.2653175941540191, + "grad_norm": 0.24994871020317078, + "learning_rate": 0.0002605855855855856, + "loss": 0.5094, + "step": 885 + }, + { + "epoch": 0.26561738804571855, + "grad_norm": 0.23686103522777557, + "learning_rate": 0.0002605405405405405, + "loss": 0.4583, + "step": 886 + }, + { + "epoch": 0.26591718193741803, + "grad_norm": 0.2280004322528839, + "learning_rate": 0.00026049549549549546, + "loss": 0.4474, + "step": 887 + }, + { + "epoch": 0.26621697582911746, + "grad_norm": 0.25110939145088196, + "learning_rate": 0.00026045045045045045, + "loss": 0.4722, + "step": 888 + }, + { + "epoch": 0.26651676972081695, + "grad_norm": 0.25370022654533386, + "learning_rate": 0.0002604054054054054, + "loss": 0.4651, + "step": 889 + }, + { + "epoch": 0.2668165636125164, + "grad_norm": 0.24179215729236603, + "learning_rate": 0.0002603603603603603, + "loss": 0.4727, + "step": 890 + }, + { + "epoch": 0.26711635750421586, + "grad_norm": 0.2525777518749237, + "learning_rate": 0.00026031531531531526, + "loss": 0.4681, + "step": 891 + }, + { + "epoch": 0.2674161513959153, + "grad_norm": 0.21325957775115967, + "learning_rate": 0.00026027027027027025, + "loss": 0.4374, + "step": 892 + }, + { + "epoch": 0.2677159452876148, + "grad_norm": 0.2358642816543579, + "learning_rate": 0.0002602252252252252, + "loss": 0.4492, + "step": 893 + }, + { + "epoch": 0.2680157391793142, + "grad_norm": 0.2625977694988251, + "learning_rate": 0.0002601801801801801, + "loss": 0.522, + "step": 894 + }, + { + "epoch": 0.2683155330710137, + "grad_norm": 0.22606413066387177, + "learning_rate": 0.0002601351351351351, + "loss": 0.4539, + "step": 895 + }, + { + "epoch": 0.2686153269627131, + "grad_norm": 0.24337491393089294, + "learning_rate": 0.00026009009009009005, + "loss": 0.4988, + "step": 896 + }, + { + "epoch": 0.2689151208544126, + "grad_norm": 0.23522725701332092, + "learning_rate": 0.000260045045045045, + "loss": 0.4665, + "step": 897 + }, + { + "epoch": 0.26921491474611203, + "grad_norm": 0.25222131609916687, + "learning_rate": 0.00026, + "loss": 0.4715, + "step": 898 + }, + { + "epoch": 0.2695147086378115, + "grad_norm": 0.22760646045207977, + "learning_rate": 0.0002599549549549549, + "loss": 0.4633, + "step": 899 + }, + { + "epoch": 0.26981450252951095, + "grad_norm": 0.2398597002029419, + "learning_rate": 0.00025990990990990985, + "loss": 0.4682, + "step": 900 + }, + { + "epoch": 0.27011429642121043, + "grad_norm": 0.24494816362857819, + "learning_rate": 0.00025986486486486484, + "loss": 0.4833, + "step": 901 + }, + { + "epoch": 0.27041409031290986, + "grad_norm": 0.23173199594020844, + "learning_rate": 0.0002598198198198198, + "loss": 0.4583, + "step": 902 + }, + { + "epoch": 0.27071388420460935, + "grad_norm": 0.242969810962677, + "learning_rate": 0.0002597747747747747, + "loss": 0.4748, + "step": 903 + }, + { + "epoch": 0.2710136780963088, + "grad_norm": 0.2286025583744049, + "learning_rate": 0.0002597297297297297, + "loss": 0.4802, + "step": 904 + }, + { + "epoch": 0.27131347198800826, + "grad_norm": 0.241167351603508, + "learning_rate": 0.00025968468468468464, + "loss": 0.4947, + "step": 905 + }, + { + "epoch": 0.2716132658797077, + "grad_norm": 0.2599638104438782, + "learning_rate": 0.00025963963963963963, + "loss": 0.5028, + "step": 906 + }, + { + "epoch": 0.2719130597714072, + "grad_norm": 0.22766104340553284, + "learning_rate": 0.00025959459459459457, + "loss": 0.4586, + "step": 907 + }, + { + "epoch": 0.2722128536631066, + "grad_norm": 0.24524454772472382, + "learning_rate": 0.0002595495495495495, + "loss": 0.4918, + "step": 908 + }, + { + "epoch": 0.2725126475548061, + "grad_norm": 0.24995583295822144, + "learning_rate": 0.0002595045045045045, + "loss": 0.4851, + "step": 909 + }, + { + "epoch": 0.2728124414465055, + "grad_norm": 0.24542704224586487, + "learning_rate": 0.00025945945945945944, + "loss": 0.4891, + "step": 910 + }, + { + "epoch": 0.273112235338205, + "grad_norm": 0.2262720763683319, + "learning_rate": 0.00025941441441441437, + "loss": 0.457, + "step": 911 + }, + { + "epoch": 0.27341202922990443, + "grad_norm": 0.2677282989025116, + "learning_rate": 0.00025936936936936936, + "loss": 0.4781, + "step": 912 + }, + { + "epoch": 0.2737118231216039, + "grad_norm": 0.22617483139038086, + "learning_rate": 0.0002593243243243243, + "loss": 0.4666, + "step": 913 + }, + { + "epoch": 0.27401161701330334, + "grad_norm": 0.24579745531082153, + "learning_rate": 0.00025927927927927924, + "loss": 0.4991, + "step": 914 + }, + { + "epoch": 0.27431141090500283, + "grad_norm": 0.22964158654212952, + "learning_rate": 0.00025923423423423423, + "loss": 0.4757, + "step": 915 + }, + { + "epoch": 0.27461120479670226, + "grad_norm": 0.24435275793075562, + "learning_rate": 0.00025918918918918916, + "loss": 0.4636, + "step": 916 + }, + { + "epoch": 0.27491099868840174, + "grad_norm": 0.23039105534553528, + "learning_rate": 0.0002591441441441441, + "loss": 0.4591, + "step": 917 + }, + { + "epoch": 0.27521079258010117, + "grad_norm": 0.24856770038604736, + "learning_rate": 0.0002590990990990991, + "loss": 0.4866, + "step": 918 + }, + { + "epoch": 0.27551058647180066, + "grad_norm": 0.22115269303321838, + "learning_rate": 0.00025905405405405403, + "loss": 0.4322, + "step": 919 + }, + { + "epoch": 0.2758103803635001, + "grad_norm": 0.2645402252674103, + "learning_rate": 0.000259009009009009, + "loss": 0.4891, + "step": 920 + }, + { + "epoch": 0.27611017425519957, + "grad_norm": 0.24427354335784912, + "learning_rate": 0.00025896396396396396, + "loss": 0.4636, + "step": 921 + }, + { + "epoch": 0.276409968146899, + "grad_norm": 0.23059400916099548, + "learning_rate": 0.0002589189189189189, + "loss": 0.4502, + "step": 922 + }, + { + "epoch": 0.2767097620385985, + "grad_norm": 0.21996812522411346, + "learning_rate": 0.0002588738738738739, + "loss": 0.4461, + "step": 923 + }, + { + "epoch": 0.2770095559302979, + "grad_norm": 0.24204552173614502, + "learning_rate": 0.0002588288288288288, + "loss": 0.4678, + "step": 924 + }, + { + "epoch": 0.2773093498219974, + "grad_norm": 0.26428595185279846, + "learning_rate": 0.00025878378378378376, + "loss": 0.476, + "step": 925 + }, + { + "epoch": 0.2776091437136968, + "grad_norm": 0.2542773187160492, + "learning_rate": 0.00025873873873873875, + "loss": 0.464, + "step": 926 + }, + { + "epoch": 0.2779089376053963, + "grad_norm": 0.2621975839138031, + "learning_rate": 0.0002586936936936937, + "loss": 0.4735, + "step": 927 + }, + { + "epoch": 0.27820873149709574, + "grad_norm": 0.24359507858753204, + "learning_rate": 0.0002586486486486486, + "loss": 0.4768, + "step": 928 + }, + { + "epoch": 0.2785085253887952, + "grad_norm": 0.24825096130371094, + "learning_rate": 0.0002586036036036036, + "loss": 0.471, + "step": 929 + }, + { + "epoch": 0.27880831928049465, + "grad_norm": 0.2950778007507324, + "learning_rate": 0.00025855855855855855, + "loss": 0.4625, + "step": 930 + }, + { + "epoch": 0.27910811317219414, + "grad_norm": 0.23210273683071136, + "learning_rate": 0.0002585135135135135, + "loss": 0.4325, + "step": 931 + }, + { + "epoch": 0.27940790706389357, + "grad_norm": 0.26627883315086365, + "learning_rate": 0.0002584684684684685, + "loss": 0.4903, + "step": 932 + }, + { + "epoch": 0.27970770095559305, + "grad_norm": 0.2619936466217041, + "learning_rate": 0.0002584234234234234, + "loss": 0.4777, + "step": 933 + }, + { + "epoch": 0.2800074948472925, + "grad_norm": 0.23771123588085175, + "learning_rate": 0.00025837837837837835, + "loss": 0.4319, + "step": 934 + }, + { + "epoch": 0.28030728873899197, + "grad_norm": 0.2495034784078598, + "learning_rate": 0.00025833333333333334, + "loss": 0.4503, + "step": 935 + }, + { + "epoch": 0.2806070826306914, + "grad_norm": 0.26627448201179504, + "learning_rate": 0.0002582882882882883, + "loss": 0.4501, + "step": 936 + }, + { + "epoch": 0.2809068765223909, + "grad_norm": 0.22482304275035858, + "learning_rate": 0.0002582432432432432, + "loss": 0.4615, + "step": 937 + }, + { + "epoch": 0.2812066704140903, + "grad_norm": 0.23891392350196838, + "learning_rate": 0.0002581981981981982, + "loss": 0.4732, + "step": 938 + }, + { + "epoch": 0.2815064643057898, + "grad_norm": 0.2233395278453827, + "learning_rate": 0.00025815315315315314, + "loss": 0.4521, + "step": 939 + }, + { + "epoch": 0.2818062581974892, + "grad_norm": 0.22510269284248352, + "learning_rate": 0.0002581081081081081, + "loss": 0.4307, + "step": 940 + }, + { + "epoch": 0.2821060520891887, + "grad_norm": 0.24909009039402008, + "learning_rate": 0.000258063063063063, + "loss": 0.4574, + "step": 941 + }, + { + "epoch": 0.28240584598088814, + "grad_norm": 0.2461954653263092, + "learning_rate": 0.000258018018018018, + "loss": 0.4782, + "step": 942 + }, + { + "epoch": 0.2827056398725876, + "grad_norm": 0.23391996324062347, + "learning_rate": 0.00025797297297297294, + "loss": 0.4692, + "step": 943 + }, + { + "epoch": 0.28300543376428705, + "grad_norm": 0.2419288158416748, + "learning_rate": 0.0002579279279279279, + "loss": 0.4742, + "step": 944 + }, + { + "epoch": 0.28330522765598654, + "grad_norm": 0.24654226005077362, + "learning_rate": 0.00025788288288288287, + "loss": 0.4793, + "step": 945 + }, + { + "epoch": 0.28360502154768596, + "grad_norm": 0.23362454771995544, + "learning_rate": 0.0002578378378378378, + "loss": 0.4398, + "step": 946 + }, + { + "epoch": 0.28390481543938545, + "grad_norm": 0.23269514739513397, + "learning_rate": 0.00025779279279279275, + "loss": 0.4642, + "step": 947 + }, + { + "epoch": 0.2842046093310849, + "grad_norm": 0.22531503438949585, + "learning_rate": 0.00025774774774774774, + "loss": 0.436, + "step": 948 + }, + { + "epoch": 0.28450440322278436, + "grad_norm": 0.24250438809394836, + "learning_rate": 0.0002577027027027027, + "loss": 0.4708, + "step": 949 + }, + { + "epoch": 0.2848041971144838, + "grad_norm": 0.2370329648256302, + "learning_rate": 0.0002576576576576576, + "loss": 0.4737, + "step": 950 + }, + { + "epoch": 0.2851039910061832, + "grad_norm": 0.2553395628929138, + "learning_rate": 0.0002576126126126126, + "loss": 0.4927, + "step": 951 + }, + { + "epoch": 0.2854037848978827, + "grad_norm": 0.24398140609264374, + "learning_rate": 0.00025756756756756754, + "loss": 0.4471, + "step": 952 + }, + { + "epoch": 0.28570357878958214, + "grad_norm": 0.2420070916414261, + "learning_rate": 0.0002575225225225225, + "loss": 0.4573, + "step": 953 + }, + { + "epoch": 0.2860033726812816, + "grad_norm": 0.22280406951904297, + "learning_rate": 0.00025747747747747747, + "loss": 0.4637, + "step": 954 + }, + { + "epoch": 0.28630316657298105, + "grad_norm": 0.268107146024704, + "learning_rate": 0.0002574324324324324, + "loss": 0.4751, + "step": 955 + }, + { + "epoch": 0.28660296046468053, + "grad_norm": 0.224797785282135, + "learning_rate": 0.00025738738738738734, + "loss": 0.4567, + "step": 956 + }, + { + "epoch": 0.28690275435637996, + "grad_norm": 0.2350010722875595, + "learning_rate": 0.00025734234234234233, + "loss": 0.4669, + "step": 957 + }, + { + "epoch": 0.28720254824807945, + "grad_norm": 0.23346953094005585, + "learning_rate": 0.00025729729729729727, + "loss": 0.4711, + "step": 958 + }, + { + "epoch": 0.2875023421397789, + "grad_norm": 0.26031309366226196, + "learning_rate": 0.0002572522522522522, + "loss": 0.4921, + "step": 959 + }, + { + "epoch": 0.28780213603147836, + "grad_norm": 0.21255329251289368, + "learning_rate": 0.0002572072072072072, + "loss": 0.444, + "step": 960 + }, + { + "epoch": 0.2881019299231778, + "grad_norm": 0.24799884855747223, + "learning_rate": 0.00025716216216216213, + "loss": 0.469, + "step": 961 + }, + { + "epoch": 0.2884017238148773, + "grad_norm": 0.2208838164806366, + "learning_rate": 0.00025711711711711707, + "loss": 0.442, + "step": 962 + }, + { + "epoch": 0.2887015177065767, + "grad_norm": 0.2880913317203522, + "learning_rate": 0.00025707207207207206, + "loss": 0.4424, + "step": 963 + }, + { + "epoch": 0.2890013115982762, + "grad_norm": 0.26574239134788513, + "learning_rate": 0.000257027027027027, + "loss": 0.452, + "step": 964 + }, + { + "epoch": 0.2893011054899756, + "grad_norm": 0.23267340660095215, + "learning_rate": 0.00025698198198198193, + "loss": 0.455, + "step": 965 + }, + { + "epoch": 0.2896008993816751, + "grad_norm": 0.26304900646209717, + "learning_rate": 0.0002569369369369369, + "loss": 0.5211, + "step": 966 + }, + { + "epoch": 0.28990069327337453, + "grad_norm": 0.2575905919075012, + "learning_rate": 0.00025689189189189186, + "loss": 0.483, + "step": 967 + }, + { + "epoch": 0.290200487165074, + "grad_norm": 0.22459660470485687, + "learning_rate": 0.0002568468468468468, + "loss": 0.4636, + "step": 968 + }, + { + "epoch": 0.29050028105677345, + "grad_norm": 0.220341295003891, + "learning_rate": 0.0002568018018018018, + "loss": 0.4615, + "step": 969 + }, + { + "epoch": 0.29080007494847293, + "grad_norm": 0.239531472325325, + "learning_rate": 0.0002567567567567567, + "loss": 0.4446, + "step": 970 + }, + { + "epoch": 0.29109986884017236, + "grad_norm": 0.23338812589645386, + "learning_rate": 0.00025671171171171166, + "loss": 0.4704, + "step": 971 + }, + { + "epoch": 0.29139966273187184, + "grad_norm": 0.24035978317260742, + "learning_rate": 0.00025666666666666665, + "loss": 0.4717, + "step": 972 + }, + { + "epoch": 0.2916994566235713, + "grad_norm": 0.23094506561756134, + "learning_rate": 0.0002566216216216216, + "loss": 0.4493, + "step": 973 + }, + { + "epoch": 0.29199925051527076, + "grad_norm": 0.25101473927497864, + "learning_rate": 0.0002565765765765765, + "loss": 0.4979, + "step": 974 + }, + { + "epoch": 0.2922990444069702, + "grad_norm": 0.24172839522361755, + "learning_rate": 0.0002565315315315315, + "loss": 0.4666, + "step": 975 + }, + { + "epoch": 0.2925988382986697, + "grad_norm": 0.2213078737258911, + "learning_rate": 0.00025648648648648645, + "loss": 0.4335, + "step": 976 + }, + { + "epoch": 0.2928986321903691, + "grad_norm": 0.2230203002691269, + "learning_rate": 0.00025644144144144145, + "loss": 0.4183, + "step": 977 + }, + { + "epoch": 0.2931984260820686, + "grad_norm": 0.24735258519649506, + "learning_rate": 0.0002563963963963964, + "loss": 0.4612, + "step": 978 + }, + { + "epoch": 0.293498219973768, + "grad_norm": 0.24861575663089752, + "learning_rate": 0.0002563513513513513, + "loss": 0.4713, + "step": 979 + }, + { + "epoch": 0.2937980138654675, + "grad_norm": 0.2333897352218628, + "learning_rate": 0.0002563063063063063, + "loss": 0.4502, + "step": 980 + }, + { + "epoch": 0.29409780775716693, + "grad_norm": 0.23923064768314362, + "learning_rate": 0.00025626126126126125, + "loss": 0.4574, + "step": 981 + }, + { + "epoch": 0.2943976016488664, + "grad_norm": 0.24568355083465576, + "learning_rate": 0.0002562162162162162, + "loss": 0.4358, + "step": 982 + }, + { + "epoch": 0.29469739554056584, + "grad_norm": 0.24993112683296204, + "learning_rate": 0.0002561711711711712, + "loss": 0.4663, + "step": 983 + }, + { + "epoch": 0.2949971894322653, + "grad_norm": 0.2531440854072571, + "learning_rate": 0.0002561261261261261, + "loss": 0.4964, + "step": 984 + }, + { + "epoch": 0.29529698332396476, + "grad_norm": 0.26995500922203064, + "learning_rate": 0.00025608108108108105, + "loss": 0.4971, + "step": 985 + }, + { + "epoch": 0.29559677721566424, + "grad_norm": 0.23319192230701447, + "learning_rate": 0.00025603603603603604, + "loss": 0.4605, + "step": 986 + }, + { + "epoch": 0.29589657110736367, + "grad_norm": 0.2496713548898697, + "learning_rate": 0.000255990990990991, + "loss": 0.4468, + "step": 987 + }, + { + "epoch": 0.29619636499906316, + "grad_norm": 0.23224860429763794, + "learning_rate": 0.0002559459459459459, + "loss": 0.4519, + "step": 988 + }, + { + "epoch": 0.2964961588907626, + "grad_norm": 0.24383842945098877, + "learning_rate": 0.0002559009009009009, + "loss": 0.467, + "step": 989 + }, + { + "epoch": 0.29679595278246207, + "grad_norm": 0.2240372598171234, + "learning_rate": 0.00025585585585585584, + "loss": 0.4541, + "step": 990 + }, + { + "epoch": 0.2970957466741615, + "grad_norm": 0.23554278910160065, + "learning_rate": 0.0002558108108108108, + "loss": 0.4627, + "step": 991 + }, + { + "epoch": 0.297395540565861, + "grad_norm": 0.24655288457870483, + "learning_rate": 0.00025576576576576577, + "loss": 0.4697, + "step": 992 + }, + { + "epoch": 0.2976953344575604, + "grad_norm": 0.2397495061159134, + "learning_rate": 0.0002557207207207207, + "loss": 0.4821, + "step": 993 + }, + { + "epoch": 0.2979951283492599, + "grad_norm": 0.24440963566303253, + "learning_rate": 0.00025567567567567564, + "loss": 0.4293, + "step": 994 + }, + { + "epoch": 0.2982949222409593, + "grad_norm": 0.230440154671669, + "learning_rate": 0.00025563063063063063, + "loss": 0.4592, + "step": 995 + }, + { + "epoch": 0.2985947161326588, + "grad_norm": 0.22211557626724243, + "learning_rate": 0.00025558558558558557, + "loss": 0.4324, + "step": 996 + }, + { + "epoch": 0.29889451002435824, + "grad_norm": 0.22826789319515228, + "learning_rate": 0.0002555405405405405, + "loss": 0.447, + "step": 997 + }, + { + "epoch": 0.2991943039160577, + "grad_norm": 0.24060975015163422, + "learning_rate": 0.0002554954954954955, + "loss": 0.4954, + "step": 998 + }, + { + "epoch": 0.29949409780775715, + "grad_norm": 0.2227400243282318, + "learning_rate": 0.00025545045045045043, + "loss": 0.4503, + "step": 999 + }, + { + "epoch": 0.29979389169945664, + "grad_norm": 0.23061898350715637, + "learning_rate": 0.00025540540540540537, + "loss": 0.4356, + "step": 1000 + }, + { + "epoch": 0.29979389169945664, + "eval_loss": 0.46561944484710693, + "eval_runtime": 566.7946, + "eval_samples_per_second": 3.809, + "eval_steps_per_second": 0.476, + "step": 1000 + }, + { + "epoch": 0.30009368559115607, + "grad_norm": 0.25651443004608154, + "learning_rate": 0.00025536036036036036, + "loss": 0.4813, + "step": 1001 + }, + { + "epoch": 0.30039347948285555, + "grad_norm": 0.23068277537822723, + "learning_rate": 0.0002553153153153153, + "loss": 0.4563, + "step": 1002 + }, + { + "epoch": 0.300693273374555, + "grad_norm": 0.2346974015235901, + "learning_rate": 0.00025527027027027024, + "loss": 0.4471, + "step": 1003 + }, + { + "epoch": 0.30099306726625447, + "grad_norm": 0.22291089594364166, + "learning_rate": 0.0002552252252252252, + "loss": 0.4399, + "step": 1004 + }, + { + "epoch": 0.3012928611579539, + "grad_norm": 0.24533626437187195, + "learning_rate": 0.00025518018018018016, + "loss": 0.4949, + "step": 1005 + }, + { + "epoch": 0.3015926550496534, + "grad_norm": 0.2337205857038498, + "learning_rate": 0.0002551351351351351, + "loss": 0.4815, + "step": 1006 + }, + { + "epoch": 0.3018924489413528, + "grad_norm": 0.23926278948783875, + "learning_rate": 0.0002550900900900901, + "loss": 0.4742, + "step": 1007 + }, + { + "epoch": 0.3021922428330523, + "grad_norm": 0.23630262911319733, + "learning_rate": 0.00025504504504504503, + "loss": 0.4643, + "step": 1008 + }, + { + "epoch": 0.3024920367247517, + "grad_norm": 0.2534981071949005, + "learning_rate": 0.00025499999999999996, + "loss": 0.4806, + "step": 1009 + }, + { + "epoch": 0.3027918306164512, + "grad_norm": 0.22765369713306427, + "learning_rate": 0.00025495495495495496, + "loss": 0.4562, + "step": 1010 + }, + { + "epoch": 0.30309162450815064, + "grad_norm": 0.23825423419475555, + "learning_rate": 0.0002549099099099099, + "loss": 0.4642, + "step": 1011 + }, + { + "epoch": 0.3033914183998501, + "grad_norm": 0.2275952249765396, + "learning_rate": 0.00025486486486486483, + "loss": 0.4716, + "step": 1012 + }, + { + "epoch": 0.30369121229154955, + "grad_norm": 0.23309756815433502, + "learning_rate": 0.00025481981981981977, + "loss": 0.467, + "step": 1013 + }, + { + "epoch": 0.30399100618324904, + "grad_norm": 0.2582738995552063, + "learning_rate": 0.00025477477477477476, + "loss": 0.4756, + "step": 1014 + }, + { + "epoch": 0.30429080007494846, + "grad_norm": 0.21543192863464355, + "learning_rate": 0.0002547297297297297, + "loss": 0.4184, + "step": 1015 + }, + { + "epoch": 0.30459059396664795, + "grad_norm": 0.22537867724895477, + "learning_rate": 0.00025468468468468463, + "loss": 0.451, + "step": 1016 + }, + { + "epoch": 0.3048903878583474, + "grad_norm": 0.22783374786376953, + "learning_rate": 0.0002546396396396396, + "loss": 0.4595, + "step": 1017 + }, + { + "epoch": 0.30519018175004686, + "grad_norm": 0.2382606863975525, + "learning_rate": 0.00025459459459459456, + "loss": 0.4465, + "step": 1018 + }, + { + "epoch": 0.3054899756417463, + "grad_norm": 0.23583681881427765, + "learning_rate": 0.0002545495495495495, + "loss": 0.453, + "step": 1019 + }, + { + "epoch": 0.3057897695334458, + "grad_norm": 0.24536754190921783, + "learning_rate": 0.0002545045045045045, + "loss": 0.4829, + "step": 1020 + }, + { + "epoch": 0.3060895634251452, + "grad_norm": 0.21961447596549988, + "learning_rate": 0.0002544594594594594, + "loss": 0.4555, + "step": 1021 + }, + { + "epoch": 0.3063893573168447, + "grad_norm": 0.22678659856319427, + "learning_rate": 0.00025441441441441436, + "loss": 0.4423, + "step": 1022 + }, + { + "epoch": 0.3066891512085441, + "grad_norm": 0.23871202766895294, + "learning_rate": 0.00025436936936936935, + "loss": 0.4657, + "step": 1023 + }, + { + "epoch": 0.3069889451002436, + "grad_norm": 0.23637500405311584, + "learning_rate": 0.0002543243243243243, + "loss": 0.4665, + "step": 1024 + }, + { + "epoch": 0.30728873899194303, + "grad_norm": 0.22538509964942932, + "learning_rate": 0.0002542792792792792, + "loss": 0.4349, + "step": 1025 + }, + { + "epoch": 0.3075885328836425, + "grad_norm": 0.2449694573879242, + "learning_rate": 0.0002542342342342342, + "loss": 0.465, + "step": 1026 + }, + { + "epoch": 0.30788832677534195, + "grad_norm": 0.22211913764476776, + "learning_rate": 0.00025418918918918915, + "loss": 0.4276, + "step": 1027 + }, + { + "epoch": 0.30818812066704143, + "grad_norm": 0.25039684772491455, + "learning_rate": 0.0002541441441441441, + "loss": 0.4799, + "step": 1028 + }, + { + "epoch": 0.30848791455874086, + "grad_norm": 0.22903893887996674, + "learning_rate": 0.0002540990990990991, + "loss": 0.4679, + "step": 1029 + }, + { + "epoch": 0.30878770845044035, + "grad_norm": 0.2479073703289032, + "learning_rate": 0.000254054054054054, + "loss": 0.4722, + "step": 1030 + }, + { + "epoch": 0.3090875023421398, + "grad_norm": 0.22166845202445984, + "learning_rate": 0.00025400900900900895, + "loss": 0.4354, + "step": 1031 + }, + { + "epoch": 0.30938729623383926, + "grad_norm": 0.22735866904258728, + "learning_rate": 0.00025396396396396394, + "loss": 0.4344, + "step": 1032 + }, + { + "epoch": 0.3096870901255387, + "grad_norm": 0.2471131831407547, + "learning_rate": 0.0002539189189189189, + "loss": 0.4751, + "step": 1033 + }, + { + "epoch": 0.3099868840172382, + "grad_norm": 0.22858619689941406, + "learning_rate": 0.00025387387387387387, + "loss": 0.4589, + "step": 1034 + }, + { + "epoch": 0.3102866779089376, + "grad_norm": 0.2418094426393509, + "learning_rate": 0.0002538288288288288, + "loss": 0.4873, + "step": 1035 + }, + { + "epoch": 0.3105864718006371, + "grad_norm": 0.2313918024301529, + "learning_rate": 0.00025378378378378374, + "loss": 0.4819, + "step": 1036 + }, + { + "epoch": 0.3108862656923365, + "grad_norm": 0.22201773524284363, + "learning_rate": 0.00025373873873873874, + "loss": 0.4553, + "step": 1037 + }, + { + "epoch": 0.311186059584036, + "grad_norm": 0.2284671664237976, + "learning_rate": 0.00025369369369369367, + "loss": 0.4485, + "step": 1038 + }, + { + "epoch": 0.31148585347573543, + "grad_norm": 0.2529887855052948, + "learning_rate": 0.0002536486486486486, + "loss": 0.4674, + "step": 1039 + }, + { + "epoch": 0.3117856473674349, + "grad_norm": 0.23350189626216888, + "learning_rate": 0.0002536036036036036, + "loss": 0.4835, + "step": 1040 + }, + { + "epoch": 0.31208544125913434, + "grad_norm": 0.23258428275585175, + "learning_rate": 0.00025355855855855854, + "loss": 0.4458, + "step": 1041 + }, + { + "epoch": 0.31238523515083383, + "grad_norm": 0.23113112151622772, + "learning_rate": 0.0002535135135135135, + "loss": 0.4581, + "step": 1042 + }, + { + "epoch": 0.31268502904253326, + "grad_norm": 0.22711153328418732, + "learning_rate": 0.00025346846846846846, + "loss": 0.4485, + "step": 1043 + }, + { + "epoch": 0.31298482293423274, + "grad_norm": 0.23305024206638336, + "learning_rate": 0.0002534234234234234, + "loss": 0.4316, + "step": 1044 + }, + { + "epoch": 0.31328461682593217, + "grad_norm": 0.24723917245864868, + "learning_rate": 0.0002533783783783784, + "loss": 0.4512, + "step": 1045 + }, + { + "epoch": 0.3135844107176316, + "grad_norm": 0.21640846133232117, + "learning_rate": 0.00025333333333333333, + "loss": 0.4485, + "step": 1046 + }, + { + "epoch": 0.3138842046093311, + "grad_norm": 0.25021156668663025, + "learning_rate": 0.00025328828828828827, + "loss": 0.4708, + "step": 1047 + }, + { + "epoch": 0.3141839985010305, + "grad_norm": 0.24005773663520813, + "learning_rate": 0.00025324324324324326, + "loss": 0.4698, + "step": 1048 + }, + { + "epoch": 0.31448379239273, + "grad_norm": 0.24885396659374237, + "learning_rate": 0.0002531981981981982, + "loss": 0.4899, + "step": 1049 + }, + { + "epoch": 0.31478358628442943, + "grad_norm": 0.2413524091243744, + "learning_rate": 0.00025315315315315313, + "loss": 0.4776, + "step": 1050 + }, + { + "epoch": 0.3150833801761289, + "grad_norm": 0.25239062309265137, + "learning_rate": 0.0002531081081081081, + "loss": 0.4788, + "step": 1051 + }, + { + "epoch": 0.31538317406782834, + "grad_norm": 0.23389939963817596, + "learning_rate": 0.00025306306306306306, + "loss": 0.4299, + "step": 1052 + }, + { + "epoch": 0.3156829679595278, + "grad_norm": 0.2468218207359314, + "learning_rate": 0.000253018018018018, + "loss": 0.4759, + "step": 1053 + }, + { + "epoch": 0.31598276185122726, + "grad_norm": 0.2298142910003662, + "learning_rate": 0.000252972972972973, + "loss": 0.4658, + "step": 1054 + }, + { + "epoch": 0.31628255574292674, + "grad_norm": 0.22888216376304626, + "learning_rate": 0.0002529279279279279, + "loss": 0.4419, + "step": 1055 + }, + { + "epoch": 0.31658234963462617, + "grad_norm": 0.23855063319206238, + "learning_rate": 0.00025288288288288286, + "loss": 0.4642, + "step": 1056 + }, + { + "epoch": 0.31688214352632565, + "grad_norm": 0.24454447627067566, + "learning_rate": 0.00025283783783783785, + "loss": 0.4495, + "step": 1057 + }, + { + "epoch": 0.3171819374180251, + "grad_norm": 0.22794046998023987, + "learning_rate": 0.0002527927927927928, + "loss": 0.4474, + "step": 1058 + }, + { + "epoch": 0.31748173130972457, + "grad_norm": 0.248634934425354, + "learning_rate": 0.0002527477477477477, + "loss": 0.4694, + "step": 1059 + }, + { + "epoch": 0.317781525201424, + "grad_norm": 0.24363334476947784, + "learning_rate": 0.00025270270270270266, + "loss": 0.4549, + "step": 1060 + }, + { + "epoch": 0.3180813190931235, + "grad_norm": 0.23220765590667725, + "learning_rate": 0.00025265765765765765, + "loss": 0.4478, + "step": 1061 + }, + { + "epoch": 0.3183811129848229, + "grad_norm": 0.22161665558815002, + "learning_rate": 0.0002526126126126126, + "loss": 0.4549, + "step": 1062 + }, + { + "epoch": 0.3186809068765224, + "grad_norm": 0.24613521993160248, + "learning_rate": 0.0002525675675675675, + "loss": 0.4505, + "step": 1063 + }, + { + "epoch": 0.3189807007682218, + "grad_norm": 0.26228928565979004, + "learning_rate": 0.0002525225225225225, + "loss": 0.4878, + "step": 1064 + }, + { + "epoch": 0.3192804946599213, + "grad_norm": 0.2279721200466156, + "learning_rate": 0.00025247747747747745, + "loss": 0.4481, + "step": 1065 + }, + { + "epoch": 0.31958028855162074, + "grad_norm": 0.24583470821380615, + "learning_rate": 0.0002524324324324324, + "loss": 0.4643, + "step": 1066 + }, + { + "epoch": 0.3198800824433202, + "grad_norm": 0.24150992929935455, + "learning_rate": 0.0002523873873873874, + "loss": 0.4705, + "step": 1067 + }, + { + "epoch": 0.32017987633501965, + "grad_norm": 0.2419997900724411, + "learning_rate": 0.0002523423423423423, + "loss": 0.4648, + "step": 1068 + }, + { + "epoch": 0.32047967022671914, + "grad_norm": 0.26776254177093506, + "learning_rate": 0.00025229729729729725, + "loss": 0.5013, + "step": 1069 + }, + { + "epoch": 0.32077946411841857, + "grad_norm": 0.24678350985050201, + "learning_rate": 0.00025225225225225225, + "loss": 0.449, + "step": 1070 + }, + { + "epoch": 0.32107925801011805, + "grad_norm": 0.24101199209690094, + "learning_rate": 0.0002522072072072072, + "loss": 0.469, + "step": 1071 + }, + { + "epoch": 0.3213790519018175, + "grad_norm": 0.24230711162090302, + "learning_rate": 0.0002521621621621621, + "loss": 0.481, + "step": 1072 + }, + { + "epoch": 0.32167884579351697, + "grad_norm": 0.22988542914390564, + "learning_rate": 0.0002521171171171171, + "loss": 0.4442, + "step": 1073 + }, + { + "epoch": 0.3219786396852164, + "grad_norm": 0.23284588754177094, + "learning_rate": 0.00025207207207207205, + "loss": 0.4392, + "step": 1074 + }, + { + "epoch": 0.3222784335769159, + "grad_norm": 0.24554894864559174, + "learning_rate": 0.000252027027027027, + "loss": 0.4544, + "step": 1075 + }, + { + "epoch": 0.3225782274686153, + "grad_norm": 0.22776508331298828, + "learning_rate": 0.000251981981981982, + "loss": 0.4503, + "step": 1076 + }, + { + "epoch": 0.3228780213603148, + "grad_norm": 0.2508374750614166, + "learning_rate": 0.0002519369369369369, + "loss": 0.4779, + "step": 1077 + }, + { + "epoch": 0.3231778152520142, + "grad_norm": 0.22543244063854218, + "learning_rate": 0.00025189189189189185, + "loss": 0.4672, + "step": 1078 + }, + { + "epoch": 0.3234776091437137, + "grad_norm": 0.2409958839416504, + "learning_rate": 0.00025184684684684684, + "loss": 0.4631, + "step": 1079 + }, + { + "epoch": 0.32377740303541314, + "grad_norm": 0.2308938056230545, + "learning_rate": 0.0002518018018018018, + "loss": 0.4244, + "step": 1080 + }, + { + "epoch": 0.3240771969271126, + "grad_norm": 0.2354745715856552, + "learning_rate": 0.0002517567567567567, + "loss": 0.4499, + "step": 1081 + }, + { + "epoch": 0.32437699081881205, + "grad_norm": 0.24564653635025024, + "learning_rate": 0.0002517117117117117, + "loss": 0.4655, + "step": 1082 + }, + { + "epoch": 0.32467678471051153, + "grad_norm": 0.2388393133878708, + "learning_rate": 0.00025166666666666664, + "loss": 0.4747, + "step": 1083 + }, + { + "epoch": 0.32497657860221096, + "grad_norm": 0.23941588401794434, + "learning_rate": 0.0002516216216216216, + "loss": 0.4646, + "step": 1084 + }, + { + "epoch": 0.32527637249391045, + "grad_norm": 0.24191126227378845, + "learning_rate": 0.0002515765765765765, + "loss": 0.4686, + "step": 1085 + }, + { + "epoch": 0.3255761663856099, + "grad_norm": 0.2466372847557068, + "learning_rate": 0.0002515315315315315, + "loss": 0.4989, + "step": 1086 + }, + { + "epoch": 0.32587596027730936, + "grad_norm": 0.2441006302833557, + "learning_rate": 0.00025148648648648644, + "loss": 0.4562, + "step": 1087 + }, + { + "epoch": 0.3261757541690088, + "grad_norm": 0.26483842730522156, + "learning_rate": 0.0002514414414414414, + "loss": 0.4872, + "step": 1088 + }, + { + "epoch": 0.3264755480607083, + "grad_norm": 0.2481161653995514, + "learning_rate": 0.00025139639639639637, + "loss": 0.4486, + "step": 1089 + }, + { + "epoch": 0.3267753419524077, + "grad_norm": 0.23705454170703888, + "learning_rate": 0.0002513513513513513, + "loss": 0.4407, + "step": 1090 + }, + { + "epoch": 0.3270751358441072, + "grad_norm": 0.25678539276123047, + "learning_rate": 0.0002513063063063063, + "loss": 0.4899, + "step": 1091 + }, + { + "epoch": 0.3273749297358066, + "grad_norm": 0.22578591108322144, + "learning_rate": 0.00025126126126126123, + "loss": 0.4326, + "step": 1092 + }, + { + "epoch": 0.3276747236275061, + "grad_norm": 0.23661458492279053, + "learning_rate": 0.00025121621621621617, + "loss": 0.4649, + "step": 1093 + }, + { + "epoch": 0.32797451751920553, + "grad_norm": 0.2496035248041153, + "learning_rate": 0.00025117117117117116, + "loss": 0.4978, + "step": 1094 + }, + { + "epoch": 0.328274311410905, + "grad_norm": 0.225214421749115, + "learning_rate": 0.0002511261261261261, + "loss": 0.4608, + "step": 1095 + }, + { + "epoch": 0.32857410530260445, + "grad_norm": 0.24089965224266052, + "learning_rate": 0.00025108108108108103, + "loss": 0.4854, + "step": 1096 + }, + { + "epoch": 0.32887389919430393, + "grad_norm": 0.23737536370754242, + "learning_rate": 0.000251036036036036, + "loss": 0.4692, + "step": 1097 + }, + { + "epoch": 0.32917369308600336, + "grad_norm": 0.23569715023040771, + "learning_rate": 0.00025099099099099096, + "loss": 0.44, + "step": 1098 + }, + { + "epoch": 0.32947348697770285, + "grad_norm": 0.22477473318576813, + "learning_rate": 0.0002509459459459459, + "loss": 0.4168, + "step": 1099 + }, + { + "epoch": 0.3297732808694023, + "grad_norm": 0.25336310267448425, + "learning_rate": 0.0002509009009009009, + "loss": 0.4493, + "step": 1100 + }, + { + "epoch": 0.33007307476110176, + "grad_norm": 0.2452186793088913, + "learning_rate": 0.00025085585585585583, + "loss": 0.484, + "step": 1101 + }, + { + "epoch": 0.3303728686528012, + "grad_norm": 0.23870813846588135, + "learning_rate": 0.0002508108108108108, + "loss": 0.4228, + "step": 1102 + }, + { + "epoch": 0.3306726625445007, + "grad_norm": 0.2262556552886963, + "learning_rate": 0.00025076576576576575, + "loss": 0.4525, + "step": 1103 + }, + { + "epoch": 0.3309724564362001, + "grad_norm": 0.2720157504081726, + "learning_rate": 0.0002507207207207207, + "loss": 0.4961, + "step": 1104 + }, + { + "epoch": 0.3312722503278996, + "grad_norm": 0.23624925315380096, + "learning_rate": 0.0002506756756756757, + "loss": 0.4524, + "step": 1105 + }, + { + "epoch": 0.331572044219599, + "grad_norm": 0.2634661793708801, + "learning_rate": 0.0002506306306306306, + "loss": 0.4603, + "step": 1106 + }, + { + "epoch": 0.3318718381112985, + "grad_norm": 0.24629558622837067, + "learning_rate": 0.00025058558558558556, + "loss": 0.453, + "step": 1107 + }, + { + "epoch": 0.33217163200299793, + "grad_norm": 0.2503926157951355, + "learning_rate": 0.00025054054054054055, + "loss": 0.4756, + "step": 1108 + }, + { + "epoch": 0.3324714258946974, + "grad_norm": 0.22624404728412628, + "learning_rate": 0.0002504954954954955, + "loss": 0.4391, + "step": 1109 + }, + { + "epoch": 0.33277121978639684, + "grad_norm": 0.22146141529083252, + "learning_rate": 0.0002504504504504504, + "loss": 0.4496, + "step": 1110 + }, + { + "epoch": 0.33307101367809633, + "grad_norm": 0.23911216855049133, + "learning_rate": 0.0002504054054054054, + "loss": 0.4584, + "step": 1111 + }, + { + "epoch": 0.33337080756979576, + "grad_norm": 0.23537759482860565, + "learning_rate": 0.00025036036036036035, + "loss": 0.4735, + "step": 1112 + }, + { + "epoch": 0.33367060146149524, + "grad_norm": 0.23113307356834412, + "learning_rate": 0.0002503153153153153, + "loss": 0.4393, + "step": 1113 + }, + { + "epoch": 0.33397039535319467, + "grad_norm": 0.24185238778591156, + "learning_rate": 0.0002502702702702703, + "loss": 0.4661, + "step": 1114 + }, + { + "epoch": 0.33427018924489416, + "grad_norm": 0.25618115067481995, + "learning_rate": 0.0002502252252252252, + "loss": 0.4777, + "step": 1115 + }, + { + "epoch": 0.3345699831365936, + "grad_norm": 0.24486567080020905, + "learning_rate": 0.00025018018018018015, + "loss": 0.4589, + "step": 1116 + }, + { + "epoch": 0.33486977702829307, + "grad_norm": 0.2608473300933838, + "learning_rate": 0.00025013513513513514, + "loss": 0.4716, + "step": 1117 + }, + { + "epoch": 0.3351695709199925, + "grad_norm": 0.2427588254213333, + "learning_rate": 0.0002500900900900901, + "loss": 0.4856, + "step": 1118 + }, + { + "epoch": 0.335469364811692, + "grad_norm": 0.2493797391653061, + "learning_rate": 0.000250045045045045, + "loss": 0.4925, + "step": 1119 + }, + { + "epoch": 0.3357691587033914, + "grad_norm": 0.2610112428665161, + "learning_rate": 0.00025, + "loss": 0.4685, + "step": 1120 + }, + { + "epoch": 0.3360689525950909, + "grad_norm": 0.2435634434223175, + "learning_rate": 0.00024995495495495494, + "loss": 0.4678, + "step": 1121 + }, + { + "epoch": 0.3363687464867903, + "grad_norm": 0.24447932839393616, + "learning_rate": 0.0002499099099099099, + "loss": 0.4656, + "step": 1122 + }, + { + "epoch": 0.3366685403784898, + "grad_norm": 0.24005521833896637, + "learning_rate": 0.00024986486486486487, + "loss": 0.4712, + "step": 1123 + }, + { + "epoch": 0.33696833427018924, + "grad_norm": 0.2303832322359085, + "learning_rate": 0.0002498198198198198, + "loss": 0.4527, + "step": 1124 + }, + { + "epoch": 0.3372681281618887, + "grad_norm": 0.221012681722641, + "learning_rate": 0.00024977477477477474, + "loss": 0.4376, + "step": 1125 + }, + { + "epoch": 0.33756792205358815, + "grad_norm": 0.23421809077262878, + "learning_rate": 0.00024972972972972973, + "loss": 0.4449, + "step": 1126 + }, + { + "epoch": 0.33786771594528764, + "grad_norm": 0.23941418528556824, + "learning_rate": 0.00024968468468468467, + "loss": 0.4679, + "step": 1127 + }, + { + "epoch": 0.33816750983698707, + "grad_norm": 0.2479025423526764, + "learning_rate": 0.0002496396396396396, + "loss": 0.4872, + "step": 1128 + }, + { + "epoch": 0.33846730372868655, + "grad_norm": 0.24516451358795166, + "learning_rate": 0.0002495945945945946, + "loss": 0.4488, + "step": 1129 + }, + { + "epoch": 0.338767097620386, + "grad_norm": 0.2436760663986206, + "learning_rate": 0.00024954954954954954, + "loss": 0.4477, + "step": 1130 + }, + { + "epoch": 0.33906689151208547, + "grad_norm": 0.23894813656806946, + "learning_rate": 0.00024950450450450447, + "loss": 0.4295, + "step": 1131 + }, + { + "epoch": 0.3393666854037849, + "grad_norm": 0.24569731950759888, + "learning_rate": 0.00024945945945945946, + "loss": 0.46, + "step": 1132 + }, + { + "epoch": 0.3396664792954844, + "grad_norm": 0.24807578325271606, + "learning_rate": 0.0002494144144144144, + "loss": 0.4512, + "step": 1133 + }, + { + "epoch": 0.3399662731871838, + "grad_norm": 0.23641791939735413, + "learning_rate": 0.00024936936936936934, + "loss": 0.454, + "step": 1134 + }, + { + "epoch": 0.3402660670788833, + "grad_norm": 0.26076388359069824, + "learning_rate": 0.0002493243243243243, + "loss": 0.4771, + "step": 1135 + }, + { + "epoch": 0.3405658609705827, + "grad_norm": 0.24686305224895477, + "learning_rate": 0.00024927927927927926, + "loss": 0.4783, + "step": 1136 + }, + { + "epoch": 0.3408656548622822, + "grad_norm": 0.2262791097164154, + "learning_rate": 0.0002492342342342342, + "loss": 0.4272, + "step": 1137 + }, + { + "epoch": 0.34116544875398164, + "grad_norm": 0.23418664932250977, + "learning_rate": 0.00024918918918918914, + "loss": 0.4666, + "step": 1138 + }, + { + "epoch": 0.3414652426456811, + "grad_norm": 0.23737958073616028, + "learning_rate": 0.00024914414414414413, + "loss": 0.4433, + "step": 1139 + }, + { + "epoch": 0.34176503653738055, + "grad_norm": 0.2579478919506073, + "learning_rate": 0.00024909909909909907, + "loss": 0.4702, + "step": 1140 + }, + { + "epoch": 0.34206483042908, + "grad_norm": 0.2627730667591095, + "learning_rate": 0.000249054054054054, + "loss": 0.4607, + "step": 1141 + }, + { + "epoch": 0.34236462432077946, + "grad_norm": 0.2283281534910202, + "learning_rate": 0.000249009009009009, + "loss": 0.4462, + "step": 1142 + }, + { + "epoch": 0.3426644182124789, + "grad_norm": 0.24083632230758667, + "learning_rate": 0.00024896396396396393, + "loss": 0.4663, + "step": 1143 + }, + { + "epoch": 0.3429642121041784, + "grad_norm": 0.24331289529800415, + "learning_rate": 0.00024891891891891887, + "loss": 0.4414, + "step": 1144 + }, + { + "epoch": 0.3432640059958778, + "grad_norm": 0.24059659242630005, + "learning_rate": 0.00024887387387387386, + "loss": 0.4728, + "step": 1145 + }, + { + "epoch": 0.3435637998875773, + "grad_norm": 0.23175103962421417, + "learning_rate": 0.0002488288288288288, + "loss": 0.437, + "step": 1146 + }, + { + "epoch": 0.3438635937792767, + "grad_norm": 0.23247724771499634, + "learning_rate": 0.00024878378378378373, + "loss": 0.4454, + "step": 1147 + }, + { + "epoch": 0.3441633876709762, + "grad_norm": 0.22808894515037537, + "learning_rate": 0.0002487387387387387, + "loss": 0.4379, + "step": 1148 + }, + { + "epoch": 0.34446318156267564, + "grad_norm": 0.2515697777271271, + "learning_rate": 0.00024869369369369366, + "loss": 0.4938, + "step": 1149 + }, + { + "epoch": 0.3447629754543751, + "grad_norm": 0.22830873727798462, + "learning_rate": 0.0002486486486486486, + "loss": 0.4335, + "step": 1150 + }, + { + "epoch": 0.34506276934607455, + "grad_norm": 0.2352674901485443, + "learning_rate": 0.0002486036036036036, + "loss": 0.4661, + "step": 1151 + }, + { + "epoch": 0.34536256323777403, + "grad_norm": 0.233395054936409, + "learning_rate": 0.0002485585585585585, + "loss": 0.4429, + "step": 1152 + }, + { + "epoch": 0.34566235712947346, + "grad_norm": 0.2488911747932434, + "learning_rate": 0.00024851351351351346, + "loss": 0.4832, + "step": 1153 + }, + { + "epoch": 0.34596215102117295, + "grad_norm": 0.2402927577495575, + "learning_rate": 0.00024846846846846845, + "loss": 0.4633, + "step": 1154 + }, + { + "epoch": 0.3462619449128724, + "grad_norm": 0.23628897964954376, + "learning_rate": 0.0002484234234234234, + "loss": 0.4683, + "step": 1155 + }, + { + "epoch": 0.34656173880457186, + "grad_norm": 0.23966571688652039, + "learning_rate": 0.0002483783783783783, + "loss": 0.4716, + "step": 1156 + }, + { + "epoch": 0.3468615326962713, + "grad_norm": 0.23786477744579315, + "learning_rate": 0.0002483333333333333, + "loss": 0.4829, + "step": 1157 + }, + { + "epoch": 0.3471613265879708, + "grad_norm": 0.24706590175628662, + "learning_rate": 0.00024828828828828825, + "loss": 0.4741, + "step": 1158 + }, + { + "epoch": 0.3474611204796702, + "grad_norm": 0.25192269682884216, + "learning_rate": 0.00024824324324324324, + "loss": 0.4725, + "step": 1159 + }, + { + "epoch": 0.3477609143713697, + "grad_norm": 0.2672080099582672, + "learning_rate": 0.0002481981981981982, + "loss": 0.4677, + "step": 1160 + }, + { + "epoch": 0.3480607082630691, + "grad_norm": 0.23710590600967407, + "learning_rate": 0.0002481531531531531, + "loss": 0.4629, + "step": 1161 + }, + { + "epoch": 0.3483605021547686, + "grad_norm": 0.2306007295846939, + "learning_rate": 0.0002481081081081081, + "loss": 0.4425, + "step": 1162 + }, + { + "epoch": 0.34866029604646803, + "grad_norm": 0.2416974902153015, + "learning_rate": 0.00024806306306306305, + "loss": 0.4452, + "step": 1163 + }, + { + "epoch": 0.3489600899381675, + "grad_norm": 0.2495068907737732, + "learning_rate": 0.000248018018018018, + "loss": 0.4875, + "step": 1164 + }, + { + "epoch": 0.34925988382986695, + "grad_norm": 0.26994964480400085, + "learning_rate": 0.00024797297297297297, + "loss": 0.4957, + "step": 1165 + }, + { + "epoch": 0.34955967772156643, + "grad_norm": 0.2546437978744507, + "learning_rate": 0.0002479279279279279, + "loss": 0.498, + "step": 1166 + }, + { + "epoch": 0.34985947161326586, + "grad_norm": 0.2271340936422348, + "learning_rate": 0.00024788288288288285, + "loss": 0.4615, + "step": 1167 + }, + { + "epoch": 0.35015926550496534, + "grad_norm": 0.24870999157428741, + "learning_rate": 0.00024783783783783784, + "loss": 0.4688, + "step": 1168 + }, + { + "epoch": 0.3504590593966648, + "grad_norm": 0.23978911340236664, + "learning_rate": 0.0002477927927927928, + "loss": 0.4882, + "step": 1169 + }, + { + "epoch": 0.35075885328836426, + "grad_norm": 0.2773337662220001, + "learning_rate": 0.0002477477477477477, + "loss": 0.4695, + "step": 1170 + }, + { + "epoch": 0.3510586471800637, + "grad_norm": 0.24570350348949432, + "learning_rate": 0.0002477027027027027, + "loss": 0.4679, + "step": 1171 + }, + { + "epoch": 0.3513584410717632, + "grad_norm": 0.25563982129096985, + "learning_rate": 0.00024765765765765764, + "loss": 0.4731, + "step": 1172 + }, + { + "epoch": 0.3516582349634626, + "grad_norm": 0.23189115524291992, + "learning_rate": 0.00024761261261261263, + "loss": 0.4476, + "step": 1173 + }, + { + "epoch": 0.3519580288551621, + "grad_norm": 0.24074453115463257, + "learning_rate": 0.00024756756756756757, + "loss": 0.4419, + "step": 1174 + }, + { + "epoch": 0.3522578227468615, + "grad_norm": 0.2376662790775299, + "learning_rate": 0.0002475225225225225, + "loss": 0.4571, + "step": 1175 + }, + { + "epoch": 0.352557616638561, + "grad_norm": 0.2344047725200653, + "learning_rate": 0.0002474774774774775, + "loss": 0.4389, + "step": 1176 + }, + { + "epoch": 0.35285741053026043, + "grad_norm": 0.23310165107250214, + "learning_rate": 0.00024743243243243243, + "loss": 0.4583, + "step": 1177 + }, + { + "epoch": 0.3531572044219599, + "grad_norm": 0.21277011930942535, + "learning_rate": 0.00024738738738738737, + "loss": 0.4306, + "step": 1178 + }, + { + "epoch": 0.35345699831365934, + "grad_norm": 0.23581352829933167, + "learning_rate": 0.00024734234234234236, + "loss": 0.475, + "step": 1179 + }, + { + "epoch": 0.3537567922053588, + "grad_norm": 0.23194879293441772, + "learning_rate": 0.0002472972972972973, + "loss": 0.4518, + "step": 1180 + }, + { + "epoch": 0.35405658609705826, + "grad_norm": 0.22603453695774078, + "learning_rate": 0.00024725225225225223, + "loss": 0.4464, + "step": 1181 + }, + { + "epoch": 0.35435637998875774, + "grad_norm": 0.23987054824829102, + "learning_rate": 0.00024720720720720717, + "loss": 0.4586, + "step": 1182 + }, + { + "epoch": 0.35465617388045717, + "grad_norm": 0.22986359894275665, + "learning_rate": 0.00024716216216216216, + "loss": 0.4664, + "step": 1183 + }, + { + "epoch": 0.35495596777215666, + "grad_norm": 0.22636739909648895, + "learning_rate": 0.0002471171171171171, + "loss": 0.4236, + "step": 1184 + }, + { + "epoch": 0.3552557616638561, + "grad_norm": 0.2346397340297699, + "learning_rate": 0.00024707207207207203, + "loss": 0.4703, + "step": 1185 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.2564719617366791, + "learning_rate": 0.000247027027027027, + "loss": 0.4775, + "step": 1186 + }, + { + "epoch": 0.355855349447255, + "grad_norm": 0.22305525839328766, + "learning_rate": 0.00024698198198198196, + "loss": 0.4694, + "step": 1187 + }, + { + "epoch": 0.3561551433389545, + "grad_norm": 0.2369467169046402, + "learning_rate": 0.0002469369369369369, + "loss": 0.4498, + "step": 1188 + }, + { + "epoch": 0.3564549372306539, + "grad_norm": 0.25123798847198486, + "learning_rate": 0.0002468918918918919, + "loss": 0.4619, + "step": 1189 + }, + { + "epoch": 0.3567547311223534, + "grad_norm": 0.21925069391727448, + "learning_rate": 0.0002468468468468468, + "loss": 0.4498, + "step": 1190 + }, + { + "epoch": 0.3570545250140528, + "grad_norm": 0.2385261207818985, + "learning_rate": 0.00024680180180180176, + "loss": 0.4537, + "step": 1191 + }, + { + "epoch": 0.3573543189057523, + "grad_norm": 0.23894301056861877, + "learning_rate": 0.00024675675675675675, + "loss": 0.4665, + "step": 1192 + }, + { + "epoch": 0.35765411279745174, + "grad_norm": 0.23315206170082092, + "learning_rate": 0.0002467117117117117, + "loss": 0.4421, + "step": 1193 + }, + { + "epoch": 0.3579539066891512, + "grad_norm": 0.23406696319580078, + "learning_rate": 0.0002466666666666666, + "loss": 0.4377, + "step": 1194 + }, + { + "epoch": 0.35825370058085065, + "grad_norm": 0.25852885842323303, + "learning_rate": 0.0002466216216216216, + "loss": 0.4838, + "step": 1195 + }, + { + "epoch": 0.35855349447255014, + "grad_norm": 0.24008771777153015, + "learning_rate": 0.00024657657657657655, + "loss": 0.4733, + "step": 1196 + }, + { + "epoch": 0.35885328836424957, + "grad_norm": 0.228665292263031, + "learning_rate": 0.0002465315315315315, + "loss": 0.4753, + "step": 1197 + }, + { + "epoch": 0.35915308225594905, + "grad_norm": 0.2344791293144226, + "learning_rate": 0.0002464864864864865, + "loss": 0.4484, + "step": 1198 + }, + { + "epoch": 0.3594528761476485, + "grad_norm": 0.22843588888645172, + "learning_rate": 0.0002464414414414414, + "loss": 0.4349, + "step": 1199 + }, + { + "epoch": 0.35975267003934797, + "grad_norm": 0.23127557337284088, + "learning_rate": 0.00024639639639639636, + "loss": 0.4466, + "step": 1200 + }, + { + "epoch": 0.3600524639310474, + "grad_norm": 0.2092585414648056, + "learning_rate": 0.00024635135135135135, + "loss": 0.4101, + "step": 1201 + }, + { + "epoch": 0.3603522578227469, + "grad_norm": 0.24416851997375488, + "learning_rate": 0.0002463063063063063, + "loss": 0.4507, + "step": 1202 + }, + { + "epoch": 0.3606520517144463, + "grad_norm": 0.2409181445837021, + "learning_rate": 0.0002462612612612612, + "loss": 0.4641, + "step": 1203 + }, + { + "epoch": 0.3609518456061458, + "grad_norm": 0.2390405684709549, + "learning_rate": 0.0002462162162162162, + "loss": 0.4398, + "step": 1204 + }, + { + "epoch": 0.3612516394978452, + "grad_norm": 0.25821688771247864, + "learning_rate": 0.00024617117117117115, + "loss": 0.4441, + "step": 1205 + }, + { + "epoch": 0.3615514333895447, + "grad_norm": 0.24180777370929718, + "learning_rate": 0.0002461261261261261, + "loss": 0.4852, + "step": 1206 + }, + { + "epoch": 0.36185122728124414, + "grad_norm": 0.2260608822107315, + "learning_rate": 0.000246081081081081, + "loss": 0.4214, + "step": 1207 + }, + { + "epoch": 0.3621510211729436, + "grad_norm": 0.2266250103712082, + "learning_rate": 0.000246036036036036, + "loss": 0.4104, + "step": 1208 + }, + { + "epoch": 0.36245081506464305, + "grad_norm": 0.247540682554245, + "learning_rate": 0.00024599099099099095, + "loss": 0.4563, + "step": 1209 + }, + { + "epoch": 0.36275060895634254, + "grad_norm": 0.22714072465896606, + "learning_rate": 0.0002459459459459459, + "loss": 0.4528, + "step": 1210 + }, + { + "epoch": 0.36305040284804196, + "grad_norm": 0.22302433848381042, + "learning_rate": 0.0002459009009009009, + "loss": 0.3982, + "step": 1211 + }, + { + "epoch": 0.36335019673974145, + "grad_norm": 0.2646171748638153, + "learning_rate": 0.0002458558558558558, + "loss": 0.4837, + "step": 1212 + }, + { + "epoch": 0.3636499906314409, + "grad_norm": 0.24546460807323456, + "learning_rate": 0.00024581081081081075, + "loss": 0.4716, + "step": 1213 + }, + { + "epoch": 0.36394978452314036, + "grad_norm": 0.2416929006576538, + "learning_rate": 0.00024576576576576574, + "loss": 0.4634, + "step": 1214 + }, + { + "epoch": 0.3642495784148398, + "grad_norm": 0.2360236495733261, + "learning_rate": 0.0002457207207207207, + "loss": 0.4409, + "step": 1215 + }, + { + "epoch": 0.3645493723065393, + "grad_norm": 0.24383249878883362, + "learning_rate": 0.00024567567567567567, + "loss": 0.4553, + "step": 1216 + }, + { + "epoch": 0.3648491661982387, + "grad_norm": 0.2516370117664337, + "learning_rate": 0.0002456306306306306, + "loss": 0.4553, + "step": 1217 + }, + { + "epoch": 0.3651489600899382, + "grad_norm": 0.2524015009403229, + "learning_rate": 0.00024558558558558554, + "loss": 0.4766, + "step": 1218 + }, + { + "epoch": 0.3654487539816376, + "grad_norm": 0.23386207222938538, + "learning_rate": 0.00024554054054054053, + "loss": 0.439, + "step": 1219 + }, + { + "epoch": 0.3657485478733371, + "grad_norm": 0.23544320464134216, + "learning_rate": 0.00024549549549549547, + "loss": 0.4414, + "step": 1220 + }, + { + "epoch": 0.36604834176503653, + "grad_norm": 0.24809177219867706, + "learning_rate": 0.0002454504504504504, + "loss": 0.4577, + "step": 1221 + }, + { + "epoch": 0.366348135656736, + "grad_norm": 0.24466872215270996, + "learning_rate": 0.0002454054054054054, + "loss": 0.4609, + "step": 1222 + }, + { + "epoch": 0.36664792954843545, + "grad_norm": 0.24159879982471466, + "learning_rate": 0.00024536036036036034, + "loss": 0.448, + "step": 1223 + }, + { + "epoch": 0.36694772344013493, + "grad_norm": 0.2456122189760208, + "learning_rate": 0.00024531531531531527, + "loss": 0.4563, + "step": 1224 + }, + { + "epoch": 0.36724751733183436, + "grad_norm": 0.23266494274139404, + "learning_rate": 0.00024527027027027026, + "loss": 0.457, + "step": 1225 + }, + { + "epoch": 0.36754731122353385, + "grad_norm": 0.24822424352169037, + "learning_rate": 0.0002452252252252252, + "loss": 0.4577, + "step": 1226 + }, + { + "epoch": 0.3678471051152333, + "grad_norm": 0.2528662383556366, + "learning_rate": 0.00024518018018018014, + "loss": 0.4717, + "step": 1227 + }, + { + "epoch": 0.36814689900693276, + "grad_norm": 0.22255463898181915, + "learning_rate": 0.00024513513513513513, + "loss": 0.4287, + "step": 1228 + }, + { + "epoch": 0.3684466928986322, + "grad_norm": 0.23769044876098633, + "learning_rate": 0.00024509009009009006, + "loss": 0.4348, + "step": 1229 + }, + { + "epoch": 0.3687464867903317, + "grad_norm": 0.23163922131061554, + "learning_rate": 0.00024504504504504506, + "loss": 0.4631, + "step": 1230 + }, + { + "epoch": 0.3690462806820311, + "grad_norm": 0.22742880880832672, + "learning_rate": 0.000245, + "loss": 0.4344, + "step": 1231 + }, + { + "epoch": 0.3693460745737306, + "grad_norm": 0.2314460575580597, + "learning_rate": 0.00024495495495495493, + "loss": 0.4507, + "step": 1232 + }, + { + "epoch": 0.36964586846543, + "grad_norm": 0.21651825308799744, + "learning_rate": 0.0002449099099099099, + "loss": 0.422, + "step": 1233 + }, + { + "epoch": 0.3699456623571295, + "grad_norm": 0.24304571747779846, + "learning_rate": 0.00024486486486486486, + "loss": 0.4322, + "step": 1234 + }, + { + "epoch": 0.37024545624882893, + "grad_norm": 0.24105508625507355, + "learning_rate": 0.0002448198198198198, + "loss": 0.4367, + "step": 1235 + }, + { + "epoch": 0.37054525014052836, + "grad_norm": 0.2495047152042389, + "learning_rate": 0.0002447747747747748, + "loss": 0.456, + "step": 1236 + }, + { + "epoch": 0.37084504403222784, + "grad_norm": 0.2545395791530609, + "learning_rate": 0.0002447297297297297, + "loss": 0.4608, + "step": 1237 + }, + { + "epoch": 0.3711448379239273, + "grad_norm": 0.2552499771118164, + "learning_rate": 0.00024468468468468466, + "loss": 0.4898, + "step": 1238 + }, + { + "epoch": 0.37144463181562676, + "grad_norm": 0.2469811737537384, + "learning_rate": 0.00024463963963963965, + "loss": 0.4427, + "step": 1239 + }, + { + "epoch": 0.3717444257073262, + "grad_norm": 0.23857302963733673, + "learning_rate": 0.0002445945945945946, + "loss": 0.4509, + "step": 1240 + }, + { + "epoch": 0.37204421959902567, + "grad_norm": 0.2521422803401947, + "learning_rate": 0.0002445495495495495, + "loss": 0.4425, + "step": 1241 + }, + { + "epoch": 0.3723440134907251, + "grad_norm": 0.24907280504703522, + "learning_rate": 0.0002445045045045045, + "loss": 0.4542, + "step": 1242 + }, + { + "epoch": 0.3726438073824246, + "grad_norm": 0.23783591389656067, + "learning_rate": 0.00024445945945945945, + "loss": 0.4831, + "step": 1243 + }, + { + "epoch": 0.372943601274124, + "grad_norm": 0.2376372069120407, + "learning_rate": 0.0002444144144144144, + "loss": 0.4514, + "step": 1244 + }, + { + "epoch": 0.3732433951658235, + "grad_norm": 0.2387792468070984, + "learning_rate": 0.0002443693693693694, + "loss": 0.4593, + "step": 1245 + }, + { + "epoch": 0.37354318905752293, + "grad_norm": 0.22432541847229004, + "learning_rate": 0.0002443243243243243, + "loss": 0.4462, + "step": 1246 + }, + { + "epoch": 0.3738429829492224, + "grad_norm": 0.24190527200698853, + "learning_rate": 0.00024427927927927925, + "loss": 0.4645, + "step": 1247 + }, + { + "epoch": 0.37414277684092184, + "grad_norm": 0.23738646507263184, + "learning_rate": 0.00024423423423423424, + "loss": 0.4594, + "step": 1248 + }, + { + "epoch": 0.3744425707326213, + "grad_norm": 0.24582220613956451, + "learning_rate": 0.0002441891891891892, + "loss": 0.4632, + "step": 1249 + }, + { + "epoch": 0.37474236462432076, + "grad_norm": 0.22717328369617462, + "learning_rate": 0.0002441441441441441, + "loss": 0.4372, + "step": 1250 + }, + { + "epoch": 0.37504215851602024, + "grad_norm": 0.24414947628974915, + "learning_rate": 0.0002440990990990991, + "loss": 0.4458, + "step": 1251 + }, + { + "epoch": 0.37534195240771967, + "grad_norm": 0.23710165917873383, + "learning_rate": 0.00024405405405405404, + "loss": 0.4468, + "step": 1252 + }, + { + "epoch": 0.37564174629941915, + "grad_norm": 0.25462841987609863, + "learning_rate": 0.00024400900900900898, + "loss": 0.4616, + "step": 1253 + }, + { + "epoch": 0.3759415401911186, + "grad_norm": 0.22636806964874268, + "learning_rate": 0.00024396396396396392, + "loss": 0.4259, + "step": 1254 + }, + { + "epoch": 0.37624133408281807, + "grad_norm": 0.24978788197040558, + "learning_rate": 0.0002439189189189189, + "loss": 0.464, + "step": 1255 + }, + { + "epoch": 0.3765411279745175, + "grad_norm": 0.2312556803226471, + "learning_rate": 0.00024387387387387384, + "loss": 0.4531, + "step": 1256 + }, + { + "epoch": 0.376840921866217, + "grad_norm": 0.22814960777759552, + "learning_rate": 0.00024382882882882878, + "loss": 0.4608, + "step": 1257 + }, + { + "epoch": 0.3771407157579164, + "grad_norm": 0.25135213136672974, + "learning_rate": 0.00024378378378378377, + "loss": 0.4551, + "step": 1258 + }, + { + "epoch": 0.3774405096496159, + "grad_norm": 0.2209721952676773, + "learning_rate": 0.0002437387387387387, + "loss": 0.4679, + "step": 1259 + }, + { + "epoch": 0.3777403035413153, + "grad_norm": 0.2256690412759781, + "learning_rate": 0.00024369369369369365, + "loss": 0.4388, + "step": 1260 + }, + { + "epoch": 0.3780400974330148, + "grad_norm": 0.23604658246040344, + "learning_rate": 0.00024364864864864864, + "loss": 0.4393, + "step": 1261 + }, + { + "epoch": 0.37833989132471424, + "grad_norm": 0.22875599563121796, + "learning_rate": 0.00024360360360360357, + "loss": 0.4176, + "step": 1262 + }, + { + "epoch": 0.3786396852164137, + "grad_norm": 0.2428806722164154, + "learning_rate": 0.00024355855855855854, + "loss": 0.461, + "step": 1263 + }, + { + "epoch": 0.37893947910811315, + "grad_norm": 0.2470446228981018, + "learning_rate": 0.0002435135135135135, + "loss": 0.462, + "step": 1264 + }, + { + "epoch": 0.37923927299981264, + "grad_norm": 0.22954460978507996, + "learning_rate": 0.00024346846846846844, + "loss": 0.4529, + "step": 1265 + }, + { + "epoch": 0.37953906689151207, + "grad_norm": 0.23748372495174408, + "learning_rate": 0.0002434234234234234, + "loss": 0.4598, + "step": 1266 + }, + { + "epoch": 0.37983886078321155, + "grad_norm": 0.26300883293151855, + "learning_rate": 0.00024337837837837837, + "loss": 0.482, + "step": 1267 + }, + { + "epoch": 0.380138654674911, + "grad_norm": 0.2546245753765106, + "learning_rate": 0.0002433333333333333, + "loss": 0.4607, + "step": 1268 + }, + { + "epoch": 0.38043844856661047, + "grad_norm": 0.24974289536476135, + "learning_rate": 0.00024328828828828827, + "loss": 0.4713, + "step": 1269 + }, + { + "epoch": 0.3807382424583099, + "grad_norm": 0.2457670271396637, + "learning_rate": 0.00024324324324324323, + "loss": 0.4709, + "step": 1270 + }, + { + "epoch": 0.3810380363500094, + "grad_norm": 0.2360873520374298, + "learning_rate": 0.00024319819819819817, + "loss": 0.4653, + "step": 1271 + }, + { + "epoch": 0.3813378302417088, + "grad_norm": 0.24448256194591522, + "learning_rate": 0.00024315315315315313, + "loss": 0.4651, + "step": 1272 + }, + { + "epoch": 0.3816376241334083, + "grad_norm": 0.22578337788581848, + "learning_rate": 0.0002431081081081081, + "loss": 0.4163, + "step": 1273 + }, + { + "epoch": 0.3819374180251077, + "grad_norm": 0.24973514676094055, + "learning_rate": 0.00024306306306306306, + "loss": 0.4638, + "step": 1274 + }, + { + "epoch": 0.3822372119168072, + "grad_norm": 0.21938931941986084, + "learning_rate": 0.000243018018018018, + "loss": 0.4325, + "step": 1275 + }, + { + "epoch": 0.38253700580850664, + "grad_norm": 0.23947425186634064, + "learning_rate": 0.00024297297297297296, + "loss": 0.4408, + "step": 1276 + }, + { + "epoch": 0.3828367997002061, + "grad_norm": 0.23008406162261963, + "learning_rate": 0.00024292792792792792, + "loss": 0.4377, + "step": 1277 + }, + { + "epoch": 0.38313659359190555, + "grad_norm": 0.24068063497543335, + "learning_rate": 0.00024288288288288286, + "loss": 0.4606, + "step": 1278 + }, + { + "epoch": 0.38343638748360503, + "grad_norm": 0.2432139664888382, + "learning_rate": 0.0002428378378378378, + "loss": 0.4494, + "step": 1279 + }, + { + "epoch": 0.38373618137530446, + "grad_norm": 0.22731392085552216, + "learning_rate": 0.0002427927927927928, + "loss": 0.426, + "step": 1280 + }, + { + "epoch": 0.38403597526700395, + "grad_norm": 0.2352358102798462, + "learning_rate": 0.00024274774774774772, + "loss": 0.4537, + "step": 1281 + }, + { + "epoch": 0.3843357691587034, + "grad_norm": 0.23868781328201294, + "learning_rate": 0.00024270270270270266, + "loss": 0.4641, + "step": 1282 + }, + { + "epoch": 0.38463556305040286, + "grad_norm": 0.23498302698135376, + "learning_rate": 0.00024265765765765765, + "loss": 0.4563, + "step": 1283 + }, + { + "epoch": 0.3849353569421023, + "grad_norm": 0.24769316613674164, + "learning_rate": 0.0002426126126126126, + "loss": 0.4567, + "step": 1284 + }, + { + "epoch": 0.3852351508338018, + "grad_norm": 0.21658611297607422, + "learning_rate": 0.00024256756756756753, + "loss": 0.4437, + "step": 1285 + }, + { + "epoch": 0.3855349447255012, + "grad_norm": 0.2677985727787018, + "learning_rate": 0.00024252252252252252, + "loss": 0.45, + "step": 1286 + }, + { + "epoch": 0.3858347386172007, + "grad_norm": 0.23147153854370117, + "learning_rate": 0.00024247747747747745, + "loss": 0.4341, + "step": 1287 + }, + { + "epoch": 0.3861345325089001, + "grad_norm": 0.2465144395828247, + "learning_rate": 0.0002424324324324324, + "loss": 0.4629, + "step": 1288 + }, + { + "epoch": 0.3864343264005996, + "grad_norm": 0.23633845150470734, + "learning_rate": 0.00024238738738738738, + "loss": 0.4484, + "step": 1289 + }, + { + "epoch": 0.38673412029229903, + "grad_norm": 0.22743773460388184, + "learning_rate": 0.00024234234234234232, + "loss": 0.4451, + "step": 1290 + }, + { + "epoch": 0.3870339141839985, + "grad_norm": 0.233259379863739, + "learning_rate": 0.00024229729729729726, + "loss": 0.4546, + "step": 1291 + }, + { + "epoch": 0.38733370807569795, + "grad_norm": 0.24213840067386627, + "learning_rate": 0.00024225225225225225, + "loss": 0.4378, + "step": 1292 + }, + { + "epoch": 0.38763350196739743, + "grad_norm": 0.23812246322631836, + "learning_rate": 0.00024220720720720718, + "loss": 0.4613, + "step": 1293 + }, + { + "epoch": 0.38793329585909686, + "grad_norm": 0.25436896085739136, + "learning_rate": 0.00024216216216216212, + "loss": 0.4462, + "step": 1294 + }, + { + "epoch": 0.38823308975079635, + "grad_norm": 0.24321508407592773, + "learning_rate": 0.0002421171171171171, + "loss": 0.4674, + "step": 1295 + }, + { + "epoch": 0.3885328836424958, + "grad_norm": 0.2434927523136139, + "learning_rate": 0.00024207207207207205, + "loss": 0.4317, + "step": 1296 + }, + { + "epoch": 0.38883267753419526, + "grad_norm": 0.2564734220504761, + "learning_rate": 0.000242027027027027, + "loss": 0.4556, + "step": 1297 + }, + { + "epoch": 0.3891324714258947, + "grad_norm": 0.26102596521377563, + "learning_rate": 0.00024198198198198198, + "loss": 0.4866, + "step": 1298 + }, + { + "epoch": 0.3894322653175942, + "grad_norm": 0.26838192343711853, + "learning_rate": 0.0002419369369369369, + "loss": 0.4824, + "step": 1299 + }, + { + "epoch": 0.3897320592092936, + "grad_norm": 0.24552933871746063, + "learning_rate": 0.00024189189189189188, + "loss": 0.4581, + "step": 1300 + }, + { + "epoch": 0.3900318531009931, + "grad_norm": 0.2453169822692871, + "learning_rate": 0.0002418468468468468, + "loss": 0.4605, + "step": 1301 + }, + { + "epoch": 0.3903316469926925, + "grad_norm": 0.24922487139701843, + "learning_rate": 0.00024180180180180178, + "loss": 0.4555, + "step": 1302 + }, + { + "epoch": 0.390631440884392, + "grad_norm": 0.2683355212211609, + "learning_rate": 0.00024175675675675674, + "loss": 0.4674, + "step": 1303 + }, + { + "epoch": 0.39093123477609143, + "grad_norm": 0.23587100207805634, + "learning_rate": 0.00024171171171171168, + "loss": 0.4564, + "step": 1304 + }, + { + "epoch": 0.3912310286677909, + "grad_norm": 0.2331109195947647, + "learning_rate": 0.00024166666666666664, + "loss": 0.4499, + "step": 1305 + }, + { + "epoch": 0.39153082255949034, + "grad_norm": 0.24744129180908203, + "learning_rate": 0.0002416216216216216, + "loss": 0.453, + "step": 1306 + }, + { + "epoch": 0.39183061645118983, + "grad_norm": 0.228163942694664, + "learning_rate": 0.00024157657657657654, + "loss": 0.4219, + "step": 1307 + }, + { + "epoch": 0.39213041034288926, + "grad_norm": 0.2409953773021698, + "learning_rate": 0.00024153153153153153, + "loss": 0.4322, + "step": 1308 + }, + { + "epoch": 0.39243020423458874, + "grad_norm": 0.22519168257713318, + "learning_rate": 0.00024148648648648647, + "loss": 0.4177, + "step": 1309 + }, + { + "epoch": 0.39272999812628817, + "grad_norm": 0.2466001957654953, + "learning_rate": 0.0002414414414414414, + "loss": 0.4747, + "step": 1310 + }, + { + "epoch": 0.39302979201798766, + "grad_norm": 0.247292622923851, + "learning_rate": 0.0002413963963963964, + "loss": 0.4663, + "step": 1311 + }, + { + "epoch": 0.3933295859096871, + "grad_norm": 0.23905499279499054, + "learning_rate": 0.00024135135135135133, + "loss": 0.4631, + "step": 1312 + }, + { + "epoch": 0.39362937980138657, + "grad_norm": 0.27826932072639465, + "learning_rate": 0.00024130630630630627, + "loss": 0.4863, + "step": 1313 + }, + { + "epoch": 0.393929173693086, + "grad_norm": 0.3269807994365692, + "learning_rate": 0.00024126126126126126, + "loss": 0.4751, + "step": 1314 + }, + { + "epoch": 0.3942289675847855, + "grad_norm": 0.2593337595462799, + "learning_rate": 0.0002412162162162162, + "loss": 0.4754, + "step": 1315 + }, + { + "epoch": 0.3945287614764849, + "grad_norm": 0.24511882662773132, + "learning_rate": 0.00024117117117117114, + "loss": 0.4704, + "step": 1316 + }, + { + "epoch": 0.3948285553681844, + "grad_norm": 0.2439701408147812, + "learning_rate": 0.00024112612612612613, + "loss": 0.4367, + "step": 1317 + }, + { + "epoch": 0.3951283492598838, + "grad_norm": 0.24651208519935608, + "learning_rate": 0.00024108108108108106, + "loss": 0.4635, + "step": 1318 + }, + { + "epoch": 0.3954281431515833, + "grad_norm": 0.23849599063396454, + "learning_rate": 0.000241036036036036, + "loss": 0.4377, + "step": 1319 + }, + { + "epoch": 0.39572793704328274, + "grad_norm": 0.23733258247375488, + "learning_rate": 0.000240990990990991, + "loss": 0.4401, + "step": 1320 + }, + { + "epoch": 0.3960277309349822, + "grad_norm": 0.2643173336982727, + "learning_rate": 0.00024094594594594593, + "loss": 0.4572, + "step": 1321 + }, + { + "epoch": 0.39632752482668165, + "grad_norm": 0.23576226830482483, + "learning_rate": 0.00024090090090090086, + "loss": 0.4693, + "step": 1322 + }, + { + "epoch": 0.39662731871838114, + "grad_norm": 0.2418884038925171, + "learning_rate": 0.00024085585585585586, + "loss": 0.4453, + "step": 1323 + }, + { + "epoch": 0.39692711261008057, + "grad_norm": 0.23336432874202728, + "learning_rate": 0.0002408108108108108, + "loss": 0.4663, + "step": 1324 + }, + { + "epoch": 0.39722690650178005, + "grad_norm": 0.2603462040424347, + "learning_rate": 0.00024076576576576573, + "loss": 0.4643, + "step": 1325 + }, + { + "epoch": 0.3975267003934795, + "grad_norm": 0.24288874864578247, + "learning_rate": 0.0002407207207207207, + "loss": 0.4442, + "step": 1326 + }, + { + "epoch": 0.39782649428517897, + "grad_norm": 0.24161702394485474, + "learning_rate": 0.00024067567567567566, + "loss": 0.4773, + "step": 1327 + }, + { + "epoch": 0.3981262881768784, + "grad_norm": 0.23539233207702637, + "learning_rate": 0.0002406306306306306, + "loss": 0.4416, + "step": 1328 + }, + { + "epoch": 0.3984260820685779, + "grad_norm": 0.26161664724349976, + "learning_rate": 0.00024058558558558556, + "loss": 0.4672, + "step": 1329 + }, + { + "epoch": 0.3987258759602773, + "grad_norm": 0.24159055948257446, + "learning_rate": 0.00024054054054054052, + "loss": 0.4563, + "step": 1330 + }, + { + "epoch": 0.39902566985197674, + "grad_norm": 0.24944022297859192, + "learning_rate": 0.00024049549549549548, + "loss": 0.4554, + "step": 1331 + }, + { + "epoch": 0.3993254637436762, + "grad_norm": 0.24242587387561798, + "learning_rate": 0.00024045045045045042, + "loss": 0.4343, + "step": 1332 + }, + { + "epoch": 0.39962525763537565, + "grad_norm": 0.2406960427761078, + "learning_rate": 0.00024040540540540539, + "loss": 0.4469, + "step": 1333 + }, + { + "epoch": 0.39992505152707514, + "grad_norm": 0.24025918543338776, + "learning_rate": 0.00024036036036036035, + "loss": 0.4056, + "step": 1334 + }, + { + "epoch": 0.40022484541877457, + "grad_norm": 0.267782062292099, + "learning_rate": 0.00024031531531531529, + "loss": 0.4319, + "step": 1335 + }, + { + "epoch": 0.40052463931047405, + "grad_norm": 0.2527433931827545, + "learning_rate": 0.00024027027027027025, + "loss": 0.4679, + "step": 1336 + }, + { + "epoch": 0.4008244332021735, + "grad_norm": 0.2520921528339386, + "learning_rate": 0.00024022522522522521, + "loss": 0.4802, + "step": 1337 + }, + { + "epoch": 0.40112422709387296, + "grad_norm": 0.24161456525325775, + "learning_rate": 0.00024018018018018015, + "loss": 0.4415, + "step": 1338 + }, + { + "epoch": 0.4014240209855724, + "grad_norm": 0.24513588845729828, + "learning_rate": 0.00024013513513513511, + "loss": 0.445, + "step": 1339 + }, + { + "epoch": 0.4017238148772719, + "grad_norm": 0.24400116503238678, + "learning_rate": 0.00024009009009009008, + "loss": 0.4283, + "step": 1340 + }, + { + "epoch": 0.4020236087689713, + "grad_norm": 0.24796655774116516, + "learning_rate": 0.00024004504504504502, + "loss": 0.4434, + "step": 1341 + }, + { + "epoch": 0.4023234026606708, + "grad_norm": 0.2471655309200287, + "learning_rate": 0.00023999999999999998, + "loss": 0.4378, + "step": 1342 + }, + { + "epoch": 0.4026231965523702, + "grad_norm": 0.2507822811603546, + "learning_rate": 0.00023995495495495494, + "loss": 0.4623, + "step": 1343 + }, + { + "epoch": 0.4029229904440697, + "grad_norm": 0.24304543435573578, + "learning_rate": 0.00023990990990990988, + "loss": 0.4355, + "step": 1344 + }, + { + "epoch": 0.40322278433576914, + "grad_norm": 0.240738183259964, + "learning_rate": 0.00023986486486486487, + "loss": 0.4454, + "step": 1345 + }, + { + "epoch": 0.4035225782274686, + "grad_norm": 0.2353314906358719, + "learning_rate": 0.0002398198198198198, + "loss": 0.4493, + "step": 1346 + }, + { + "epoch": 0.40382237211916805, + "grad_norm": 0.24467633664608002, + "learning_rate": 0.00023977477477477474, + "loss": 0.4771, + "step": 1347 + }, + { + "epoch": 0.40412216601086753, + "grad_norm": 0.22876618802547455, + "learning_rate": 0.00023972972972972974, + "loss": 0.4311, + "step": 1348 + }, + { + "epoch": 0.40442195990256696, + "grad_norm": 0.24602358043193817, + "learning_rate": 0.00023968468468468467, + "loss": 0.4676, + "step": 1349 + }, + { + "epoch": 0.40472175379426645, + "grad_norm": 0.2410927563905716, + "learning_rate": 0.0002396396396396396, + "loss": 0.4564, + "step": 1350 + }, + { + "epoch": 0.4050215476859659, + "grad_norm": 0.22765080630779266, + "learning_rate": 0.00023959459459459455, + "loss": 0.447, + "step": 1351 + }, + { + "epoch": 0.40532134157766536, + "grad_norm": 0.24429920315742493, + "learning_rate": 0.00023954954954954954, + "loss": 0.4734, + "step": 1352 + }, + { + "epoch": 0.4056211354693648, + "grad_norm": 0.2417088747024536, + "learning_rate": 0.00023950450450450447, + "loss": 0.4465, + "step": 1353 + }, + { + "epoch": 0.4059209293610643, + "grad_norm": 0.25090181827545166, + "learning_rate": 0.00023945945945945944, + "loss": 0.4754, + "step": 1354 + }, + { + "epoch": 0.4062207232527637, + "grad_norm": 0.255610853433609, + "learning_rate": 0.0002394144144144144, + "loss": 0.4891, + "step": 1355 + }, + { + "epoch": 0.4065205171444632, + "grad_norm": 0.22734206914901733, + "learning_rate": 0.00023936936936936934, + "loss": 0.4279, + "step": 1356 + }, + { + "epoch": 0.4068203110361626, + "grad_norm": 0.24400131404399872, + "learning_rate": 0.0002393243243243243, + "loss": 0.4493, + "step": 1357 + }, + { + "epoch": 0.4071201049278621, + "grad_norm": 0.2442186176776886, + "learning_rate": 0.00023927927927927927, + "loss": 0.4347, + "step": 1358 + }, + { + "epoch": 0.40741989881956153, + "grad_norm": 0.2278696894645691, + "learning_rate": 0.0002392342342342342, + "loss": 0.4346, + "step": 1359 + }, + { + "epoch": 0.407719692711261, + "grad_norm": 0.23576432466506958, + "learning_rate": 0.00023918918918918917, + "loss": 0.4279, + "step": 1360 + }, + { + "epoch": 0.40801948660296045, + "grad_norm": 0.25753775238990784, + "learning_rate": 0.00023914414414414413, + "loss": 0.465, + "step": 1361 + }, + { + "epoch": 0.40831928049465993, + "grad_norm": 0.232134148478508, + "learning_rate": 0.00023909909909909907, + "loss": 0.4467, + "step": 1362 + }, + { + "epoch": 0.40861907438635936, + "grad_norm": 0.25058963894844055, + "learning_rate": 0.00023905405405405403, + "loss": 0.4496, + "step": 1363 + }, + { + "epoch": 0.40891886827805884, + "grad_norm": 0.24595490097999573, + "learning_rate": 0.000239009009009009, + "loss": 0.4625, + "step": 1364 + }, + { + "epoch": 0.4092186621697583, + "grad_norm": 0.2425229847431183, + "learning_rate": 0.00023896396396396393, + "loss": 0.4757, + "step": 1365 + }, + { + "epoch": 0.40951845606145776, + "grad_norm": 0.26115790009498596, + "learning_rate": 0.0002389189189189189, + "loss": 0.4619, + "step": 1366 + }, + { + "epoch": 0.4098182499531572, + "grad_norm": 0.22355914115905762, + "learning_rate": 0.00023887387387387386, + "loss": 0.4234, + "step": 1367 + }, + { + "epoch": 0.4101180438448567, + "grad_norm": 0.22979210317134857, + "learning_rate": 0.00023882882882882882, + "loss": 0.4472, + "step": 1368 + }, + { + "epoch": 0.4104178377365561, + "grad_norm": 0.2713291645050049, + "learning_rate": 0.00023878378378378376, + "loss": 0.4819, + "step": 1369 + }, + { + "epoch": 0.4107176316282556, + "grad_norm": 0.22952406108379364, + "learning_rate": 0.00023873873873873872, + "loss": 0.4428, + "step": 1370 + }, + { + "epoch": 0.411017425519955, + "grad_norm": 0.24338556826114655, + "learning_rate": 0.0002386936936936937, + "loss": 0.4703, + "step": 1371 + }, + { + "epoch": 0.4113172194116545, + "grad_norm": 0.24610304832458496, + "learning_rate": 0.00023864864864864862, + "loss": 0.4583, + "step": 1372 + }, + { + "epoch": 0.41161701330335393, + "grad_norm": 0.23116329312324524, + "learning_rate": 0.0002386036036036036, + "loss": 0.4234, + "step": 1373 + }, + { + "epoch": 0.4119168071950534, + "grad_norm": 0.24718856811523438, + "learning_rate": 0.00023855855855855855, + "loss": 0.4507, + "step": 1374 + }, + { + "epoch": 0.41221660108675284, + "grad_norm": 0.25558093190193176, + "learning_rate": 0.0002385135135135135, + "loss": 0.4609, + "step": 1375 + }, + { + "epoch": 0.4125163949784523, + "grad_norm": 0.2391168475151062, + "learning_rate": 0.00023846846846846843, + "loss": 0.4334, + "step": 1376 + }, + { + "epoch": 0.41281618887015176, + "grad_norm": 0.242578387260437, + "learning_rate": 0.00023842342342342342, + "loss": 0.472, + "step": 1377 + }, + { + "epoch": 0.41311598276185124, + "grad_norm": 0.24465034902095795, + "learning_rate": 0.00023837837837837835, + "loss": 0.4584, + "step": 1378 + }, + { + "epoch": 0.41341577665355067, + "grad_norm": 0.2500922679901123, + "learning_rate": 0.0002383333333333333, + "loss": 0.468, + "step": 1379 + }, + { + "epoch": 0.41371557054525016, + "grad_norm": 0.23939989507198334, + "learning_rate": 0.00023828828828828828, + "loss": 0.443, + "step": 1380 + }, + { + "epoch": 0.4140153644369496, + "grad_norm": 0.272876113653183, + "learning_rate": 0.00023824324324324322, + "loss": 0.4665, + "step": 1381 + }, + { + "epoch": 0.41431515832864907, + "grad_norm": 0.2664034068584442, + "learning_rate": 0.00023819819819819815, + "loss": 0.4855, + "step": 1382 + }, + { + "epoch": 0.4146149522203485, + "grad_norm": 0.23809301853179932, + "learning_rate": 0.00023815315315315315, + "loss": 0.4483, + "step": 1383 + }, + { + "epoch": 0.414914746112048, + "grad_norm": 0.23019112646579742, + "learning_rate": 0.00023810810810810808, + "loss": 0.4324, + "step": 1384 + }, + { + "epoch": 0.4152145400037474, + "grad_norm": 0.25093144178390503, + "learning_rate": 0.00023806306306306302, + "loss": 0.4732, + "step": 1385 + }, + { + "epoch": 0.4155143338954469, + "grad_norm": 0.23062798380851746, + "learning_rate": 0.000238018018018018, + "loss": 0.4489, + "step": 1386 + }, + { + "epoch": 0.4158141277871463, + "grad_norm": 0.23424816131591797, + "learning_rate": 0.00023797297297297295, + "loss": 0.4586, + "step": 1387 + }, + { + "epoch": 0.4161139216788458, + "grad_norm": 0.23515811562538147, + "learning_rate": 0.0002379279279279279, + "loss": 0.4478, + "step": 1388 + }, + { + "epoch": 0.41641371557054524, + "grad_norm": 0.23988017439842224, + "learning_rate": 0.00023788288288288287, + "loss": 0.4382, + "step": 1389 + }, + { + "epoch": 0.4167135094622447, + "grad_norm": 0.23148085176944733, + "learning_rate": 0.0002378378378378378, + "loss": 0.4476, + "step": 1390 + }, + { + "epoch": 0.41701330335394415, + "grad_norm": 0.22274036705493927, + "learning_rate": 0.00023779279279279278, + "loss": 0.4356, + "step": 1391 + }, + { + "epoch": 0.41731309724564364, + "grad_norm": 0.22446554899215698, + "learning_rate": 0.00023774774774774774, + "loss": 0.438, + "step": 1392 + }, + { + "epoch": 0.41761289113734307, + "grad_norm": 0.2218816876411438, + "learning_rate": 0.00023770270270270268, + "loss": 0.4137, + "step": 1393 + }, + { + "epoch": 0.41791268502904255, + "grad_norm": 0.23347264528274536, + "learning_rate": 0.00023765765765765764, + "loss": 0.4361, + "step": 1394 + }, + { + "epoch": 0.418212478920742, + "grad_norm": 0.26775455474853516, + "learning_rate": 0.0002376126126126126, + "loss": 0.5153, + "step": 1395 + }, + { + "epoch": 0.41851227281244147, + "grad_norm": 0.2503002882003784, + "learning_rate": 0.00023756756756756754, + "loss": 0.4705, + "step": 1396 + }, + { + "epoch": 0.4188120667041409, + "grad_norm": 0.2259148210287094, + "learning_rate": 0.0002375225225225225, + "loss": 0.4251, + "step": 1397 + }, + { + "epoch": 0.4191118605958404, + "grad_norm": 0.24358846247196198, + "learning_rate": 0.00023747747747747744, + "loss": 0.4483, + "step": 1398 + }, + { + "epoch": 0.4194116544875398, + "grad_norm": 0.24675339460372925, + "learning_rate": 0.0002374324324324324, + "loss": 0.4379, + "step": 1399 + }, + { + "epoch": 0.4197114483792393, + "grad_norm": 0.24181531369686127, + "learning_rate": 0.00023738738738738737, + "loss": 0.4498, + "step": 1400 + }, + { + "epoch": 0.4200112422709387, + "grad_norm": 0.24917852878570557, + "learning_rate": 0.0002373423423423423, + "loss": 0.4436, + "step": 1401 + }, + { + "epoch": 0.4203110361626382, + "grad_norm": 0.24392594397068024, + "learning_rate": 0.0002372972972972973, + "loss": 0.4791, + "step": 1402 + }, + { + "epoch": 0.42061083005433764, + "grad_norm": 0.2411157488822937, + "learning_rate": 0.00023725225225225223, + "loss": 0.472, + "step": 1403 + }, + { + "epoch": 0.4209106239460371, + "grad_norm": 0.2558772563934326, + "learning_rate": 0.00023720720720720717, + "loss": 0.4914, + "step": 1404 + }, + { + "epoch": 0.42121041783773655, + "grad_norm": 0.2580840587615967, + "learning_rate": 0.00023716216216216216, + "loss": 0.4502, + "step": 1405 + }, + { + "epoch": 0.42151021172943604, + "grad_norm": 0.23793622851371765, + "learning_rate": 0.0002371171171171171, + "loss": 0.4599, + "step": 1406 + }, + { + "epoch": 0.42181000562113546, + "grad_norm": 0.23233507573604584, + "learning_rate": 0.00023707207207207203, + "loss": 0.4268, + "step": 1407 + }, + { + "epoch": 0.42210979951283495, + "grad_norm": 0.24053210020065308, + "learning_rate": 0.00023702702702702703, + "loss": 0.4233, + "step": 1408 + }, + { + "epoch": 0.4224095934045344, + "grad_norm": 0.2314370572566986, + "learning_rate": 0.00023698198198198196, + "loss": 0.427, + "step": 1409 + }, + { + "epoch": 0.42270938729623386, + "grad_norm": 0.2242741882801056, + "learning_rate": 0.0002369369369369369, + "loss": 0.4097, + "step": 1410 + }, + { + "epoch": 0.4230091811879333, + "grad_norm": 0.24178822338581085, + "learning_rate": 0.0002368918918918919, + "loss": 0.4259, + "step": 1411 + }, + { + "epoch": 0.4233089750796328, + "grad_norm": 0.23346510529518127, + "learning_rate": 0.00023684684684684683, + "loss": 0.4442, + "step": 1412 + }, + { + "epoch": 0.4236087689713322, + "grad_norm": 0.23908735811710358, + "learning_rate": 0.00023680180180180176, + "loss": 0.432, + "step": 1413 + }, + { + "epoch": 0.4239085628630317, + "grad_norm": 0.24085824191570282, + "learning_rate": 0.00023675675675675675, + "loss": 0.4648, + "step": 1414 + }, + { + "epoch": 0.4242083567547311, + "grad_norm": 0.2325652688741684, + "learning_rate": 0.0002367117117117117, + "loss": 0.4499, + "step": 1415 + }, + { + "epoch": 0.4245081506464306, + "grad_norm": 0.2413867712020874, + "learning_rate": 0.00023666666666666663, + "loss": 0.4407, + "step": 1416 + }, + { + "epoch": 0.42480794453813003, + "grad_norm": 0.2494458556175232, + "learning_rate": 0.00023662162162162162, + "loss": 0.4828, + "step": 1417 + }, + { + "epoch": 0.4251077384298295, + "grad_norm": 0.24031957983970642, + "learning_rate": 0.00023657657657657656, + "loss": 0.4453, + "step": 1418 + }, + { + "epoch": 0.42540753232152895, + "grad_norm": 0.2490081638097763, + "learning_rate": 0.0002365315315315315, + "loss": 0.4595, + "step": 1419 + }, + { + "epoch": 0.42570732621322843, + "grad_norm": 0.2513922452926636, + "learning_rate": 0.00023648648648648648, + "loss": 0.4507, + "step": 1420 + }, + { + "epoch": 0.42600712010492786, + "grad_norm": 0.23692888021469116, + "learning_rate": 0.00023644144144144142, + "loss": 0.4339, + "step": 1421 + }, + { + "epoch": 0.42630691399662735, + "grad_norm": 0.22867028415203094, + "learning_rate": 0.00023639639639639636, + "loss": 0.437, + "step": 1422 + }, + { + "epoch": 0.4266067078883268, + "grad_norm": 0.23194801807403564, + "learning_rate": 0.00023635135135135132, + "loss": 0.4558, + "step": 1423 + }, + { + "epoch": 0.42690650178002626, + "grad_norm": 0.23193570971488953, + "learning_rate": 0.00023630630630630628, + "loss": 0.4446, + "step": 1424 + }, + { + "epoch": 0.4272062956717257, + "grad_norm": 0.2337258905172348, + "learning_rate": 0.00023626126126126125, + "loss": 0.4337, + "step": 1425 + }, + { + "epoch": 0.4275060895634251, + "grad_norm": 0.23658838868141174, + "learning_rate": 0.00023621621621621619, + "loss": 0.4288, + "step": 1426 + }, + { + "epoch": 0.4278058834551246, + "grad_norm": 0.26249489188194275, + "learning_rate": 0.00023617117117117115, + "loss": 0.4673, + "step": 1427 + }, + { + "epoch": 0.42810567734682403, + "grad_norm": 0.2465837299823761, + "learning_rate": 0.0002361261261261261, + "loss": 0.4357, + "step": 1428 + }, + { + "epoch": 0.4284054712385235, + "grad_norm": 0.26788344979286194, + "learning_rate": 0.00023608108108108105, + "loss": 0.4465, + "step": 1429 + }, + { + "epoch": 0.42870526513022295, + "grad_norm": 0.2597041726112366, + "learning_rate": 0.00023603603603603601, + "loss": 0.4433, + "step": 1430 + }, + { + "epoch": 0.42900505902192243, + "grad_norm": 0.235942080616951, + "learning_rate": 0.00023599099099099098, + "loss": 0.4322, + "step": 1431 + }, + { + "epoch": 0.42930485291362186, + "grad_norm": 0.25687772035598755, + "learning_rate": 0.00023594594594594591, + "loss": 0.478, + "step": 1432 + }, + { + "epoch": 0.42960464680532134, + "grad_norm": 0.2209557294845581, + "learning_rate": 0.00023590090090090088, + "loss": 0.4254, + "step": 1433 + }, + { + "epoch": 0.4299044406970208, + "grad_norm": 0.2533595860004425, + "learning_rate": 0.00023585585585585584, + "loss": 0.4545, + "step": 1434 + }, + { + "epoch": 0.43020423458872026, + "grad_norm": 0.2461264431476593, + "learning_rate": 0.00023581081081081078, + "loss": 0.4642, + "step": 1435 + }, + { + "epoch": 0.4305040284804197, + "grad_norm": 0.23638790845870972, + "learning_rate": 0.00023576576576576577, + "loss": 0.4403, + "step": 1436 + }, + { + "epoch": 0.43080382237211917, + "grad_norm": 0.23931315541267395, + "learning_rate": 0.0002357207207207207, + "loss": 0.4356, + "step": 1437 + }, + { + "epoch": 0.4311036162638186, + "grad_norm": 0.26020053029060364, + "learning_rate": 0.00023567567567567564, + "loss": 0.4562, + "step": 1438 + }, + { + "epoch": 0.4314034101555181, + "grad_norm": 0.25653496384620667, + "learning_rate": 0.00023563063063063063, + "loss": 0.473, + "step": 1439 + }, + { + "epoch": 0.4317032040472175, + "grad_norm": 0.23115600645542145, + "learning_rate": 0.00023558558558558557, + "loss": 0.4259, + "step": 1440 + }, + { + "epoch": 0.432002997938917, + "grad_norm": 0.240982785820961, + "learning_rate": 0.0002355405405405405, + "loss": 0.4507, + "step": 1441 + }, + { + "epoch": 0.43230279183061643, + "grad_norm": 0.2819494307041168, + "learning_rate": 0.0002354954954954955, + "loss": 0.4539, + "step": 1442 + }, + { + "epoch": 0.4326025857223159, + "grad_norm": 0.24356286227703094, + "learning_rate": 0.00023545045045045044, + "loss": 0.4377, + "step": 1443 + }, + { + "epoch": 0.43290237961401534, + "grad_norm": 0.23919035494327545, + "learning_rate": 0.00023540540540540537, + "loss": 0.4141, + "step": 1444 + }, + { + "epoch": 0.4332021735057148, + "grad_norm": 0.27333101630210876, + "learning_rate": 0.00023536036036036036, + "loss": 0.4655, + "step": 1445 + }, + { + "epoch": 0.43350196739741426, + "grad_norm": 0.24128217995166779, + "learning_rate": 0.0002353153153153153, + "loss": 0.4483, + "step": 1446 + }, + { + "epoch": 0.43380176128911374, + "grad_norm": 0.2448810636997223, + "learning_rate": 0.00023527027027027024, + "loss": 0.4376, + "step": 1447 + }, + { + "epoch": 0.43410155518081317, + "grad_norm": 0.24084526300430298, + "learning_rate": 0.0002352252252252252, + "loss": 0.4228, + "step": 1448 + }, + { + "epoch": 0.43440134907251265, + "grad_norm": 0.2719487249851227, + "learning_rate": 0.00023518018018018016, + "loss": 0.4564, + "step": 1449 + }, + { + "epoch": 0.4347011429642121, + "grad_norm": 0.23745928704738617, + "learning_rate": 0.0002351351351351351, + "loss": 0.4283, + "step": 1450 + }, + { + "epoch": 0.43500093685591157, + "grad_norm": 0.22788289189338684, + "learning_rate": 0.00023509009009009007, + "loss": 0.4215, + "step": 1451 + }, + { + "epoch": 0.435300730747611, + "grad_norm": 0.2702344059944153, + "learning_rate": 0.00023504504504504503, + "loss": 0.4425, + "step": 1452 + }, + { + "epoch": 0.4356005246393105, + "grad_norm": 0.2503633499145508, + "learning_rate": 0.00023499999999999997, + "loss": 0.449, + "step": 1453 + }, + { + "epoch": 0.4359003185310099, + "grad_norm": 0.2610470950603485, + "learning_rate": 0.00023495495495495493, + "loss": 0.4708, + "step": 1454 + }, + { + "epoch": 0.4362001124227094, + "grad_norm": 0.2375205010175705, + "learning_rate": 0.0002349099099099099, + "loss": 0.4426, + "step": 1455 + }, + { + "epoch": 0.4364999063144088, + "grad_norm": 0.2531331777572632, + "learning_rate": 0.00023486486486486483, + "loss": 0.4516, + "step": 1456 + }, + { + "epoch": 0.4367997002061083, + "grad_norm": 0.23722784221172333, + "learning_rate": 0.0002348198198198198, + "loss": 0.4345, + "step": 1457 + }, + { + "epoch": 0.43709949409780774, + "grad_norm": 0.2411564141511917, + "learning_rate": 0.00023477477477477476, + "loss": 0.4227, + "step": 1458 + }, + { + "epoch": 0.4373992879895072, + "grad_norm": 0.22307904064655304, + "learning_rate": 0.00023472972972972972, + "loss": 0.4254, + "step": 1459 + }, + { + "epoch": 0.43769908188120665, + "grad_norm": 0.273569256067276, + "learning_rate": 0.00023468468468468466, + "loss": 0.4384, + "step": 1460 + }, + { + "epoch": 0.43799887577290614, + "grad_norm": 0.24101589620113373, + "learning_rate": 0.00023463963963963962, + "loss": 0.4694, + "step": 1461 + }, + { + "epoch": 0.43829866966460557, + "grad_norm": 0.2409631907939911, + "learning_rate": 0.0002345945945945946, + "loss": 0.4515, + "step": 1462 + }, + { + "epoch": 0.43859846355630505, + "grad_norm": 0.24057404696941376, + "learning_rate": 0.00023454954954954952, + "loss": 0.4516, + "step": 1463 + }, + { + "epoch": 0.4388982574480045, + "grad_norm": 0.24539843201637268, + "learning_rate": 0.0002345045045045045, + "loss": 0.4078, + "step": 1464 + }, + { + "epoch": 0.43919805133970397, + "grad_norm": 0.23763391375541687, + "learning_rate": 0.00023445945945945945, + "loss": 0.4538, + "step": 1465 + }, + { + "epoch": 0.4394978452314034, + "grad_norm": 0.25087833404541016, + "learning_rate": 0.0002344144144144144, + "loss": 0.4474, + "step": 1466 + }, + { + "epoch": 0.4397976391231029, + "grad_norm": 0.24220441281795502, + "learning_rate": 0.00023436936936936935, + "loss": 0.4527, + "step": 1467 + }, + { + "epoch": 0.4400974330148023, + "grad_norm": 0.23056988418102264, + "learning_rate": 0.00023432432432432432, + "loss": 0.4375, + "step": 1468 + }, + { + "epoch": 0.4403972269065018, + "grad_norm": 0.23940956592559814, + "learning_rate": 0.00023427927927927925, + "loss": 0.4561, + "step": 1469 + }, + { + "epoch": 0.4406970207982012, + "grad_norm": 0.23279373347759247, + "learning_rate": 0.0002342342342342342, + "loss": 0.4342, + "step": 1470 + }, + { + "epoch": 0.4409968146899007, + "grad_norm": 0.23729127645492554, + "learning_rate": 0.00023418918918918918, + "loss": 0.4196, + "step": 1471 + }, + { + "epoch": 0.44129660858160014, + "grad_norm": 0.2296978086233139, + "learning_rate": 0.00023414414414414412, + "loss": 0.4308, + "step": 1472 + }, + { + "epoch": 0.4415964024732996, + "grad_norm": 0.24595269560813904, + "learning_rate": 0.00023409909909909905, + "loss": 0.4607, + "step": 1473 + }, + { + "epoch": 0.44189619636499905, + "grad_norm": 0.2266230583190918, + "learning_rate": 0.00023405405405405404, + "loss": 0.4233, + "step": 1474 + }, + { + "epoch": 0.44219599025669853, + "grad_norm": 0.23812752962112427, + "learning_rate": 0.00023400900900900898, + "loss": 0.4341, + "step": 1475 + }, + { + "epoch": 0.44249578414839796, + "grad_norm": 0.2498784214258194, + "learning_rate": 0.00023396396396396392, + "loss": 0.4406, + "step": 1476 + }, + { + "epoch": 0.44279557804009745, + "grad_norm": 0.26205873489379883, + "learning_rate": 0.0002339189189189189, + "loss": 0.463, + "step": 1477 + }, + { + "epoch": 0.4430953719317969, + "grad_norm": 0.22054405510425568, + "learning_rate": 0.00023387387387387385, + "loss": 0.4142, + "step": 1478 + }, + { + "epoch": 0.44339516582349636, + "grad_norm": 0.2611534893512726, + "learning_rate": 0.00023382882882882878, + "loss": 0.4985, + "step": 1479 + }, + { + "epoch": 0.4436949597151958, + "grad_norm": 0.2515600323677063, + "learning_rate": 0.00023378378378378377, + "loss": 0.4567, + "step": 1480 + }, + { + "epoch": 0.4439947536068953, + "grad_norm": 0.23842091858386993, + "learning_rate": 0.0002337387387387387, + "loss": 0.4301, + "step": 1481 + }, + { + "epoch": 0.4442945474985947, + "grad_norm": 0.21718434989452362, + "learning_rate": 0.00023369369369369367, + "loss": 0.4444, + "step": 1482 + }, + { + "epoch": 0.4445943413902942, + "grad_norm": 0.2340681105852127, + "learning_rate": 0.00023364864864864864, + "loss": 0.4555, + "step": 1483 + }, + { + "epoch": 0.4448941352819936, + "grad_norm": 0.24483737349510193, + "learning_rate": 0.00023360360360360357, + "loss": 0.4419, + "step": 1484 + }, + { + "epoch": 0.4451939291736931, + "grad_norm": 0.24512210488319397, + "learning_rate": 0.00023355855855855854, + "loss": 0.427, + "step": 1485 + }, + { + "epoch": 0.44549372306539253, + "grad_norm": 0.21723505854606628, + "learning_rate": 0.0002335135135135135, + "loss": 0.4075, + "step": 1486 + }, + { + "epoch": 0.445793516957092, + "grad_norm": 0.23696058988571167, + "learning_rate": 0.00023346846846846844, + "loss": 0.4332, + "step": 1487 + }, + { + "epoch": 0.44609331084879145, + "grad_norm": 0.23997806012630463, + "learning_rate": 0.0002334234234234234, + "loss": 0.4438, + "step": 1488 + }, + { + "epoch": 0.44639310474049093, + "grad_norm": 0.2866729497909546, + "learning_rate": 0.00023337837837837837, + "loss": 0.4851, + "step": 1489 + }, + { + "epoch": 0.44669289863219036, + "grad_norm": 0.22988809645175934, + "learning_rate": 0.0002333333333333333, + "loss": 0.4185, + "step": 1490 + }, + { + "epoch": 0.44699269252388985, + "grad_norm": 0.24402973055839539, + "learning_rate": 0.00023328828828828827, + "loss": 0.4277, + "step": 1491 + }, + { + "epoch": 0.4472924864155893, + "grad_norm": 0.25479069352149963, + "learning_rate": 0.00023324324324324323, + "loss": 0.4593, + "step": 1492 + }, + { + "epoch": 0.44759228030728876, + "grad_norm": 0.27010777592658997, + "learning_rate": 0.0002331981981981982, + "loss": 0.4391, + "step": 1493 + }, + { + "epoch": 0.4478920741989882, + "grad_norm": 0.2443162202835083, + "learning_rate": 0.00023315315315315313, + "loss": 0.4485, + "step": 1494 + }, + { + "epoch": 0.4481918680906877, + "grad_norm": 0.23816895484924316, + "learning_rate": 0.00023310810810810807, + "loss": 0.4606, + "step": 1495 + }, + { + "epoch": 0.4484916619823871, + "grad_norm": 0.25230711698532104, + "learning_rate": 0.00023306306306306306, + "loss": 0.4479, + "step": 1496 + }, + { + "epoch": 0.4487914558740866, + "grad_norm": 0.23312939703464508, + "learning_rate": 0.000233018018018018, + "loss": 0.4157, + "step": 1497 + }, + { + "epoch": 0.449091249765786, + "grad_norm": 0.2355630099773407, + "learning_rate": 0.00023297297297297293, + "loss": 0.4428, + "step": 1498 + }, + { + "epoch": 0.4493910436574855, + "grad_norm": 0.21646787226200104, + "learning_rate": 0.00023292792792792792, + "loss": 0.4166, + "step": 1499 + }, + { + "epoch": 0.44969083754918493, + "grad_norm": 0.2547577917575836, + "learning_rate": 0.00023288288288288286, + "loss": 0.4517, + "step": 1500 + }, + { + "epoch": 0.44969083754918493, + "eval_loss": 0.4507916569709778, + "eval_runtime": 565.1356, + "eval_samples_per_second": 3.82, + "eval_steps_per_second": 0.478, + "step": 1500 + }, + { + "epoch": 0.4499906314408844, + "grad_norm": 0.22076524794101715, + "learning_rate": 0.0002328378378378378, + "loss": 0.4395, + "step": 1501 + }, + { + "epoch": 0.45029042533258384, + "grad_norm": 0.23838326334953308, + "learning_rate": 0.0002327927927927928, + "loss": 0.456, + "step": 1502 + }, + { + "epoch": 0.45059021922428333, + "grad_norm": 0.23766019940376282, + "learning_rate": 0.00023274774774774773, + "loss": 0.4506, + "step": 1503 + }, + { + "epoch": 0.45089001311598276, + "grad_norm": 0.2391175776720047, + "learning_rate": 0.00023270270270270266, + "loss": 0.4484, + "step": 1504 + }, + { + "epoch": 0.45118980700768224, + "grad_norm": 0.24999289214611053, + "learning_rate": 0.00023265765765765765, + "loss": 0.4524, + "step": 1505 + }, + { + "epoch": 0.45148960089938167, + "grad_norm": 0.24920374155044556, + "learning_rate": 0.0002326126126126126, + "loss": 0.4626, + "step": 1506 + }, + { + "epoch": 0.45178939479108116, + "grad_norm": 0.24506935477256775, + "learning_rate": 0.00023256756756756753, + "loss": 0.438, + "step": 1507 + }, + { + "epoch": 0.4520891886827806, + "grad_norm": 0.2410869002342224, + "learning_rate": 0.00023252252252252252, + "loss": 0.4579, + "step": 1508 + }, + { + "epoch": 0.45238898257448007, + "grad_norm": 0.2394392192363739, + "learning_rate": 0.00023247747747747745, + "loss": 0.4701, + "step": 1509 + }, + { + "epoch": 0.4526887764661795, + "grad_norm": 0.24809177219867706, + "learning_rate": 0.0002324324324324324, + "loss": 0.4521, + "step": 1510 + }, + { + "epoch": 0.452988570357879, + "grad_norm": 0.24093541502952576, + "learning_rate": 0.00023238738738738738, + "loss": 0.4364, + "step": 1511 + }, + { + "epoch": 0.4532883642495784, + "grad_norm": 0.24750453233718872, + "learning_rate": 0.00023234234234234232, + "loss": 0.454, + "step": 1512 + }, + { + "epoch": 0.4535881581412779, + "grad_norm": 0.24669384956359863, + "learning_rate": 0.00023229729729729726, + "loss": 0.4687, + "step": 1513 + }, + { + "epoch": 0.4538879520329773, + "grad_norm": 0.258184015750885, + "learning_rate": 0.00023225225225225225, + "loss": 0.4741, + "step": 1514 + }, + { + "epoch": 0.4541877459246768, + "grad_norm": 0.23264341056346893, + "learning_rate": 0.00023220720720720718, + "loss": 0.4255, + "step": 1515 + }, + { + "epoch": 0.45448753981637624, + "grad_norm": 0.24050508439540863, + "learning_rate": 0.00023216216216216215, + "loss": 0.4465, + "step": 1516 + }, + { + "epoch": 0.4547873337080757, + "grad_norm": 0.23079554736614227, + "learning_rate": 0.0002321171171171171, + "loss": 0.4232, + "step": 1517 + }, + { + "epoch": 0.45508712759977515, + "grad_norm": 0.22280898690223694, + "learning_rate": 0.00023207207207207205, + "loss": 0.4233, + "step": 1518 + }, + { + "epoch": 0.45538692149147464, + "grad_norm": 0.24419550597667694, + "learning_rate": 0.000232027027027027, + "loss": 0.453, + "step": 1519 + }, + { + "epoch": 0.45568671538317407, + "grad_norm": 0.2578713595867157, + "learning_rate": 0.00023198198198198195, + "loss": 0.4701, + "step": 1520 + }, + { + "epoch": 0.4559865092748735, + "grad_norm": 0.24617789685726166, + "learning_rate": 0.0002319369369369369, + "loss": 0.4744, + "step": 1521 + }, + { + "epoch": 0.456286303166573, + "grad_norm": 0.2564181387424469, + "learning_rate": 0.00023189189189189188, + "loss": 0.4672, + "step": 1522 + }, + { + "epoch": 0.4565860970582724, + "grad_norm": 0.23741687834262848, + "learning_rate": 0.0002318468468468468, + "loss": 0.4337, + "step": 1523 + }, + { + "epoch": 0.4568858909499719, + "grad_norm": 0.2671225070953369, + "learning_rate": 0.00023180180180180178, + "loss": 0.4723, + "step": 1524 + }, + { + "epoch": 0.4571856848416713, + "grad_norm": 0.2585636377334595, + "learning_rate": 0.00023175675675675674, + "loss": 0.4585, + "step": 1525 + }, + { + "epoch": 0.4574854787333708, + "grad_norm": 0.25808751583099365, + "learning_rate": 0.00023171171171171168, + "loss": 0.4667, + "step": 1526 + }, + { + "epoch": 0.45778527262507024, + "grad_norm": 0.25702106952667236, + "learning_rate": 0.00023166666666666667, + "loss": 0.4218, + "step": 1527 + }, + { + "epoch": 0.4580850665167697, + "grad_norm": 0.2685486972332001, + "learning_rate": 0.0002316216216216216, + "loss": 0.4765, + "step": 1528 + }, + { + "epoch": 0.45838486040846915, + "grad_norm": 0.25075605511665344, + "learning_rate": 0.00023157657657657654, + "loss": 0.4795, + "step": 1529 + }, + { + "epoch": 0.45868465430016864, + "grad_norm": 0.25849252939224243, + "learning_rate": 0.00023153153153153153, + "loss": 0.4362, + "step": 1530 + }, + { + "epoch": 0.45898444819186807, + "grad_norm": 0.25761592388153076, + "learning_rate": 0.00023148648648648647, + "loss": 0.4791, + "step": 1531 + }, + { + "epoch": 0.45928424208356755, + "grad_norm": 0.228532075881958, + "learning_rate": 0.0002314414414414414, + "loss": 0.4258, + "step": 1532 + }, + { + "epoch": 0.459584035975267, + "grad_norm": 0.24463020265102386, + "learning_rate": 0.0002313963963963964, + "loss": 0.4199, + "step": 1533 + }, + { + "epoch": 0.45988382986696646, + "grad_norm": 0.26668593287467957, + "learning_rate": 0.00023135135135135133, + "loss": 0.4686, + "step": 1534 + }, + { + "epoch": 0.4601836237586659, + "grad_norm": 0.24953673779964447, + "learning_rate": 0.00023130630630630627, + "loss": 0.4433, + "step": 1535 + }, + { + "epoch": 0.4604834176503654, + "grad_norm": 0.2565534710884094, + "learning_rate": 0.00023126126126126126, + "loss": 0.4638, + "step": 1536 + }, + { + "epoch": 0.4607832115420648, + "grad_norm": 0.241172194480896, + "learning_rate": 0.0002312162162162162, + "loss": 0.4065, + "step": 1537 + }, + { + "epoch": 0.4610830054337643, + "grad_norm": 0.2695203125476837, + "learning_rate": 0.00023117117117117114, + "loss": 0.4782, + "step": 1538 + }, + { + "epoch": 0.4613827993254637, + "grad_norm": 0.25559231638908386, + "learning_rate": 0.00023112612612612613, + "loss": 0.425, + "step": 1539 + }, + { + "epoch": 0.4616825932171632, + "grad_norm": 0.2544387876987457, + "learning_rate": 0.00023108108108108106, + "loss": 0.451, + "step": 1540 + }, + { + "epoch": 0.46198238710886264, + "grad_norm": 0.27124300599098206, + "learning_rate": 0.000231036036036036, + "loss": 0.4492, + "step": 1541 + }, + { + "epoch": 0.4622821810005621, + "grad_norm": 0.2581422030925751, + "learning_rate": 0.000230990990990991, + "loss": 0.4317, + "step": 1542 + }, + { + "epoch": 0.46258197489226155, + "grad_norm": 0.2657614052295685, + "learning_rate": 0.00023094594594594593, + "loss": 0.4308, + "step": 1543 + }, + { + "epoch": 0.46288176878396103, + "grad_norm": 0.25568437576293945, + "learning_rate": 0.00023090090090090087, + "loss": 0.4808, + "step": 1544 + }, + { + "epoch": 0.46318156267566046, + "grad_norm": 0.26639649271965027, + "learning_rate": 0.00023085585585585583, + "loss": 0.4808, + "step": 1545 + }, + { + "epoch": 0.46348135656735995, + "grad_norm": 0.23767255246639252, + "learning_rate": 0.0002308108108108108, + "loss": 0.4382, + "step": 1546 + }, + { + "epoch": 0.4637811504590594, + "grad_norm": 0.22875267267227173, + "learning_rate": 0.00023076576576576573, + "loss": 0.4132, + "step": 1547 + }, + { + "epoch": 0.46408094435075886, + "grad_norm": 0.27917149662971497, + "learning_rate": 0.0002307207207207207, + "loss": 0.4899, + "step": 1548 + }, + { + "epoch": 0.4643807382424583, + "grad_norm": 0.24044126272201538, + "learning_rate": 0.00023067567567567566, + "loss": 0.4481, + "step": 1549 + }, + { + "epoch": 0.4646805321341578, + "grad_norm": 0.2577783465385437, + "learning_rate": 0.00023063063063063062, + "loss": 0.4403, + "step": 1550 + }, + { + "epoch": 0.4649803260258572, + "grad_norm": 0.2589547038078308, + "learning_rate": 0.00023058558558558556, + "loss": 0.438, + "step": 1551 + }, + { + "epoch": 0.4652801199175567, + "grad_norm": 0.22958579659461975, + "learning_rate": 0.00023054054054054052, + "loss": 0.4374, + "step": 1552 + }, + { + "epoch": 0.4655799138092561, + "grad_norm": 0.28952687978744507, + "learning_rate": 0.00023049549549549549, + "loss": 0.4463, + "step": 1553 + }, + { + "epoch": 0.4658797077009556, + "grad_norm": 0.2680447995662689, + "learning_rate": 0.00023045045045045042, + "loss": 0.4758, + "step": 1554 + }, + { + "epoch": 0.46617950159265503, + "grad_norm": 0.23771092295646667, + "learning_rate": 0.00023040540540540539, + "loss": 0.4322, + "step": 1555 + }, + { + "epoch": 0.4664792954843545, + "grad_norm": 0.2691210210323334, + "learning_rate": 0.00023036036036036035, + "loss": 0.4739, + "step": 1556 + }, + { + "epoch": 0.46677908937605395, + "grad_norm": 0.24547810852527618, + "learning_rate": 0.0002303153153153153, + "loss": 0.4507, + "step": 1557 + }, + { + "epoch": 0.46707888326775343, + "grad_norm": 0.2669890224933624, + "learning_rate": 0.00023027027027027025, + "loss": 0.4809, + "step": 1558 + }, + { + "epoch": 0.46737867715945286, + "grad_norm": 0.25527825951576233, + "learning_rate": 0.00023022522522522521, + "loss": 0.4558, + "step": 1559 + }, + { + "epoch": 0.46767847105115234, + "grad_norm": 0.23491926491260529, + "learning_rate": 0.00023018018018018015, + "loss": 0.4594, + "step": 1560 + }, + { + "epoch": 0.4679782649428518, + "grad_norm": 0.27634891867637634, + "learning_rate": 0.00023013513513513514, + "loss": 0.4503, + "step": 1561 + }, + { + "epoch": 0.46827805883455126, + "grad_norm": 0.2656886577606201, + "learning_rate": 0.00023009009009009008, + "loss": 0.4674, + "step": 1562 + }, + { + "epoch": 0.4685778527262507, + "grad_norm": 0.23933476209640503, + "learning_rate": 0.00023004504504504502, + "loss": 0.4124, + "step": 1563 + }, + { + "epoch": 0.4688776466179502, + "grad_norm": 0.2596864700317383, + "learning_rate": 0.00023, + "loss": 0.4288, + "step": 1564 + }, + { + "epoch": 0.4691774405096496, + "grad_norm": 0.25186148285865784, + "learning_rate": 0.00022995495495495494, + "loss": 0.4571, + "step": 1565 + }, + { + "epoch": 0.4694772344013491, + "grad_norm": 0.28007790446281433, + "learning_rate": 0.00022990990990990988, + "loss": 0.4785, + "step": 1566 + }, + { + "epoch": 0.4697770282930485, + "grad_norm": 0.26225724816322327, + "learning_rate": 0.00022986486486486482, + "loss": 0.4378, + "step": 1567 + }, + { + "epoch": 0.470076822184748, + "grad_norm": 0.24554051458835602, + "learning_rate": 0.0002298198198198198, + "loss": 0.4655, + "step": 1568 + }, + { + "epoch": 0.47037661607644743, + "grad_norm": 0.24976593255996704, + "learning_rate": 0.00022977477477477475, + "loss": 0.4356, + "step": 1569 + }, + { + "epoch": 0.4706764099681469, + "grad_norm": 0.23914846777915955, + "learning_rate": 0.00022972972972972968, + "loss": 0.4239, + "step": 1570 + }, + { + "epoch": 0.47097620385984634, + "grad_norm": 0.24698884785175323, + "learning_rate": 0.00022968468468468467, + "loss": 0.4607, + "step": 1571 + }, + { + "epoch": 0.4712759977515458, + "grad_norm": 0.24240310490131378, + "learning_rate": 0.0002296396396396396, + "loss": 0.4218, + "step": 1572 + }, + { + "epoch": 0.47157579164324526, + "grad_norm": 0.24838680028915405, + "learning_rate": 0.00022959459459459457, + "loss": 0.4387, + "step": 1573 + }, + { + "epoch": 0.47187558553494474, + "grad_norm": 0.25536447763442993, + "learning_rate": 0.00022954954954954954, + "loss": 0.4529, + "step": 1574 + }, + { + "epoch": 0.47217537942664417, + "grad_norm": 0.24535490572452545, + "learning_rate": 0.00022950450450450447, + "loss": 0.4505, + "step": 1575 + }, + { + "epoch": 0.47247517331834366, + "grad_norm": 0.258878618478775, + "learning_rate": 0.00022945945945945944, + "loss": 0.4794, + "step": 1576 + }, + { + "epoch": 0.4727749672100431, + "grad_norm": 0.23862193524837494, + "learning_rate": 0.0002294144144144144, + "loss": 0.4555, + "step": 1577 + }, + { + "epoch": 0.47307476110174257, + "grad_norm": 0.2369290292263031, + "learning_rate": 0.00022936936936936934, + "loss": 0.4111, + "step": 1578 + }, + { + "epoch": 0.473374554993442, + "grad_norm": 0.2591108977794647, + "learning_rate": 0.0002293243243243243, + "loss": 0.4738, + "step": 1579 + }, + { + "epoch": 0.4736743488851415, + "grad_norm": 0.2639445662498474, + "learning_rate": 0.00022927927927927927, + "loss": 0.489, + "step": 1580 + }, + { + "epoch": 0.4739741427768409, + "grad_norm": 0.2452382892370224, + "learning_rate": 0.0002292342342342342, + "loss": 0.4499, + "step": 1581 + }, + { + "epoch": 0.4742739366685404, + "grad_norm": 0.24414241313934326, + "learning_rate": 0.00022918918918918917, + "loss": 0.4246, + "step": 1582 + }, + { + "epoch": 0.4745737305602398, + "grad_norm": 0.24609197676181793, + "learning_rate": 0.00022914414414414413, + "loss": 0.4615, + "step": 1583 + }, + { + "epoch": 0.4748735244519393, + "grad_norm": 0.2610466480255127, + "learning_rate": 0.0002290990990990991, + "loss": 0.4597, + "step": 1584 + }, + { + "epoch": 0.47517331834363874, + "grad_norm": 0.24946355819702148, + "learning_rate": 0.00022905405405405403, + "loss": 0.4382, + "step": 1585 + }, + { + "epoch": 0.4754731122353382, + "grad_norm": 0.24156548082828522, + "learning_rate": 0.000229009009009009, + "loss": 0.4421, + "step": 1586 + }, + { + "epoch": 0.47577290612703765, + "grad_norm": 0.2650264799594879, + "learning_rate": 0.00022896396396396396, + "loss": 0.4735, + "step": 1587 + }, + { + "epoch": 0.47607270001873714, + "grad_norm": 0.2677678167819977, + "learning_rate": 0.0002289189189189189, + "loss": 0.4581, + "step": 1588 + }, + { + "epoch": 0.47637249391043657, + "grad_norm": 0.2421169877052307, + "learning_rate": 0.00022887387387387386, + "loss": 0.4342, + "step": 1589 + }, + { + "epoch": 0.47667228780213605, + "grad_norm": 0.2284284085035324, + "learning_rate": 0.00022882882882882882, + "loss": 0.434, + "step": 1590 + }, + { + "epoch": 0.4769720816938355, + "grad_norm": 0.235052689909935, + "learning_rate": 0.00022878378378378376, + "loss": 0.4439, + "step": 1591 + }, + { + "epoch": 0.47727187558553497, + "grad_norm": 0.24947918951511383, + "learning_rate": 0.0002287387387387387, + "loss": 0.4704, + "step": 1592 + }, + { + "epoch": 0.4775716694772344, + "grad_norm": 0.24523784220218658, + "learning_rate": 0.0002286936936936937, + "loss": 0.4688, + "step": 1593 + }, + { + "epoch": 0.4778714633689339, + "grad_norm": 0.2427687793970108, + "learning_rate": 0.00022864864864864862, + "loss": 0.4145, + "step": 1594 + }, + { + "epoch": 0.4781712572606333, + "grad_norm": 0.2589262127876282, + "learning_rate": 0.00022860360360360356, + "loss": 0.4602, + "step": 1595 + }, + { + "epoch": 0.4784710511523328, + "grad_norm": 0.22775280475616455, + "learning_rate": 0.00022855855855855855, + "loss": 0.4118, + "step": 1596 + }, + { + "epoch": 0.4787708450440322, + "grad_norm": 0.26483890414237976, + "learning_rate": 0.0002285135135135135, + "loss": 0.4342, + "step": 1597 + }, + { + "epoch": 0.4790706389357317, + "grad_norm": 0.2529483735561371, + "learning_rate": 0.00022846846846846843, + "loss": 0.4474, + "step": 1598 + }, + { + "epoch": 0.47937043282743114, + "grad_norm": 0.24874089658260345, + "learning_rate": 0.00022842342342342342, + "loss": 0.4383, + "step": 1599 + }, + { + "epoch": 0.4796702267191306, + "grad_norm": 0.2583334445953369, + "learning_rate": 0.00022837837837837835, + "loss": 0.4406, + "step": 1600 + }, + { + "epoch": 0.47997002061083005, + "grad_norm": 0.2603740990161896, + "learning_rate": 0.0002283333333333333, + "loss": 0.4418, + "step": 1601 + }, + { + "epoch": 0.48026981450252954, + "grad_norm": 0.24915438890457153, + "learning_rate": 0.00022828828828828828, + "loss": 0.458, + "step": 1602 + }, + { + "epoch": 0.48056960839422896, + "grad_norm": 0.2531617283821106, + "learning_rate": 0.00022824324324324322, + "loss": 0.4639, + "step": 1603 + }, + { + "epoch": 0.48086940228592845, + "grad_norm": 0.23148907721042633, + "learning_rate": 0.00022819819819819816, + "loss": 0.4345, + "step": 1604 + }, + { + "epoch": 0.4811691961776279, + "grad_norm": 0.26466086506843567, + "learning_rate": 0.00022815315315315315, + "loss": 0.4625, + "step": 1605 + }, + { + "epoch": 0.48146899006932736, + "grad_norm": 0.2558290660381317, + "learning_rate": 0.00022810810810810808, + "loss": 0.4438, + "step": 1606 + }, + { + "epoch": 0.4817687839610268, + "grad_norm": 0.2294214814901352, + "learning_rate": 0.00022806306306306305, + "loss": 0.4343, + "step": 1607 + }, + { + "epoch": 0.4820685778527263, + "grad_norm": 0.25497883558273315, + "learning_rate": 0.000228018018018018, + "loss": 0.4498, + "step": 1608 + }, + { + "epoch": 0.4823683717444257, + "grad_norm": 0.25641798973083496, + "learning_rate": 0.00022797297297297295, + "loss": 0.4077, + "step": 1609 + }, + { + "epoch": 0.4826681656361252, + "grad_norm": 0.2556770145893097, + "learning_rate": 0.0002279279279279279, + "loss": 0.4125, + "step": 1610 + }, + { + "epoch": 0.4829679595278246, + "grad_norm": 0.2528950273990631, + "learning_rate": 0.00022788288288288288, + "loss": 0.4759, + "step": 1611 + }, + { + "epoch": 0.4832677534195241, + "grad_norm": 0.22939835488796234, + "learning_rate": 0.0002278378378378378, + "loss": 0.4422, + "step": 1612 + }, + { + "epoch": 0.48356754731122353, + "grad_norm": 0.24314181506633759, + "learning_rate": 0.00022779279279279278, + "loss": 0.4358, + "step": 1613 + }, + { + "epoch": 0.483867341202923, + "grad_norm": 0.2460578978061676, + "learning_rate": 0.00022774774774774774, + "loss": 0.4571, + "step": 1614 + }, + { + "epoch": 0.48416713509462245, + "grad_norm": 0.2627396583557129, + "learning_rate": 0.00022770270270270268, + "loss": 0.4616, + "step": 1615 + }, + { + "epoch": 0.4844669289863219, + "grad_norm": 0.23525434732437134, + "learning_rate": 0.00022765765765765764, + "loss": 0.4243, + "step": 1616 + }, + { + "epoch": 0.48476672287802136, + "grad_norm": 0.2397356927394867, + "learning_rate": 0.00022761261261261258, + "loss": 0.4382, + "step": 1617 + }, + { + "epoch": 0.4850665167697208, + "grad_norm": 0.2398831993341446, + "learning_rate": 0.00022756756756756757, + "loss": 0.4412, + "step": 1618 + }, + { + "epoch": 0.4853663106614203, + "grad_norm": 0.259376585483551, + "learning_rate": 0.0002275225225225225, + "loss": 0.4259, + "step": 1619 + }, + { + "epoch": 0.4856661045531197, + "grad_norm": 0.23204876482486725, + "learning_rate": 0.00022747747747747744, + "loss": 0.4068, + "step": 1620 + }, + { + "epoch": 0.4859658984448192, + "grad_norm": 0.2531450688838959, + "learning_rate": 0.00022743243243243243, + "loss": 0.4383, + "step": 1621 + }, + { + "epoch": 0.4862656923365186, + "grad_norm": 0.24866148829460144, + "learning_rate": 0.00022738738738738737, + "loss": 0.4856, + "step": 1622 + }, + { + "epoch": 0.4865654862282181, + "grad_norm": 0.23114702105522156, + "learning_rate": 0.0002273423423423423, + "loss": 0.4167, + "step": 1623 + }, + { + "epoch": 0.48686528011991753, + "grad_norm": 0.24857169389724731, + "learning_rate": 0.0002272972972972973, + "loss": 0.4678, + "step": 1624 + }, + { + "epoch": 0.487165074011617, + "grad_norm": 0.24516363441944122, + "learning_rate": 0.00022725225225225223, + "loss": 0.4236, + "step": 1625 + }, + { + "epoch": 0.48746486790331645, + "grad_norm": 0.2485557198524475, + "learning_rate": 0.00022720720720720717, + "loss": 0.4562, + "step": 1626 + }, + { + "epoch": 0.48776466179501593, + "grad_norm": 0.2518232762813568, + "learning_rate": 0.00022716216216216216, + "loss": 0.4703, + "step": 1627 + }, + { + "epoch": 0.48806445568671536, + "grad_norm": 0.23469193279743195, + "learning_rate": 0.0002271171171171171, + "loss": 0.4301, + "step": 1628 + }, + { + "epoch": 0.48836424957841484, + "grad_norm": 0.25275421142578125, + "learning_rate": 0.00022707207207207204, + "loss": 0.4576, + "step": 1629 + }, + { + "epoch": 0.4886640434701143, + "grad_norm": 0.25434768199920654, + "learning_rate": 0.00022702702702702703, + "loss": 0.4654, + "step": 1630 + }, + { + "epoch": 0.48896383736181376, + "grad_norm": 0.24483707547187805, + "learning_rate": 0.00022698198198198196, + "loss": 0.45, + "step": 1631 + }, + { + "epoch": 0.4892636312535132, + "grad_norm": 0.24300119280815125, + "learning_rate": 0.0002269369369369369, + "loss": 0.4795, + "step": 1632 + }, + { + "epoch": 0.48956342514521267, + "grad_norm": 0.23230652511119843, + "learning_rate": 0.0002268918918918919, + "loss": 0.4318, + "step": 1633 + }, + { + "epoch": 0.4898632190369121, + "grad_norm": 0.2618871331214905, + "learning_rate": 0.00022684684684684683, + "loss": 0.4846, + "step": 1634 + }, + { + "epoch": 0.4901630129286116, + "grad_norm": 0.2477121651172638, + "learning_rate": 0.00022680180180180176, + "loss": 0.4392, + "step": 1635 + }, + { + "epoch": 0.490462806820311, + "grad_norm": 0.2603763937950134, + "learning_rate": 0.00022675675675675676, + "loss": 0.4875, + "step": 1636 + }, + { + "epoch": 0.4907626007120105, + "grad_norm": 0.2470758855342865, + "learning_rate": 0.0002267117117117117, + "loss": 0.4575, + "step": 1637 + }, + { + "epoch": 0.49106239460370993, + "grad_norm": 0.2612755000591278, + "learning_rate": 0.00022666666666666663, + "loss": 0.4652, + "step": 1638 + }, + { + "epoch": 0.4913621884954094, + "grad_norm": 0.2424866259098053, + "learning_rate": 0.0002266216216216216, + "loss": 0.4346, + "step": 1639 + }, + { + "epoch": 0.49166198238710884, + "grad_norm": 0.2529725730419159, + "learning_rate": 0.00022657657657657656, + "loss": 0.4427, + "step": 1640 + }, + { + "epoch": 0.4919617762788083, + "grad_norm": 0.2564398944377899, + "learning_rate": 0.00022653153153153152, + "loss": 0.4641, + "step": 1641 + }, + { + "epoch": 0.49226157017050776, + "grad_norm": 0.2599097490310669, + "learning_rate": 0.00022648648648648646, + "loss": 0.4564, + "step": 1642 + }, + { + "epoch": 0.49256136406220724, + "grad_norm": 0.2355022430419922, + "learning_rate": 0.00022644144144144142, + "loss": 0.4313, + "step": 1643 + }, + { + "epoch": 0.49286115795390667, + "grad_norm": 0.24770522117614746, + "learning_rate": 0.00022639639639639638, + "loss": 0.4468, + "step": 1644 + }, + { + "epoch": 0.49316095184560615, + "grad_norm": 0.2558223009109497, + "learning_rate": 0.00022635135135135132, + "loss": 0.4506, + "step": 1645 + }, + { + "epoch": 0.4934607457373056, + "grad_norm": 0.24741050601005554, + "learning_rate": 0.00022630630630630629, + "loss": 0.4375, + "step": 1646 + }, + { + "epoch": 0.49376053962900507, + "grad_norm": 0.24249225854873657, + "learning_rate": 0.00022626126126126125, + "loss": 0.4283, + "step": 1647 + }, + { + "epoch": 0.4940603335207045, + "grad_norm": 0.2792401611804962, + "learning_rate": 0.00022621621621621619, + "loss": 0.4757, + "step": 1648 + }, + { + "epoch": 0.494360127412404, + "grad_norm": 0.23249030113220215, + "learning_rate": 0.00022617117117117115, + "loss": 0.4417, + "step": 1649 + }, + { + "epoch": 0.4946599213041034, + "grad_norm": 0.2646411955356598, + "learning_rate": 0.00022612612612612611, + "loss": 0.4338, + "step": 1650 + }, + { + "epoch": 0.4949597151958029, + "grad_norm": 0.2633950710296631, + "learning_rate": 0.00022608108108108105, + "loss": 0.476, + "step": 1651 + }, + { + "epoch": 0.4952595090875023, + "grad_norm": 0.24183906614780426, + "learning_rate": 0.00022603603603603601, + "loss": 0.4363, + "step": 1652 + }, + { + "epoch": 0.4955593029792018, + "grad_norm": 0.26413261890411377, + "learning_rate": 0.00022599099099099098, + "loss": 0.4387, + "step": 1653 + }, + { + "epoch": 0.49585909687090124, + "grad_norm": 0.2538875937461853, + "learning_rate": 0.00022594594594594592, + "loss": 0.4284, + "step": 1654 + }, + { + "epoch": 0.4961588907626007, + "grad_norm": 0.2566417157649994, + "learning_rate": 0.0002259009009009009, + "loss": 0.4352, + "step": 1655 + }, + { + "epoch": 0.49645868465430015, + "grad_norm": 0.2526787519454956, + "learning_rate": 0.00022585585585585584, + "loss": 0.4567, + "step": 1656 + }, + { + "epoch": 0.49675847854599964, + "grad_norm": 0.2506735324859619, + "learning_rate": 0.00022581081081081078, + "loss": 0.4346, + "step": 1657 + }, + { + "epoch": 0.49705827243769907, + "grad_norm": 0.25367823243141174, + "learning_rate": 0.00022576576576576577, + "loss": 0.4392, + "step": 1658 + }, + { + "epoch": 0.49735806632939855, + "grad_norm": 0.24888494610786438, + "learning_rate": 0.0002257207207207207, + "loss": 0.4532, + "step": 1659 + }, + { + "epoch": 0.497657860221098, + "grad_norm": 0.22846034169197083, + "learning_rate": 0.00022567567567567564, + "loss": 0.4163, + "step": 1660 + }, + { + "epoch": 0.49795765411279747, + "grad_norm": 0.23993121087551117, + "learning_rate": 0.00022563063063063064, + "loss": 0.4385, + "step": 1661 + }, + { + "epoch": 0.4982574480044969, + "grad_norm": 0.2544318437576294, + "learning_rate": 0.00022558558558558557, + "loss": 0.4058, + "step": 1662 + }, + { + "epoch": 0.4985572418961964, + "grad_norm": 0.2419573813676834, + "learning_rate": 0.0002255405405405405, + "loss": 0.4428, + "step": 1663 + }, + { + "epoch": 0.4988570357878958, + "grad_norm": 0.23838767409324646, + "learning_rate": 0.00022549549549549547, + "loss": 0.4293, + "step": 1664 + }, + { + "epoch": 0.4991568296795953, + "grad_norm": 0.23000121116638184, + "learning_rate": 0.00022545045045045044, + "loss": 0.436, + "step": 1665 + }, + { + "epoch": 0.4994566235712947, + "grad_norm": 0.2665446102619171, + "learning_rate": 0.00022540540540540537, + "loss": 0.4841, + "step": 1666 + }, + { + "epoch": 0.4997564174629942, + "grad_norm": 0.25025802850723267, + "learning_rate": 0.00022536036036036034, + "loss": 0.4357, + "step": 1667 + }, + { + "epoch": 0.5000562113546937, + "grad_norm": 0.2441510409116745, + "learning_rate": 0.0002253153153153153, + "loss": 0.4345, + "step": 1668 + }, + { + "epoch": 0.5003560052463931, + "grad_norm": 0.24143864214420319, + "learning_rate": 0.00022527027027027024, + "loss": 0.4416, + "step": 1669 + }, + { + "epoch": 0.5006557991380925, + "grad_norm": 0.246190145611763, + "learning_rate": 0.0002252252252252252, + "loss": 0.4124, + "step": 1670 + }, + { + "epoch": 0.500955593029792, + "grad_norm": 0.2695963382720947, + "learning_rate": 0.00022518018018018017, + "loss": 0.4683, + "step": 1671 + }, + { + "epoch": 0.5012553869214915, + "grad_norm": 0.23124708235263824, + "learning_rate": 0.0002251351351351351, + "loss": 0.4347, + "step": 1672 + }, + { + "epoch": 0.501555180813191, + "grad_norm": 0.25153648853302, + "learning_rate": 0.00022509009009009007, + "loss": 0.4408, + "step": 1673 + }, + { + "epoch": 0.5018549747048904, + "grad_norm": 0.2653743028640747, + "learning_rate": 0.00022504504504504503, + "loss": 0.4482, + "step": 1674 + }, + { + "epoch": 0.5021547685965898, + "grad_norm": 0.24365836381912231, + "learning_rate": 0.000225, + "loss": 0.4225, + "step": 1675 + }, + { + "epoch": 0.5024545624882893, + "grad_norm": 0.2595864236354828, + "learning_rate": 0.00022495495495495493, + "loss": 0.4624, + "step": 1676 + }, + { + "epoch": 0.5027543563799888, + "grad_norm": 0.24555698037147522, + "learning_rate": 0.0002249099099099099, + "loss": 0.4495, + "step": 1677 + }, + { + "epoch": 0.5030541502716882, + "grad_norm": 0.25951844453811646, + "learning_rate": 0.00022486486486486486, + "loss": 0.4503, + "step": 1678 + }, + { + "epoch": 0.5033539441633876, + "grad_norm": 0.28872594237327576, + "learning_rate": 0.0002248198198198198, + "loss": 0.4916, + "step": 1679 + }, + { + "epoch": 0.5036537380550872, + "grad_norm": 0.24827940762043, + "learning_rate": 0.00022477477477477476, + "loss": 0.4024, + "step": 1680 + }, + { + "epoch": 0.5039535319467866, + "grad_norm": 0.2625780999660492, + "learning_rate": 0.00022472972972972972, + "loss": 0.4624, + "step": 1681 + }, + { + "epoch": 0.504253325838486, + "grad_norm": 0.26651719212532043, + "learning_rate": 0.00022468468468468466, + "loss": 0.4526, + "step": 1682 + }, + { + "epoch": 0.5045531197301855, + "grad_norm": 0.2538798153400421, + "learning_rate": 0.00022463963963963962, + "loss": 0.4328, + "step": 1683 + }, + { + "epoch": 0.504852913621885, + "grad_norm": 0.25730088353157043, + "learning_rate": 0.0002245945945945946, + "loss": 0.4642, + "step": 1684 + }, + { + "epoch": 0.5051527075135844, + "grad_norm": 0.24593298137187958, + "learning_rate": 0.00022454954954954952, + "loss": 0.4422, + "step": 1685 + }, + { + "epoch": 0.5054525014052839, + "grad_norm": 0.25883376598358154, + "learning_rate": 0.0002245045045045045, + "loss": 0.4439, + "step": 1686 + }, + { + "epoch": 0.5057522952969833, + "grad_norm": 0.2680940330028534, + "learning_rate": 0.00022445945945945945, + "loss": 0.4459, + "step": 1687 + }, + { + "epoch": 0.5060520891886828, + "grad_norm": 0.23688143491744995, + "learning_rate": 0.0002244144144144144, + "loss": 0.4404, + "step": 1688 + }, + { + "epoch": 0.5063518830803823, + "grad_norm": 0.24362222850322723, + "learning_rate": 0.00022436936936936933, + "loss": 0.473, + "step": 1689 + }, + { + "epoch": 0.5066516769720817, + "grad_norm": 0.24516165256500244, + "learning_rate": 0.00022432432432432432, + "loss": 0.4288, + "step": 1690 + }, + { + "epoch": 0.5069514708637811, + "grad_norm": 0.23739798367023468, + "learning_rate": 0.00022427927927927925, + "loss": 0.4231, + "step": 1691 + }, + { + "epoch": 0.5072512647554807, + "grad_norm": 0.25751793384552, + "learning_rate": 0.0002242342342342342, + "loss": 0.4406, + "step": 1692 + }, + { + "epoch": 0.5075510586471801, + "grad_norm": 0.23062650859355927, + "learning_rate": 0.00022418918918918918, + "loss": 0.4273, + "step": 1693 + }, + { + "epoch": 0.5078508525388795, + "grad_norm": 0.2453629970550537, + "learning_rate": 0.00022414414414414412, + "loss": 0.4526, + "step": 1694 + }, + { + "epoch": 0.508150646430579, + "grad_norm": 0.26255685091018677, + "learning_rate": 0.00022409909909909905, + "loss": 0.461, + "step": 1695 + }, + { + "epoch": 0.5084504403222785, + "grad_norm": 0.2384280562400818, + "learning_rate": 0.00022405405405405405, + "loss": 0.4333, + "step": 1696 + }, + { + "epoch": 0.5087502342139779, + "grad_norm": 0.24055825173854828, + "learning_rate": 0.00022400900900900898, + "loss": 0.4472, + "step": 1697 + }, + { + "epoch": 0.5090500281056773, + "grad_norm": 0.24356570839881897, + "learning_rate": 0.00022396396396396395, + "loss": 0.4159, + "step": 1698 + }, + { + "epoch": 0.5093498219973768, + "grad_norm": 0.3042013645172119, + "learning_rate": 0.0002239189189189189, + "loss": 0.474, + "step": 1699 + }, + { + "epoch": 0.5096496158890763, + "grad_norm": 0.2500532567501068, + "learning_rate": 0.00022387387387387385, + "loss": 0.4223, + "step": 1700 + }, + { + "epoch": 0.5099494097807757, + "grad_norm": 0.25324761867523193, + "learning_rate": 0.0002238288288288288, + "loss": 0.452, + "step": 1701 + }, + { + "epoch": 0.5102492036724752, + "grad_norm": 0.26007258892059326, + "learning_rate": 0.00022378378378378377, + "loss": 0.4516, + "step": 1702 + }, + { + "epoch": 0.5105489975641746, + "grad_norm": 0.2657194435596466, + "learning_rate": 0.0002237387387387387, + "loss": 0.4663, + "step": 1703 + }, + { + "epoch": 0.5108487914558741, + "grad_norm": 0.28216373920440674, + "learning_rate": 0.00022369369369369368, + "loss": 0.4341, + "step": 1704 + }, + { + "epoch": 0.5111485853475736, + "grad_norm": 0.2571386992931366, + "learning_rate": 0.00022364864864864864, + "loss": 0.4532, + "step": 1705 + }, + { + "epoch": 0.511448379239273, + "grad_norm": 0.27189430594444275, + "learning_rate": 0.00022360360360360358, + "loss": 0.4593, + "step": 1706 + }, + { + "epoch": 0.5117481731309724, + "grad_norm": 0.2536429166793823, + "learning_rate": 0.00022355855855855854, + "loss": 0.4603, + "step": 1707 + }, + { + "epoch": 0.512047967022672, + "grad_norm": 0.2552615702152252, + "learning_rate": 0.0002235135135135135, + "loss": 0.4265, + "step": 1708 + }, + { + "epoch": 0.5123477609143714, + "grad_norm": 0.24926409125328064, + "learning_rate": 0.00022346846846846844, + "loss": 0.444, + "step": 1709 + }, + { + "epoch": 0.5126475548060708, + "grad_norm": 0.27449071407318115, + "learning_rate": 0.0002234234234234234, + "loss": 0.4605, + "step": 1710 + }, + { + "epoch": 0.5129473486977703, + "grad_norm": 0.255071222782135, + "learning_rate": 0.00022337837837837837, + "loss": 0.4409, + "step": 1711 + }, + { + "epoch": 0.5132471425894697, + "grad_norm": 0.2560432553291321, + "learning_rate": 0.00022333333333333333, + "loss": 0.467, + "step": 1712 + }, + { + "epoch": 0.5135469364811692, + "grad_norm": 0.2458151876926422, + "learning_rate": 0.00022328828828828827, + "loss": 0.4626, + "step": 1713 + }, + { + "epoch": 0.5138467303728687, + "grad_norm": 0.2361784130334854, + "learning_rate": 0.0002232432432432432, + "loss": 0.4305, + "step": 1714 + }, + { + "epoch": 0.5141465242645681, + "grad_norm": 0.25834766030311584, + "learning_rate": 0.0002231981981981982, + "loss": 0.4569, + "step": 1715 + }, + { + "epoch": 0.5144463181562675, + "grad_norm": 0.253662109375, + "learning_rate": 0.00022315315315315313, + "loss": 0.4289, + "step": 1716 + }, + { + "epoch": 0.514746112047967, + "grad_norm": 0.2442713975906372, + "learning_rate": 0.00022310810810810807, + "loss": 0.4356, + "step": 1717 + }, + { + "epoch": 0.5150459059396665, + "grad_norm": 0.25813472270965576, + "learning_rate": 0.00022306306306306306, + "loss": 0.4721, + "step": 1718 + }, + { + "epoch": 0.5153456998313659, + "grad_norm": 0.269927054643631, + "learning_rate": 0.000223018018018018, + "loss": 0.4603, + "step": 1719 + }, + { + "epoch": 0.5156454937230653, + "grad_norm": 0.25108182430267334, + "learning_rate": 0.00022297297297297293, + "loss": 0.4611, + "step": 1720 + }, + { + "epoch": 0.5159452876147649, + "grad_norm": 0.23553571105003357, + "learning_rate": 0.00022292792792792793, + "loss": 0.431, + "step": 1721 + }, + { + "epoch": 0.5162450815064643, + "grad_norm": 0.2411264032125473, + "learning_rate": 0.00022288288288288286, + "loss": 0.4405, + "step": 1722 + }, + { + "epoch": 0.5165448753981637, + "grad_norm": 0.24999505281448364, + "learning_rate": 0.0002228378378378378, + "loss": 0.4487, + "step": 1723 + }, + { + "epoch": 0.5168446692898632, + "grad_norm": 0.23619996011257172, + "learning_rate": 0.0002227927927927928, + "loss": 0.4399, + "step": 1724 + }, + { + "epoch": 0.5171444631815627, + "grad_norm": 0.23623579740524292, + "learning_rate": 0.00022274774774774773, + "loss": 0.4358, + "step": 1725 + }, + { + "epoch": 0.5174442570732621, + "grad_norm": 0.2538294494152069, + "learning_rate": 0.00022270270270270266, + "loss": 0.4715, + "step": 1726 + }, + { + "epoch": 0.5177440509649616, + "grad_norm": 0.23572376370429993, + "learning_rate": 0.00022265765765765765, + "loss": 0.4529, + "step": 1727 + }, + { + "epoch": 0.518043844856661, + "grad_norm": 0.25421515107154846, + "learning_rate": 0.0002226126126126126, + "loss": 0.4427, + "step": 1728 + }, + { + "epoch": 0.5183436387483605, + "grad_norm": 0.23936650156974792, + "learning_rate": 0.00022256756756756753, + "loss": 0.4416, + "step": 1729 + }, + { + "epoch": 0.51864343264006, + "grad_norm": 0.26791784167289734, + "learning_rate": 0.00022252252252252252, + "loss": 0.4697, + "step": 1730 + }, + { + "epoch": 0.5189432265317594, + "grad_norm": 0.2541804015636444, + "learning_rate": 0.00022247747747747746, + "loss": 0.4801, + "step": 1731 + }, + { + "epoch": 0.5192430204234588, + "grad_norm": 0.24337895214557648, + "learning_rate": 0.0002224324324324324, + "loss": 0.4316, + "step": 1732 + }, + { + "epoch": 0.5195428143151584, + "grad_norm": 0.2545047998428345, + "learning_rate": 0.00022238738738738738, + "loss": 0.4465, + "step": 1733 + }, + { + "epoch": 0.5198426082068578, + "grad_norm": 0.24640010297298431, + "learning_rate": 0.00022234234234234232, + "loss": 0.4574, + "step": 1734 + }, + { + "epoch": 0.5201424020985572, + "grad_norm": 0.2528793215751648, + "learning_rate": 0.00022229729729729728, + "loss": 0.4423, + "step": 1735 + }, + { + "epoch": 0.5204421959902567, + "grad_norm": 0.24697841703891754, + "learning_rate": 0.00022225225225225222, + "loss": 0.4259, + "step": 1736 + }, + { + "epoch": 0.5207419898819562, + "grad_norm": 0.24195986986160278, + "learning_rate": 0.00022220720720720718, + "loss": 0.439, + "step": 1737 + }, + { + "epoch": 0.5210417837736556, + "grad_norm": 0.2523336410522461, + "learning_rate": 0.00022216216216216215, + "loss": 0.4441, + "step": 1738 + }, + { + "epoch": 0.521341577665355, + "grad_norm": 0.22917968034744263, + "learning_rate": 0.00022211711711711709, + "loss": 0.4152, + "step": 1739 + }, + { + "epoch": 0.5216413715570545, + "grad_norm": 0.2633557915687561, + "learning_rate": 0.00022207207207207205, + "loss": 0.4593, + "step": 1740 + }, + { + "epoch": 0.521941165448754, + "grad_norm": 0.23756282031536102, + "learning_rate": 0.000222027027027027, + "loss": 0.4244, + "step": 1741 + }, + { + "epoch": 0.5222409593404534, + "grad_norm": 0.24200837314128876, + "learning_rate": 0.00022198198198198195, + "loss": 0.4056, + "step": 1742 + }, + { + "epoch": 0.5225407532321529, + "grad_norm": 0.2658364474773407, + "learning_rate": 0.00022193693693693691, + "loss": 0.4432, + "step": 1743 + }, + { + "epoch": 0.5228405471238523, + "grad_norm": 0.2403399795293808, + "learning_rate": 0.00022189189189189188, + "loss": 0.4395, + "step": 1744 + }, + { + "epoch": 0.5231403410155518, + "grad_norm": 0.23752082884311676, + "learning_rate": 0.00022184684684684681, + "loss": 0.4284, + "step": 1745 + }, + { + "epoch": 0.5234401349072513, + "grad_norm": 0.2684466540813446, + "learning_rate": 0.0002218018018018018, + "loss": 0.4266, + "step": 1746 + }, + { + "epoch": 0.5237399287989507, + "grad_norm": 0.24474304914474487, + "learning_rate": 0.00022175675675675674, + "loss": 0.4472, + "step": 1747 + }, + { + "epoch": 0.5240397226906501, + "grad_norm": 0.24958543479442596, + "learning_rate": 0.00022171171171171168, + "loss": 0.4253, + "step": 1748 + }, + { + "epoch": 0.5243395165823497, + "grad_norm": 0.25974011421203613, + "learning_rate": 0.00022166666666666667, + "loss": 0.4477, + "step": 1749 + }, + { + "epoch": 0.5246393104740491, + "grad_norm": 0.23762384057044983, + "learning_rate": 0.0002216216216216216, + "loss": 0.425, + "step": 1750 + }, + { + "epoch": 0.5249391043657485, + "grad_norm": 0.2543089687824249, + "learning_rate": 0.00022157657657657654, + "loss": 0.3947, + "step": 1751 + }, + { + "epoch": 0.525238898257448, + "grad_norm": 0.26260843873023987, + "learning_rate": 0.00022153153153153153, + "loss": 0.4642, + "step": 1752 + }, + { + "epoch": 0.5255386921491475, + "grad_norm": 0.2474740445613861, + "learning_rate": 0.00022148648648648647, + "loss": 0.4468, + "step": 1753 + }, + { + "epoch": 0.5258384860408469, + "grad_norm": 0.24307985603809357, + "learning_rate": 0.0002214414414414414, + "loss": 0.4537, + "step": 1754 + }, + { + "epoch": 0.5261382799325464, + "grad_norm": 0.27346494793891907, + "learning_rate": 0.0002213963963963964, + "loss": 0.4789, + "step": 1755 + }, + { + "epoch": 0.5264380738242458, + "grad_norm": 0.2407042384147644, + "learning_rate": 0.00022135135135135134, + "loss": 0.4334, + "step": 1756 + }, + { + "epoch": 0.5267378677159453, + "grad_norm": 0.2311742603778839, + "learning_rate": 0.00022130630630630627, + "loss": 0.4117, + "step": 1757 + }, + { + "epoch": 0.5270376616076448, + "grad_norm": 0.23060280084609985, + "learning_rate": 0.00022126126126126126, + "loss": 0.4413, + "step": 1758 + }, + { + "epoch": 0.5273374554993442, + "grad_norm": 0.286851167678833, + "learning_rate": 0.0002212162162162162, + "loss": 0.461, + "step": 1759 + }, + { + "epoch": 0.5276372493910436, + "grad_norm": 0.23764149844646454, + "learning_rate": 0.00022117117117117114, + "loss": 0.4428, + "step": 1760 + }, + { + "epoch": 0.5279370432827432, + "grad_norm": 0.24021115899085999, + "learning_rate": 0.0002211261261261261, + "loss": 0.4416, + "step": 1761 + }, + { + "epoch": 0.5282368371744426, + "grad_norm": 0.25310957431793213, + "learning_rate": 0.00022108108108108106, + "loss": 0.4474, + "step": 1762 + }, + { + "epoch": 0.528536631066142, + "grad_norm": 0.2636144161224365, + "learning_rate": 0.000221036036036036, + "loss": 0.4629, + "step": 1763 + }, + { + "epoch": 0.5288364249578414, + "grad_norm": 0.254807710647583, + "learning_rate": 0.00022099099099099097, + "loss": 0.4558, + "step": 1764 + }, + { + "epoch": 0.529136218849541, + "grad_norm": 0.2389029860496521, + "learning_rate": 0.00022094594594594593, + "loss": 0.4519, + "step": 1765 + }, + { + "epoch": 0.5294360127412404, + "grad_norm": 0.24269163608551025, + "learning_rate": 0.00022090090090090087, + "loss": 0.4371, + "step": 1766 + }, + { + "epoch": 0.5297358066329398, + "grad_norm": 0.2602348029613495, + "learning_rate": 0.00022085585585585583, + "loss": 0.4474, + "step": 1767 + }, + { + "epoch": 0.5300356005246393, + "grad_norm": 0.2557549774646759, + "learning_rate": 0.0002208108108108108, + "loss": 0.4709, + "step": 1768 + }, + { + "epoch": 0.5303353944163388, + "grad_norm": 0.27289271354675293, + "learning_rate": 0.00022076576576576576, + "loss": 0.4248, + "step": 1769 + }, + { + "epoch": 0.5306351883080382, + "grad_norm": 0.27305862307548523, + "learning_rate": 0.0002207207207207207, + "loss": 0.4566, + "step": 1770 + }, + { + "epoch": 0.5309349821997377, + "grad_norm": 0.2508034408092499, + "learning_rate": 0.00022067567567567566, + "loss": 0.4334, + "step": 1771 + }, + { + "epoch": 0.5312347760914371, + "grad_norm": 0.24729134142398834, + "learning_rate": 0.00022063063063063062, + "loss": 0.4251, + "step": 1772 + }, + { + "epoch": 0.5315345699831366, + "grad_norm": 0.23145246505737305, + "learning_rate": 0.00022058558558558556, + "loss": 0.3992, + "step": 1773 + }, + { + "epoch": 0.5318343638748361, + "grad_norm": 0.24190931022167206, + "learning_rate": 0.00022054054054054052, + "loss": 0.4183, + "step": 1774 + }, + { + "epoch": 0.5321341577665355, + "grad_norm": 0.2671104073524475, + "learning_rate": 0.0002204954954954955, + "loss": 0.4548, + "step": 1775 + }, + { + "epoch": 0.5324339516582349, + "grad_norm": 0.2361837476491928, + "learning_rate": 0.00022045045045045042, + "loss": 0.414, + "step": 1776 + }, + { + "epoch": 0.5327337455499345, + "grad_norm": 0.25529375672340393, + "learning_rate": 0.0002204054054054054, + "loss": 0.4345, + "step": 1777 + }, + { + "epoch": 0.5330335394416339, + "grad_norm": 0.26258841156959534, + "learning_rate": 0.00022036036036036035, + "loss": 0.4522, + "step": 1778 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.24532164633274078, + "learning_rate": 0.0002203153153153153, + "loss": 0.4303, + "step": 1779 + }, + { + "epoch": 0.5336331272250328, + "grad_norm": 0.24562229216098785, + "learning_rate": 0.00022027027027027028, + "loss": 0.427, + "step": 1780 + }, + { + "epoch": 0.5339329211167323, + "grad_norm": 0.25493496656417847, + "learning_rate": 0.00022022522522522522, + "loss": 0.45, + "step": 1781 + }, + { + "epoch": 0.5342327150084317, + "grad_norm": 0.25944793224334717, + "learning_rate": 0.00022018018018018015, + "loss": 0.449, + "step": 1782 + }, + { + "epoch": 0.5345325089001312, + "grad_norm": 0.24805709719657898, + "learning_rate": 0.00022013513513513514, + "loss": 0.4431, + "step": 1783 + }, + { + "epoch": 0.5348323027918306, + "grad_norm": 0.27083903551101685, + "learning_rate": 0.00022009009009009008, + "loss": 0.465, + "step": 1784 + }, + { + "epoch": 0.5351320966835301, + "grad_norm": 0.2876584827899933, + "learning_rate": 0.00022004504504504502, + "loss": 0.4859, + "step": 1785 + }, + { + "epoch": 0.5354318905752296, + "grad_norm": 0.2499544471502304, + "learning_rate": 0.00021999999999999995, + "loss": 0.4551, + "step": 1786 + }, + { + "epoch": 0.535731684466929, + "grad_norm": 0.2812490165233612, + "learning_rate": 0.00021995495495495494, + "loss": 0.4554, + "step": 1787 + }, + { + "epoch": 0.5360314783586284, + "grad_norm": 0.2530219256877899, + "learning_rate": 0.00021990990990990988, + "loss": 0.4335, + "step": 1788 + }, + { + "epoch": 0.536331272250328, + "grad_norm": 0.24573327600955963, + "learning_rate": 0.00021986486486486482, + "loss": 0.4574, + "step": 1789 + }, + { + "epoch": 0.5366310661420274, + "grad_norm": 0.3053630292415619, + "learning_rate": 0.0002198198198198198, + "loss": 0.4745, + "step": 1790 + }, + { + "epoch": 0.5369308600337268, + "grad_norm": 0.24176299571990967, + "learning_rate": 0.00021977477477477475, + "loss": 0.4244, + "step": 1791 + }, + { + "epoch": 0.5372306539254262, + "grad_norm": 0.27554550766944885, + "learning_rate": 0.0002197297297297297, + "loss": 0.469, + "step": 1792 + }, + { + "epoch": 0.5375304478171258, + "grad_norm": 0.28003454208374023, + "learning_rate": 0.00021968468468468467, + "loss": 0.4477, + "step": 1793 + }, + { + "epoch": 0.5378302417088252, + "grad_norm": 0.27388301491737366, + "learning_rate": 0.0002196396396396396, + "loss": 0.4602, + "step": 1794 + }, + { + "epoch": 0.5381300356005246, + "grad_norm": 0.2521568238735199, + "learning_rate": 0.00021959459459459457, + "loss": 0.4507, + "step": 1795 + }, + { + "epoch": 0.5384298294922241, + "grad_norm": 0.24554435908794403, + "learning_rate": 0.00021954954954954954, + "loss": 0.4387, + "step": 1796 + }, + { + "epoch": 0.5387296233839236, + "grad_norm": 0.24148909747600555, + "learning_rate": 0.00021950450450450447, + "loss": 0.4316, + "step": 1797 + }, + { + "epoch": 0.539029417275623, + "grad_norm": 0.24902650713920593, + "learning_rate": 0.00021945945945945944, + "loss": 0.448, + "step": 1798 + }, + { + "epoch": 0.5393292111673225, + "grad_norm": 0.25397536158561707, + "learning_rate": 0.0002194144144144144, + "loss": 0.479, + "step": 1799 + }, + { + "epoch": 0.5396290050590219, + "grad_norm": 0.2324562668800354, + "learning_rate": 0.00021936936936936934, + "loss": 0.4077, + "step": 1800 + }, + { + "epoch": 0.5399287989507214, + "grad_norm": 0.26509541273117065, + "learning_rate": 0.0002193243243243243, + "loss": 0.4628, + "step": 1801 + }, + { + "epoch": 0.5402285928424209, + "grad_norm": 0.24714629352092743, + "learning_rate": 0.00021927927927927927, + "loss": 0.4393, + "step": 1802 + }, + { + "epoch": 0.5405283867341203, + "grad_norm": 0.2634679079055786, + "learning_rate": 0.00021923423423423423, + "loss": 0.4508, + "step": 1803 + }, + { + "epoch": 0.5408281806258197, + "grad_norm": 0.2673392593860626, + "learning_rate": 0.00021918918918918917, + "loss": 0.4587, + "step": 1804 + }, + { + "epoch": 0.5411279745175192, + "grad_norm": 0.2438841462135315, + "learning_rate": 0.00021914414414414413, + "loss": 0.4271, + "step": 1805 + }, + { + "epoch": 0.5414277684092187, + "grad_norm": 0.2463088482618332, + "learning_rate": 0.0002190990990990991, + "loss": 0.4216, + "step": 1806 + }, + { + "epoch": 0.5417275623009181, + "grad_norm": 0.22443120181560516, + "learning_rate": 0.00021905405405405403, + "loss": 0.3935, + "step": 1807 + }, + { + "epoch": 0.5420273561926175, + "grad_norm": 0.2647371292114258, + "learning_rate": 0.00021900900900900897, + "loss": 0.4474, + "step": 1808 + }, + { + "epoch": 0.542327150084317, + "grad_norm": 0.2505868077278137, + "learning_rate": 0.00021896396396396396, + "loss": 0.4623, + "step": 1809 + }, + { + "epoch": 0.5426269439760165, + "grad_norm": 0.2395186722278595, + "learning_rate": 0.0002189189189189189, + "loss": 0.4469, + "step": 1810 + }, + { + "epoch": 0.542926737867716, + "grad_norm": 0.2345222383737564, + "learning_rate": 0.00021887387387387383, + "loss": 0.4127, + "step": 1811 + }, + { + "epoch": 0.5432265317594154, + "grad_norm": 0.25592970848083496, + "learning_rate": 0.00021882882882882882, + "loss": 0.4642, + "step": 1812 + }, + { + "epoch": 0.5435263256511148, + "grad_norm": 0.2604975700378418, + "learning_rate": 0.00021878378378378376, + "loss": 0.461, + "step": 1813 + }, + { + "epoch": 0.5438261195428143, + "grad_norm": 0.2413945347070694, + "learning_rate": 0.0002187387387387387, + "loss": 0.4281, + "step": 1814 + }, + { + "epoch": 0.5441259134345138, + "grad_norm": 0.2543574571609497, + "learning_rate": 0.0002186936936936937, + "loss": 0.4657, + "step": 1815 + }, + { + "epoch": 0.5444257073262132, + "grad_norm": 0.2783868610858917, + "learning_rate": 0.00021864864864864863, + "loss": 0.4917, + "step": 1816 + }, + { + "epoch": 0.5447255012179126, + "grad_norm": 0.23379693925380707, + "learning_rate": 0.00021860360360360356, + "loss": 0.4324, + "step": 1817 + }, + { + "epoch": 0.5450252951096122, + "grad_norm": 0.22307537496089935, + "learning_rate": 0.00021855855855855855, + "loss": 0.4175, + "step": 1818 + }, + { + "epoch": 0.5453250890013116, + "grad_norm": 0.23716312646865845, + "learning_rate": 0.0002185135135135135, + "loss": 0.4195, + "step": 1819 + }, + { + "epoch": 0.545624882893011, + "grad_norm": 0.2469712346792221, + "learning_rate": 0.00021846846846846843, + "loss": 0.4386, + "step": 1820 + }, + { + "epoch": 0.5459246767847105, + "grad_norm": 0.24840980768203735, + "learning_rate": 0.00021842342342342342, + "loss": 0.4193, + "step": 1821 + }, + { + "epoch": 0.54622447067641, + "grad_norm": 0.25362369418144226, + "learning_rate": 0.00021837837837837835, + "loss": 0.445, + "step": 1822 + }, + { + "epoch": 0.5465242645681094, + "grad_norm": 0.23812243342399597, + "learning_rate": 0.0002183333333333333, + "loss": 0.4511, + "step": 1823 + }, + { + "epoch": 0.5468240584598089, + "grad_norm": 0.25310468673706055, + "learning_rate": 0.00021828828828828828, + "loss": 0.4463, + "step": 1824 + }, + { + "epoch": 0.5471238523515083, + "grad_norm": 0.25943437218666077, + "learning_rate": 0.00021824324324324322, + "loss": 0.42, + "step": 1825 + }, + { + "epoch": 0.5474236462432078, + "grad_norm": 0.270178884267807, + "learning_rate": 0.00021819819819819818, + "loss": 0.4666, + "step": 1826 + }, + { + "epoch": 0.5477234401349073, + "grad_norm": 0.23301197588443756, + "learning_rate": 0.00021815315315315315, + "loss": 0.407, + "step": 1827 + }, + { + "epoch": 0.5480232340266067, + "grad_norm": 0.2634783983230591, + "learning_rate": 0.00021810810810810808, + "loss": 0.4596, + "step": 1828 + }, + { + "epoch": 0.5483230279183061, + "grad_norm": 0.24297159910202026, + "learning_rate": 0.00021806306306306305, + "loss": 0.4107, + "step": 1829 + }, + { + "epoch": 0.5486228218100057, + "grad_norm": 0.2781638503074646, + "learning_rate": 0.000218018018018018, + "loss": 0.4686, + "step": 1830 + }, + { + "epoch": 0.5489226157017051, + "grad_norm": 0.25801247358322144, + "learning_rate": 0.00021797297297297295, + "loss": 0.4644, + "step": 1831 + }, + { + "epoch": 0.5492224095934045, + "grad_norm": 0.27080485224723816, + "learning_rate": 0.0002179279279279279, + "loss": 0.428, + "step": 1832 + }, + { + "epoch": 0.5495222034851039, + "grad_norm": 0.2617061138153076, + "learning_rate": 0.00021788288288288285, + "loss": 0.4437, + "step": 1833 + }, + { + "epoch": 0.5498219973768035, + "grad_norm": 0.26249265670776367, + "learning_rate": 0.0002178378378378378, + "loss": 0.4413, + "step": 1834 + }, + { + "epoch": 0.5501217912685029, + "grad_norm": 0.2705504894256592, + "learning_rate": 0.00021779279279279278, + "loss": 0.4716, + "step": 1835 + }, + { + "epoch": 0.5504215851602023, + "grad_norm": 0.25836360454559326, + "learning_rate": 0.0002177477477477477, + "loss": 0.454, + "step": 1836 + }, + { + "epoch": 0.5507213790519018, + "grad_norm": 0.23872222006320953, + "learning_rate": 0.0002177027027027027, + "loss": 0.4173, + "step": 1837 + }, + { + "epoch": 0.5510211729436013, + "grad_norm": 0.2438534051179886, + "learning_rate": 0.00021765765765765764, + "loss": 0.4114, + "step": 1838 + }, + { + "epoch": 0.5513209668353007, + "grad_norm": 0.2600691318511963, + "learning_rate": 0.00021761261261261258, + "loss": 0.4364, + "step": 1839 + }, + { + "epoch": 0.5516207607270002, + "grad_norm": 0.2483833283185959, + "learning_rate": 0.00021756756756756757, + "loss": 0.4544, + "step": 1840 + }, + { + "epoch": 0.5519205546186996, + "grad_norm": 0.2506955564022064, + "learning_rate": 0.0002175225225225225, + "loss": 0.4561, + "step": 1841 + }, + { + "epoch": 0.5522203485103991, + "grad_norm": 0.2800386846065521, + "learning_rate": 0.00021747747747747744, + "loss": 0.438, + "step": 1842 + }, + { + "epoch": 0.5525201424020986, + "grad_norm": 0.2458580881357193, + "learning_rate": 0.00021743243243243243, + "loss": 0.4431, + "step": 1843 + }, + { + "epoch": 0.552819936293798, + "grad_norm": 0.24995484948158264, + "learning_rate": 0.00021738738738738737, + "loss": 0.4349, + "step": 1844 + }, + { + "epoch": 0.5531197301854974, + "grad_norm": 0.2502653896808624, + "learning_rate": 0.0002173423423423423, + "loss": 0.4366, + "step": 1845 + }, + { + "epoch": 0.553419524077197, + "grad_norm": 0.2538207471370697, + "learning_rate": 0.0002172972972972973, + "loss": 0.4381, + "step": 1846 + }, + { + "epoch": 0.5537193179688964, + "grad_norm": 0.2417684644460678, + "learning_rate": 0.00021725225225225223, + "loss": 0.4277, + "step": 1847 + }, + { + "epoch": 0.5540191118605958, + "grad_norm": 0.2531186044216156, + "learning_rate": 0.00021720720720720717, + "loss": 0.4361, + "step": 1848 + }, + { + "epoch": 0.5543189057522953, + "grad_norm": 0.2551022171974182, + "learning_rate": 0.00021716216216216216, + "loss": 0.4483, + "step": 1849 + }, + { + "epoch": 0.5546186996439948, + "grad_norm": 0.23375505208969116, + "learning_rate": 0.0002171171171171171, + "loss": 0.4157, + "step": 1850 + }, + { + "epoch": 0.5549184935356942, + "grad_norm": 0.23428985476493835, + "learning_rate": 0.00021707207207207204, + "loss": 0.4092, + "step": 1851 + }, + { + "epoch": 0.5552182874273937, + "grad_norm": 0.24895919859409332, + "learning_rate": 0.00021702702702702703, + "loss": 0.4377, + "step": 1852 + }, + { + "epoch": 0.5555180813190931, + "grad_norm": 0.23306754231452942, + "learning_rate": 0.00021698198198198196, + "loss": 0.4316, + "step": 1853 + }, + { + "epoch": 0.5558178752107926, + "grad_norm": 0.2527741491794586, + "learning_rate": 0.0002169369369369369, + "loss": 0.4245, + "step": 1854 + }, + { + "epoch": 0.556117669102492, + "grad_norm": 0.2319868952035904, + "learning_rate": 0.0002168918918918919, + "loss": 0.4074, + "step": 1855 + }, + { + "epoch": 0.5564174629941915, + "grad_norm": 0.25132691860198975, + "learning_rate": 0.00021684684684684683, + "loss": 0.4378, + "step": 1856 + }, + { + "epoch": 0.5567172568858909, + "grad_norm": 0.24814215302467346, + "learning_rate": 0.00021680180180180177, + "loss": 0.4639, + "step": 1857 + }, + { + "epoch": 0.5570170507775905, + "grad_norm": 0.23483987152576447, + "learning_rate": 0.00021675675675675673, + "loss": 0.4419, + "step": 1858 + }, + { + "epoch": 0.5573168446692899, + "grad_norm": 0.24670268595218658, + "learning_rate": 0.0002167117117117117, + "loss": 0.436, + "step": 1859 + }, + { + "epoch": 0.5576166385609893, + "grad_norm": 0.23737584054470062, + "learning_rate": 0.00021666666666666666, + "loss": 0.4246, + "step": 1860 + }, + { + "epoch": 0.5579164324526887, + "grad_norm": 0.2634913921356201, + "learning_rate": 0.0002166216216216216, + "loss": 0.4252, + "step": 1861 + }, + { + "epoch": 0.5582162263443883, + "grad_norm": 0.25565293431282043, + "learning_rate": 0.00021657657657657656, + "loss": 0.4287, + "step": 1862 + }, + { + "epoch": 0.5585160202360877, + "grad_norm": 0.2408502846956253, + "learning_rate": 0.00021653153153153152, + "loss": 0.4259, + "step": 1863 + }, + { + "epoch": 0.5588158141277871, + "grad_norm": 0.2516701817512512, + "learning_rate": 0.00021648648648648646, + "loss": 0.4361, + "step": 1864 + }, + { + "epoch": 0.5591156080194866, + "grad_norm": 0.2645619213581085, + "learning_rate": 0.00021644144144144142, + "loss": 0.4336, + "step": 1865 + }, + { + "epoch": 0.5594154019111861, + "grad_norm": 0.27260729670524597, + "learning_rate": 0.00021639639639639639, + "loss": 0.4494, + "step": 1866 + }, + { + "epoch": 0.5597151958028855, + "grad_norm": 0.2506016492843628, + "learning_rate": 0.00021635135135135132, + "loss": 0.4027, + "step": 1867 + }, + { + "epoch": 0.560014989694585, + "grad_norm": 0.2552817463874817, + "learning_rate": 0.00021630630630630629, + "loss": 0.426, + "step": 1868 + }, + { + "epoch": 0.5603147835862844, + "grad_norm": 0.2529433071613312, + "learning_rate": 0.00021626126126126125, + "loss": 0.4327, + "step": 1869 + }, + { + "epoch": 0.5606145774779839, + "grad_norm": 0.2483612447977066, + "learning_rate": 0.0002162162162162162, + "loss": 0.4492, + "step": 1870 + }, + { + "epoch": 0.5609143713696834, + "grad_norm": 0.25298991799354553, + "learning_rate": 0.00021617117117117118, + "loss": 0.4442, + "step": 1871 + }, + { + "epoch": 0.5612141652613828, + "grad_norm": 0.243782639503479, + "learning_rate": 0.00021612612612612611, + "loss": 0.4462, + "step": 1872 + }, + { + "epoch": 0.5615139591530822, + "grad_norm": 0.24677562713623047, + "learning_rate": 0.00021608108108108105, + "loss": 0.433, + "step": 1873 + }, + { + "epoch": 0.5618137530447818, + "grad_norm": 0.2761828601360321, + "learning_rate": 0.00021603603603603604, + "loss": 0.4436, + "step": 1874 + }, + { + "epoch": 0.5621135469364812, + "grad_norm": 0.24887466430664062, + "learning_rate": 0.00021599099099099098, + "loss": 0.4211, + "step": 1875 + }, + { + "epoch": 0.5624133408281806, + "grad_norm": 0.2687954902648926, + "learning_rate": 0.00021594594594594592, + "loss": 0.4545, + "step": 1876 + }, + { + "epoch": 0.56271313471988, + "grad_norm": 0.2376958280801773, + "learning_rate": 0.0002159009009009009, + "loss": 0.4062, + "step": 1877 + }, + { + "epoch": 0.5630129286115796, + "grad_norm": 0.25497403740882874, + "learning_rate": 0.00021585585585585584, + "loss": 0.4512, + "step": 1878 + }, + { + "epoch": 0.563312722503279, + "grad_norm": 0.27644020318984985, + "learning_rate": 0.00021581081081081078, + "loss": 0.4317, + "step": 1879 + }, + { + "epoch": 0.5636125163949784, + "grad_norm": 0.274746298789978, + "learning_rate": 0.00021576576576576577, + "loss": 0.4523, + "step": 1880 + }, + { + "epoch": 0.5639123102866779, + "grad_norm": 0.24522797763347626, + "learning_rate": 0.0002157207207207207, + "loss": 0.4193, + "step": 1881 + }, + { + "epoch": 0.5642121041783774, + "grad_norm": 0.26614081859588623, + "learning_rate": 0.00021567567567567565, + "loss": 0.4474, + "step": 1882 + }, + { + "epoch": 0.5645118980700768, + "grad_norm": 0.25119736790657043, + "learning_rate": 0.0002156306306306306, + "loss": 0.4349, + "step": 1883 + }, + { + "epoch": 0.5648116919617763, + "grad_norm": 0.2535664737224579, + "learning_rate": 0.00021558558558558557, + "loss": 0.4395, + "step": 1884 + }, + { + "epoch": 0.5651114858534757, + "grad_norm": 0.27441659569740295, + "learning_rate": 0.0002155405405405405, + "loss": 0.4591, + "step": 1885 + }, + { + "epoch": 0.5654112797451752, + "grad_norm": 0.2535187900066376, + "learning_rate": 0.00021549549549549547, + "loss": 0.4256, + "step": 1886 + }, + { + "epoch": 0.5657110736368747, + "grad_norm": 0.2877647876739502, + "learning_rate": 0.00021545045045045044, + "loss": 0.4877, + "step": 1887 + }, + { + "epoch": 0.5660108675285741, + "grad_norm": 0.23665641248226166, + "learning_rate": 0.00021540540540540537, + "loss": 0.4506, + "step": 1888 + }, + { + "epoch": 0.5663106614202735, + "grad_norm": 0.2620941996574402, + "learning_rate": 0.00021536036036036034, + "loss": 0.4415, + "step": 1889 + }, + { + "epoch": 0.5666104553119731, + "grad_norm": 0.2704925239086151, + "learning_rate": 0.0002153153153153153, + "loss": 0.4855, + "step": 1890 + }, + { + "epoch": 0.5669102492036725, + "grad_norm": 0.2636096775531769, + "learning_rate": 0.00021527027027027024, + "loss": 0.4196, + "step": 1891 + }, + { + "epoch": 0.5672100430953719, + "grad_norm": 0.2897530496120453, + "learning_rate": 0.0002152252252252252, + "loss": 0.4556, + "step": 1892 + }, + { + "epoch": 0.5675098369870714, + "grad_norm": 0.2407667189836502, + "learning_rate": 0.00021518018018018017, + "loss": 0.4406, + "step": 1893 + }, + { + "epoch": 0.5678096308787709, + "grad_norm": 0.277865469455719, + "learning_rate": 0.00021513513513513513, + "loss": 0.4033, + "step": 1894 + }, + { + "epoch": 0.5681094247704703, + "grad_norm": 0.2540576159954071, + "learning_rate": 0.00021509009009009007, + "loss": 0.4355, + "step": 1895 + }, + { + "epoch": 0.5684092186621698, + "grad_norm": 0.26155397295951843, + "learning_rate": 0.00021504504504504503, + "loss": 0.4235, + "step": 1896 + }, + { + "epoch": 0.5687090125538692, + "grad_norm": 0.25544247031211853, + "learning_rate": 0.000215, + "loss": 0.4279, + "step": 1897 + }, + { + "epoch": 0.5690088064455687, + "grad_norm": 0.2406405806541443, + "learning_rate": 0.00021495495495495493, + "loss": 0.4208, + "step": 1898 + }, + { + "epoch": 0.5693086003372682, + "grad_norm": 0.24712449312210083, + "learning_rate": 0.0002149099099099099, + "loss": 0.4344, + "step": 1899 + }, + { + "epoch": 0.5696083942289676, + "grad_norm": 0.26752716302871704, + "learning_rate": 0.00021486486486486486, + "loss": 0.4934, + "step": 1900 + }, + { + "epoch": 0.569908188120667, + "grad_norm": 0.24652232229709625, + "learning_rate": 0.0002148198198198198, + "loss": 0.4546, + "step": 1901 + }, + { + "epoch": 0.5702079820123664, + "grad_norm": 0.251396507024765, + "learning_rate": 0.00021477477477477476, + "loss": 0.4738, + "step": 1902 + }, + { + "epoch": 0.570507775904066, + "grad_norm": 0.24406296014785767, + "learning_rate": 0.00021472972972972972, + "loss": 0.4536, + "step": 1903 + }, + { + "epoch": 0.5708075697957654, + "grad_norm": 0.2634785771369934, + "learning_rate": 0.00021468468468468466, + "loss": 0.4653, + "step": 1904 + }, + { + "epoch": 0.5711073636874648, + "grad_norm": 0.24010764062404633, + "learning_rate": 0.0002146396396396396, + "loss": 0.4188, + "step": 1905 + }, + { + "epoch": 0.5714071575791643, + "grad_norm": 0.24869616329669952, + "learning_rate": 0.0002145945945945946, + "loss": 0.4474, + "step": 1906 + }, + { + "epoch": 0.5717069514708638, + "grad_norm": 0.24654364585876465, + "learning_rate": 0.00021454954954954953, + "loss": 0.4409, + "step": 1907 + }, + { + "epoch": 0.5720067453625632, + "grad_norm": 0.29856958985328674, + "learning_rate": 0.00021450450450450446, + "loss": 0.4912, + "step": 1908 + }, + { + "epoch": 0.5723065392542627, + "grad_norm": 0.2449256181716919, + "learning_rate": 0.00021445945945945945, + "loss": 0.4501, + "step": 1909 + }, + { + "epoch": 0.5726063331459621, + "grad_norm": 0.29776662588119507, + "learning_rate": 0.0002144144144144144, + "loss": 0.4403, + "step": 1910 + }, + { + "epoch": 0.5729061270376616, + "grad_norm": 0.26075392961502075, + "learning_rate": 0.00021436936936936933, + "loss": 0.4325, + "step": 1911 + }, + { + "epoch": 0.5732059209293611, + "grad_norm": 0.23287932574748993, + "learning_rate": 0.00021432432432432432, + "loss": 0.4265, + "step": 1912 + }, + { + "epoch": 0.5735057148210605, + "grad_norm": 0.2624457776546478, + "learning_rate": 0.00021427927927927925, + "loss": 0.4433, + "step": 1913 + }, + { + "epoch": 0.5738055087127599, + "grad_norm": 0.23440878093242645, + "learning_rate": 0.0002142342342342342, + "loss": 0.4136, + "step": 1914 + }, + { + "epoch": 0.5741053026044595, + "grad_norm": 0.2694714069366455, + "learning_rate": 0.00021418918918918918, + "loss": 0.4508, + "step": 1915 + }, + { + "epoch": 0.5744050964961589, + "grad_norm": 0.261952668428421, + "learning_rate": 0.00021414414414414412, + "loss": 0.4697, + "step": 1916 + }, + { + "epoch": 0.5747048903878583, + "grad_norm": 0.2526634931564331, + "learning_rate": 0.00021409909909909908, + "loss": 0.4518, + "step": 1917 + }, + { + "epoch": 0.5750046842795578, + "grad_norm": 0.2554527223110199, + "learning_rate": 0.00021405405405405405, + "loss": 0.4802, + "step": 1918 + }, + { + "epoch": 0.5753044781712573, + "grad_norm": 0.2729927599430084, + "learning_rate": 0.00021400900900900898, + "loss": 0.4622, + "step": 1919 + }, + { + "epoch": 0.5756042720629567, + "grad_norm": 0.26347601413726807, + "learning_rate": 0.00021396396396396395, + "loss": 0.4709, + "step": 1920 + }, + { + "epoch": 0.5759040659546562, + "grad_norm": 0.27795466780662537, + "learning_rate": 0.0002139189189189189, + "loss": 0.4555, + "step": 1921 + }, + { + "epoch": 0.5762038598463556, + "grad_norm": 0.25687262415885925, + "learning_rate": 0.00021387387387387385, + "loss": 0.4459, + "step": 1922 + }, + { + "epoch": 0.5765036537380551, + "grad_norm": 0.24855007231235504, + "learning_rate": 0.0002138288288288288, + "loss": 0.3944, + "step": 1923 + }, + { + "epoch": 0.5768034476297546, + "grad_norm": 0.26772525906562805, + "learning_rate": 0.00021378378378378378, + "loss": 0.4557, + "step": 1924 + }, + { + "epoch": 0.577103241521454, + "grad_norm": 0.2783243954181671, + "learning_rate": 0.0002137387387387387, + "loss": 0.4507, + "step": 1925 + }, + { + "epoch": 0.5774030354131534, + "grad_norm": 0.26769769191741943, + "learning_rate": 0.00021369369369369368, + "loss": 0.4371, + "step": 1926 + }, + { + "epoch": 0.577702829304853, + "grad_norm": 0.2783718705177307, + "learning_rate": 0.00021364864864864864, + "loss": 0.4269, + "step": 1927 + }, + { + "epoch": 0.5780026231965524, + "grad_norm": 0.2654764652252197, + "learning_rate": 0.0002136036036036036, + "loss": 0.4407, + "step": 1928 + }, + { + "epoch": 0.5783024170882518, + "grad_norm": 0.2694533169269562, + "learning_rate": 0.00021355855855855854, + "loss": 0.4822, + "step": 1929 + }, + { + "epoch": 0.5786022109799512, + "grad_norm": 0.2403596192598343, + "learning_rate": 0.00021351351351351348, + "loss": 0.3985, + "step": 1930 + }, + { + "epoch": 0.5789020048716508, + "grad_norm": 0.26435697078704834, + "learning_rate": 0.00021346846846846847, + "loss": 0.4547, + "step": 1931 + }, + { + "epoch": 0.5792017987633502, + "grad_norm": 0.2416696548461914, + "learning_rate": 0.0002134234234234234, + "loss": 0.406, + "step": 1932 + }, + { + "epoch": 0.5795015926550496, + "grad_norm": 0.2664091885089874, + "learning_rate": 0.00021337837837837834, + "loss": 0.412, + "step": 1933 + }, + { + "epoch": 0.5798013865467491, + "grad_norm": 0.24384742975234985, + "learning_rate": 0.00021333333333333333, + "loss": 0.4621, + "step": 1934 + }, + { + "epoch": 0.5801011804384486, + "grad_norm": 0.27569761872291565, + "learning_rate": 0.00021328828828828827, + "loss": 0.4643, + "step": 1935 + }, + { + "epoch": 0.580400974330148, + "grad_norm": 0.2567230761051178, + "learning_rate": 0.0002132432432432432, + "loss": 0.4231, + "step": 1936 + }, + { + "epoch": 0.5807007682218475, + "grad_norm": 0.25014641880989075, + "learning_rate": 0.0002131981981981982, + "loss": 0.4461, + "step": 1937 + }, + { + "epoch": 0.5810005621135469, + "grad_norm": 0.2679264545440674, + "learning_rate": 0.00021315315315315313, + "loss": 0.4321, + "step": 1938 + }, + { + "epoch": 0.5813003560052464, + "grad_norm": 0.2585947513580322, + "learning_rate": 0.00021310810810810807, + "loss": 0.4486, + "step": 1939 + }, + { + "epoch": 0.5816001498969459, + "grad_norm": 0.2640276253223419, + "learning_rate": 0.00021306306306306306, + "loss": 0.4439, + "step": 1940 + }, + { + "epoch": 0.5818999437886453, + "grad_norm": 0.2556924521923065, + "learning_rate": 0.000213018018018018, + "loss": 0.4557, + "step": 1941 + }, + { + "epoch": 0.5821997376803447, + "grad_norm": 0.2560097873210907, + "learning_rate": 0.00021297297297297294, + "loss": 0.4384, + "step": 1942 + }, + { + "epoch": 0.5824995315720443, + "grad_norm": 0.25773030519485474, + "learning_rate": 0.00021292792792792793, + "loss": 0.4289, + "step": 1943 + }, + { + "epoch": 0.5827993254637437, + "grad_norm": 0.26476195454597473, + "learning_rate": 0.00021288288288288286, + "loss": 0.4513, + "step": 1944 + }, + { + "epoch": 0.5830991193554431, + "grad_norm": 0.26929306983947754, + "learning_rate": 0.0002128378378378378, + "loss": 0.4464, + "step": 1945 + }, + { + "epoch": 0.5833989132471425, + "grad_norm": 0.25713875889778137, + "learning_rate": 0.0002127927927927928, + "loss": 0.4422, + "step": 1946 + }, + { + "epoch": 0.5836987071388421, + "grad_norm": 0.24906396865844727, + "learning_rate": 0.00021274774774774773, + "loss": 0.4454, + "step": 1947 + }, + { + "epoch": 0.5839985010305415, + "grad_norm": 0.24760214984416962, + "learning_rate": 0.00021270270270270266, + "loss": 0.4343, + "step": 1948 + }, + { + "epoch": 0.584298294922241, + "grad_norm": 0.25164082646369934, + "learning_rate": 0.00021265765765765766, + "loss": 0.4455, + "step": 1949 + }, + { + "epoch": 0.5845980888139404, + "grad_norm": 0.24564093351364136, + "learning_rate": 0.0002126126126126126, + "loss": 0.4389, + "step": 1950 + }, + { + "epoch": 0.5848978827056399, + "grad_norm": 0.2754795551300049, + "learning_rate": 0.00021256756756756756, + "loss": 0.466, + "step": 1951 + }, + { + "epoch": 0.5851976765973393, + "grad_norm": 0.2439223974943161, + "learning_rate": 0.00021252252252252252, + "loss": 0.4425, + "step": 1952 + }, + { + "epoch": 0.5854974704890388, + "grad_norm": 0.24320489168167114, + "learning_rate": 0.00021247747747747746, + "loss": 0.4085, + "step": 1953 + }, + { + "epoch": 0.5857972643807382, + "grad_norm": 0.2758175730705261, + "learning_rate": 0.00021243243243243242, + "loss": 0.4606, + "step": 1954 + }, + { + "epoch": 0.5860970582724377, + "grad_norm": 0.2623588442802429, + "learning_rate": 0.00021238738738738736, + "loss": 0.4167, + "step": 1955 + }, + { + "epoch": 0.5863968521641372, + "grad_norm": 0.2561275064945221, + "learning_rate": 0.00021234234234234232, + "loss": 0.4399, + "step": 1956 + }, + { + "epoch": 0.5866966460558366, + "grad_norm": 0.2818892300128937, + "learning_rate": 0.00021229729729729728, + "loss": 0.4628, + "step": 1957 + }, + { + "epoch": 0.586996439947536, + "grad_norm": 0.26609155535697937, + "learning_rate": 0.00021225225225225222, + "loss": 0.4285, + "step": 1958 + }, + { + "epoch": 0.5872962338392356, + "grad_norm": 0.2503769099712372, + "learning_rate": 0.00021220720720720719, + "loss": 0.4632, + "step": 1959 + }, + { + "epoch": 0.587596027730935, + "grad_norm": 0.2810426950454712, + "learning_rate": 0.00021216216216216215, + "loss": 0.4591, + "step": 1960 + }, + { + "epoch": 0.5878958216226344, + "grad_norm": 0.2517390251159668, + "learning_rate": 0.00021211711711711709, + "loss": 0.4323, + "step": 1961 + }, + { + "epoch": 0.5881956155143339, + "grad_norm": 0.26425543427467346, + "learning_rate": 0.00021207207207207205, + "loss": 0.4466, + "step": 1962 + }, + { + "epoch": 0.5884954094060334, + "grad_norm": 0.25192978978157043, + "learning_rate": 0.00021202702702702701, + "loss": 0.4524, + "step": 1963 + }, + { + "epoch": 0.5887952032977328, + "grad_norm": 0.26266035437583923, + "learning_rate": 0.00021198198198198195, + "loss": 0.4417, + "step": 1964 + }, + { + "epoch": 0.5890949971894323, + "grad_norm": 0.31183677911758423, + "learning_rate": 0.00021193693693693694, + "loss": 0.4473, + "step": 1965 + }, + { + "epoch": 0.5893947910811317, + "grad_norm": 0.24742548167705536, + "learning_rate": 0.00021189189189189188, + "loss": 0.4209, + "step": 1966 + }, + { + "epoch": 0.5896945849728312, + "grad_norm": 0.27282243967056274, + "learning_rate": 0.00021184684684684682, + "loss": 0.4316, + "step": 1967 + }, + { + "epoch": 0.5899943788645307, + "grad_norm": 0.2748444974422455, + "learning_rate": 0.0002118018018018018, + "loss": 0.4276, + "step": 1968 + }, + { + "epoch": 0.5902941727562301, + "grad_norm": 0.2746492624282837, + "learning_rate": 0.00021175675675675674, + "loss": 0.4597, + "step": 1969 + }, + { + "epoch": 0.5905939666479295, + "grad_norm": 0.25905507802963257, + "learning_rate": 0.00021171171171171168, + "loss": 0.4668, + "step": 1970 + }, + { + "epoch": 0.590893760539629, + "grad_norm": 0.2456134557723999, + "learning_rate": 0.00021166666666666667, + "loss": 0.4057, + "step": 1971 + }, + { + "epoch": 0.5911935544313285, + "grad_norm": 0.25221529603004456, + "learning_rate": 0.0002116216216216216, + "loss": 0.4427, + "step": 1972 + }, + { + "epoch": 0.5914933483230279, + "grad_norm": 0.24493472278118134, + "learning_rate": 0.00021157657657657654, + "loss": 0.4206, + "step": 1973 + }, + { + "epoch": 0.5917931422147273, + "grad_norm": 0.2561238408088684, + "learning_rate": 0.00021153153153153154, + "loss": 0.4537, + "step": 1974 + }, + { + "epoch": 0.5920929361064269, + "grad_norm": 0.2350313663482666, + "learning_rate": 0.00021148648648648647, + "loss": 0.4163, + "step": 1975 + }, + { + "epoch": 0.5923927299981263, + "grad_norm": 0.2535955309867859, + "learning_rate": 0.0002114414414414414, + "loss": 0.4435, + "step": 1976 + }, + { + "epoch": 0.5926925238898257, + "grad_norm": 0.2498706728219986, + "learning_rate": 0.00021139639639639637, + "loss": 0.4507, + "step": 1977 + }, + { + "epoch": 0.5929923177815252, + "grad_norm": 0.2510056495666504, + "learning_rate": 0.00021135135135135134, + "loss": 0.4268, + "step": 1978 + }, + { + "epoch": 0.5932921116732247, + "grad_norm": 0.252605676651001, + "learning_rate": 0.00021130630630630627, + "loss": 0.4457, + "step": 1979 + }, + { + "epoch": 0.5935919055649241, + "grad_norm": 0.2618269622325897, + "learning_rate": 0.00021126126126126124, + "loss": 0.4447, + "step": 1980 + }, + { + "epoch": 0.5938916994566236, + "grad_norm": 0.25301989912986755, + "learning_rate": 0.0002112162162162162, + "loss": 0.4468, + "step": 1981 + }, + { + "epoch": 0.594191493348323, + "grad_norm": 0.2434222400188446, + "learning_rate": 0.00021117117117117114, + "loss": 0.4348, + "step": 1982 + }, + { + "epoch": 0.5944912872400225, + "grad_norm": 0.24123191833496094, + "learning_rate": 0.0002111261261261261, + "loss": 0.4485, + "step": 1983 + }, + { + "epoch": 0.594791081131722, + "grad_norm": 0.2643167972564697, + "learning_rate": 0.00021108108108108107, + "loss": 0.4464, + "step": 1984 + }, + { + "epoch": 0.5950908750234214, + "grad_norm": 0.2503741979598999, + "learning_rate": 0.00021103603603603603, + "loss": 0.4247, + "step": 1985 + }, + { + "epoch": 0.5953906689151208, + "grad_norm": 0.23599058389663696, + "learning_rate": 0.00021099099099099097, + "loss": 0.4089, + "step": 1986 + }, + { + "epoch": 0.5956904628068204, + "grad_norm": 0.25657573342323303, + "learning_rate": 0.00021094594594594593, + "loss": 0.445, + "step": 1987 + }, + { + "epoch": 0.5959902566985198, + "grad_norm": 0.26530593633651733, + "learning_rate": 0.0002109009009009009, + "loss": 0.4386, + "step": 1988 + }, + { + "epoch": 0.5962900505902192, + "grad_norm": 0.2542692720890045, + "learning_rate": 0.00021085585585585583, + "loss": 0.4431, + "step": 1989 + }, + { + "epoch": 0.5965898444819187, + "grad_norm": 0.2595093548297882, + "learning_rate": 0.0002108108108108108, + "loss": 0.4243, + "step": 1990 + }, + { + "epoch": 0.5968896383736182, + "grad_norm": 0.2481933832168579, + "learning_rate": 0.00021076576576576576, + "loss": 0.4215, + "step": 1991 + }, + { + "epoch": 0.5971894322653176, + "grad_norm": 0.2607382535934448, + "learning_rate": 0.0002107207207207207, + "loss": 0.4505, + "step": 1992 + }, + { + "epoch": 0.597489226157017, + "grad_norm": 0.26596030592918396, + "learning_rate": 0.00021067567567567566, + "loss": 0.4703, + "step": 1993 + }, + { + "epoch": 0.5977890200487165, + "grad_norm": 0.24884359538555145, + "learning_rate": 0.00021063063063063062, + "loss": 0.4237, + "step": 1994 + }, + { + "epoch": 0.5980888139404159, + "grad_norm": 0.23510870337486267, + "learning_rate": 0.00021058558558558556, + "loss": 0.4235, + "step": 1995 + }, + { + "epoch": 0.5983886078321154, + "grad_norm": 0.25982344150543213, + "learning_rate": 0.00021054054054054052, + "loss": 0.4262, + "step": 1996 + }, + { + "epoch": 0.5986884017238149, + "grad_norm": 0.24918124079704285, + "learning_rate": 0.0002104954954954955, + "loss": 0.4225, + "step": 1997 + }, + { + "epoch": 0.5989881956155143, + "grad_norm": 0.26578548550605774, + "learning_rate": 0.00021045045045045042, + "loss": 0.4685, + "step": 1998 + }, + { + "epoch": 0.5992879895072137, + "grad_norm": 0.2682873010635376, + "learning_rate": 0.00021040540540540542, + "loss": 0.4291, + "step": 1999 + }, + { + "epoch": 0.5995877833989133, + "grad_norm": 0.247017040848732, + "learning_rate": 0.00021036036036036035, + "loss": 0.4457, + "step": 2000 + }, + { + "epoch": 0.5995877833989133, + "eval_loss": 0.4406769275665283, + "eval_runtime": 564.9558, + "eval_samples_per_second": 3.822, + "eval_steps_per_second": 0.478, + "step": 2000 + }, + { + "epoch": 0.5998875772906127, + "grad_norm": 0.25887760519981384, + "learning_rate": 0.0002103153153153153, + "loss": 0.4261, + "step": 2001 + }, + { + "epoch": 0.6001873711823121, + "grad_norm": 0.26813459396362305, + "learning_rate": 0.00021027027027027023, + "loss": 0.4091, + "step": 2002 + }, + { + "epoch": 0.6004871650740116, + "grad_norm": 0.2717922329902649, + "learning_rate": 0.00021022522522522522, + "loss": 0.4257, + "step": 2003 + }, + { + "epoch": 0.6007869589657111, + "grad_norm": 0.2423432320356369, + "learning_rate": 0.00021018018018018015, + "loss": 0.4187, + "step": 2004 + }, + { + "epoch": 0.6010867528574105, + "grad_norm": 0.2616721987724304, + "learning_rate": 0.0002101351351351351, + "loss": 0.4414, + "step": 2005 + }, + { + "epoch": 0.60138654674911, + "grad_norm": 0.2668519914150238, + "learning_rate": 0.00021009009009009008, + "loss": 0.4566, + "step": 2006 + }, + { + "epoch": 0.6016863406408094, + "grad_norm": 0.24378737807273865, + "learning_rate": 0.00021004504504504502, + "loss": 0.4379, + "step": 2007 + }, + { + "epoch": 0.6019861345325089, + "grad_norm": 0.24571847915649414, + "learning_rate": 0.00020999999999999998, + "loss": 0.4491, + "step": 2008 + }, + { + "epoch": 0.6022859284242084, + "grad_norm": 0.23369182646274567, + "learning_rate": 0.00020995495495495495, + "loss": 0.4023, + "step": 2009 + }, + { + "epoch": 0.6025857223159078, + "grad_norm": 0.27274230122566223, + "learning_rate": 0.00020990990990990988, + "loss": 0.4514, + "step": 2010 + }, + { + "epoch": 0.6028855162076072, + "grad_norm": 0.29650700092315674, + "learning_rate": 0.00020986486486486485, + "loss": 0.4553, + "step": 2011 + }, + { + "epoch": 0.6031853100993068, + "grad_norm": 0.2543351650238037, + "learning_rate": 0.0002098198198198198, + "loss": 0.4522, + "step": 2012 + }, + { + "epoch": 0.6034851039910062, + "grad_norm": 0.25208958983421326, + "learning_rate": 0.00020977477477477475, + "loss": 0.4342, + "step": 2013 + }, + { + "epoch": 0.6037848978827056, + "grad_norm": 0.24142181873321533, + "learning_rate": 0.0002097297297297297, + "loss": 0.4106, + "step": 2014 + }, + { + "epoch": 0.604084691774405, + "grad_norm": 0.2537544071674347, + "learning_rate": 0.00020968468468468467, + "loss": 0.4373, + "step": 2015 + }, + { + "epoch": 0.6043844856661046, + "grad_norm": 0.27682605385780334, + "learning_rate": 0.0002096396396396396, + "loss": 0.4491, + "step": 2016 + }, + { + "epoch": 0.604684279557804, + "grad_norm": 0.24604292213916779, + "learning_rate": 0.00020959459459459458, + "loss": 0.4059, + "step": 2017 + }, + { + "epoch": 0.6049840734495034, + "grad_norm": 0.24552318453788757, + "learning_rate": 0.00020954954954954954, + "loss": 0.4361, + "step": 2018 + }, + { + "epoch": 0.6052838673412029, + "grad_norm": 0.24892392754554749, + "learning_rate": 0.00020950450450450448, + "loss": 0.4579, + "step": 2019 + }, + { + "epoch": 0.6055836612329024, + "grad_norm": 0.26011762022972107, + "learning_rate": 0.00020945945945945944, + "loss": 0.4444, + "step": 2020 + }, + { + "epoch": 0.6058834551246018, + "grad_norm": 0.25486037135124207, + "learning_rate": 0.0002094144144144144, + "loss": 0.4256, + "step": 2021 + }, + { + "epoch": 0.6061832490163013, + "grad_norm": 0.24915416538715363, + "learning_rate": 0.00020936936936936937, + "loss": 0.4295, + "step": 2022 + }, + { + "epoch": 0.6064830429080007, + "grad_norm": 0.24448558688163757, + "learning_rate": 0.0002093243243243243, + "loss": 0.422, + "step": 2023 + }, + { + "epoch": 0.6067828367997002, + "grad_norm": 0.24826547503471375, + "learning_rate": 0.00020927927927927927, + "loss": 0.3927, + "step": 2024 + }, + { + "epoch": 0.6070826306913997, + "grad_norm": 0.24415460228919983, + "learning_rate": 0.00020923423423423423, + "loss": 0.4128, + "step": 2025 + }, + { + "epoch": 0.6073824245830991, + "grad_norm": 0.25376150012016296, + "learning_rate": 0.00020918918918918917, + "loss": 0.4363, + "step": 2026 + }, + { + "epoch": 0.6076822184747985, + "grad_norm": 0.2526264786720276, + "learning_rate": 0.0002091441441441441, + "loss": 0.4354, + "step": 2027 + }, + { + "epoch": 0.6079820123664981, + "grad_norm": 0.25586017966270447, + "learning_rate": 0.0002090990990990991, + "loss": 0.401, + "step": 2028 + }, + { + "epoch": 0.6082818062581975, + "grad_norm": 0.2642875909805298, + "learning_rate": 0.00020905405405405403, + "loss": 0.4476, + "step": 2029 + }, + { + "epoch": 0.6085816001498969, + "grad_norm": 0.2795594334602356, + "learning_rate": 0.00020900900900900897, + "loss": 0.4845, + "step": 2030 + }, + { + "epoch": 0.6088813940415964, + "grad_norm": 0.23246948421001434, + "learning_rate": 0.00020896396396396396, + "loss": 0.3906, + "step": 2031 + }, + { + "epoch": 0.6091811879332959, + "grad_norm": 0.25595536828041077, + "learning_rate": 0.0002089189189189189, + "loss": 0.4331, + "step": 2032 + }, + { + "epoch": 0.6094809818249953, + "grad_norm": 0.24884217977523804, + "learning_rate": 0.00020887387387387383, + "loss": 0.4361, + "step": 2033 + }, + { + "epoch": 0.6097807757166948, + "grad_norm": 0.25283509492874146, + "learning_rate": 0.00020882882882882883, + "loss": 0.422, + "step": 2034 + }, + { + "epoch": 0.6100805696083942, + "grad_norm": 0.25681042671203613, + "learning_rate": 0.00020878378378378376, + "loss": 0.4301, + "step": 2035 + }, + { + "epoch": 0.6103803635000937, + "grad_norm": 0.2695556879043579, + "learning_rate": 0.0002087387387387387, + "loss": 0.4624, + "step": 2036 + }, + { + "epoch": 0.6106801573917932, + "grad_norm": 0.28260117769241333, + "learning_rate": 0.0002086936936936937, + "loss": 0.4608, + "step": 2037 + }, + { + "epoch": 0.6109799512834926, + "grad_norm": 0.2578640878200531, + "learning_rate": 0.00020864864864864863, + "loss": 0.4638, + "step": 2038 + }, + { + "epoch": 0.611279745175192, + "grad_norm": 0.2544034421443939, + "learning_rate": 0.00020860360360360356, + "loss": 0.4222, + "step": 2039 + }, + { + "epoch": 0.6115795390668916, + "grad_norm": 0.2726188600063324, + "learning_rate": 0.00020855855855855855, + "loss": 0.4802, + "step": 2040 + }, + { + "epoch": 0.611879332958591, + "grad_norm": 0.26486557722091675, + "learning_rate": 0.0002085135135135135, + "loss": 0.4475, + "step": 2041 + }, + { + "epoch": 0.6121791268502904, + "grad_norm": 0.2517718970775604, + "learning_rate": 0.00020846846846846843, + "loss": 0.4445, + "step": 2042 + }, + { + "epoch": 0.6124789207419898, + "grad_norm": 0.2507137060165405, + "learning_rate": 0.00020842342342342342, + "loss": 0.4243, + "step": 2043 + }, + { + "epoch": 0.6127787146336894, + "grad_norm": 0.2707221806049347, + "learning_rate": 0.00020837837837837836, + "loss": 0.4706, + "step": 2044 + }, + { + "epoch": 0.6130785085253888, + "grad_norm": 0.24258100986480713, + "learning_rate": 0.00020833333333333332, + "loss": 0.423, + "step": 2045 + }, + { + "epoch": 0.6133783024170882, + "grad_norm": 0.25611022114753723, + "learning_rate": 0.00020828828828828828, + "loss": 0.4553, + "step": 2046 + }, + { + "epoch": 0.6136780963087877, + "grad_norm": 0.26037782430648804, + "learning_rate": 0.00020824324324324322, + "loss": 0.4173, + "step": 2047 + }, + { + "epoch": 0.6139778902004872, + "grad_norm": 0.26126980781555176, + "learning_rate": 0.00020819819819819818, + "loss": 0.4556, + "step": 2048 + }, + { + "epoch": 0.6142776840921866, + "grad_norm": 0.283407598733902, + "learning_rate": 0.00020815315315315315, + "loss": 0.4421, + "step": 2049 + }, + { + "epoch": 0.6145774779838861, + "grad_norm": 0.2533104717731476, + "learning_rate": 0.00020810810810810808, + "loss": 0.4253, + "step": 2050 + }, + { + "epoch": 0.6148772718755855, + "grad_norm": 0.24108561873435974, + "learning_rate": 0.00020806306306306305, + "loss": 0.438, + "step": 2051 + }, + { + "epoch": 0.615177065767285, + "grad_norm": 0.2516781687736511, + "learning_rate": 0.00020801801801801799, + "loss": 0.4245, + "step": 2052 + }, + { + "epoch": 0.6154768596589845, + "grad_norm": 0.25689882040023804, + "learning_rate": 0.00020797297297297295, + "loss": 0.4412, + "step": 2053 + }, + { + "epoch": 0.6157766535506839, + "grad_norm": 0.2661590576171875, + "learning_rate": 0.0002079279279279279, + "loss": 0.47, + "step": 2054 + }, + { + "epoch": 0.6160764474423833, + "grad_norm": 0.2580914795398712, + "learning_rate": 0.00020788288288288285, + "loss": 0.4537, + "step": 2055 + }, + { + "epoch": 0.6163762413340829, + "grad_norm": 0.23960340023040771, + "learning_rate": 0.00020783783783783784, + "loss": 0.4535, + "step": 2056 + }, + { + "epoch": 0.6166760352257823, + "grad_norm": 0.2658163905143738, + "learning_rate": 0.00020779279279279278, + "loss": 0.4483, + "step": 2057 + }, + { + "epoch": 0.6169758291174817, + "grad_norm": 0.2633172869682312, + "learning_rate": 0.00020774774774774771, + "loss": 0.4408, + "step": 2058 + }, + { + "epoch": 0.6172756230091812, + "grad_norm": 0.2618810832500458, + "learning_rate": 0.0002077027027027027, + "loss": 0.4497, + "step": 2059 + }, + { + "epoch": 0.6175754169008807, + "grad_norm": 0.2505929470062256, + "learning_rate": 0.00020765765765765764, + "loss": 0.4411, + "step": 2060 + }, + { + "epoch": 0.6178752107925801, + "grad_norm": 0.2556394338607788, + "learning_rate": 0.00020761261261261258, + "loss": 0.4427, + "step": 2061 + }, + { + "epoch": 0.6181750046842795, + "grad_norm": 0.2584443688392639, + "learning_rate": 0.00020756756756756757, + "loss": 0.4707, + "step": 2062 + }, + { + "epoch": 0.618474798575979, + "grad_norm": 0.2633087635040283, + "learning_rate": 0.0002075225225225225, + "loss": 0.4576, + "step": 2063 + }, + { + "epoch": 0.6187745924676785, + "grad_norm": 0.2487655133008957, + "learning_rate": 0.00020747747747747744, + "loss": 0.4374, + "step": 2064 + }, + { + "epoch": 0.619074386359378, + "grad_norm": 0.24193575978279114, + "learning_rate": 0.00020743243243243243, + "loss": 0.4267, + "step": 2065 + }, + { + "epoch": 0.6193741802510774, + "grad_norm": 0.2395676076412201, + "learning_rate": 0.00020738738738738737, + "loss": 0.4236, + "step": 2066 + }, + { + "epoch": 0.6196739741427768, + "grad_norm": 0.24150055646896362, + "learning_rate": 0.0002073423423423423, + "loss": 0.4138, + "step": 2067 + }, + { + "epoch": 0.6199737680344763, + "grad_norm": 0.2652229368686676, + "learning_rate": 0.0002072972972972973, + "loss": 0.4248, + "step": 2068 + }, + { + "epoch": 0.6202735619261758, + "grad_norm": 0.24750439822673798, + "learning_rate": 0.00020725225225225224, + "loss": 0.4273, + "step": 2069 + }, + { + "epoch": 0.6205733558178752, + "grad_norm": 0.25777891278266907, + "learning_rate": 0.00020720720720720717, + "loss": 0.4471, + "step": 2070 + }, + { + "epoch": 0.6208731497095746, + "grad_norm": 0.25587761402130127, + "learning_rate": 0.00020716216216216216, + "loss": 0.4246, + "step": 2071 + }, + { + "epoch": 0.6211729436012742, + "grad_norm": 0.2518714368343353, + "learning_rate": 0.0002071171171171171, + "loss": 0.4645, + "step": 2072 + }, + { + "epoch": 0.6214727374929736, + "grad_norm": 0.24348096549510956, + "learning_rate": 0.00020707207207207204, + "loss": 0.4454, + "step": 2073 + }, + { + "epoch": 0.621772531384673, + "grad_norm": 0.2423403263092041, + "learning_rate": 0.000207027027027027, + "loss": 0.405, + "step": 2074 + }, + { + "epoch": 0.6220723252763725, + "grad_norm": 0.26894888281822205, + "learning_rate": 0.00020698198198198196, + "loss": 0.4699, + "step": 2075 + }, + { + "epoch": 0.622372119168072, + "grad_norm": 0.2578108608722687, + "learning_rate": 0.0002069369369369369, + "loss": 0.4365, + "step": 2076 + }, + { + "epoch": 0.6226719130597714, + "grad_norm": 0.27144232392311096, + "learning_rate": 0.00020689189189189187, + "loss": 0.4291, + "step": 2077 + }, + { + "epoch": 0.6229717069514709, + "grad_norm": 0.25405609607696533, + "learning_rate": 0.00020684684684684683, + "loss": 0.4389, + "step": 2078 + }, + { + "epoch": 0.6232715008431703, + "grad_norm": 0.24775098264217377, + "learning_rate": 0.0002068018018018018, + "loss": 0.416, + "step": 2079 + }, + { + "epoch": 0.6235712947348698, + "grad_norm": 0.25767782330513, + "learning_rate": 0.00020675675675675673, + "loss": 0.455, + "step": 2080 + }, + { + "epoch": 0.6238710886265693, + "grad_norm": 0.2590571641921997, + "learning_rate": 0.0002067117117117117, + "loss": 0.4436, + "step": 2081 + }, + { + "epoch": 0.6241708825182687, + "grad_norm": 0.24577729403972626, + "learning_rate": 0.00020666666666666666, + "loss": 0.4389, + "step": 2082 + }, + { + "epoch": 0.6244706764099681, + "grad_norm": 0.25743165612220764, + "learning_rate": 0.0002066216216216216, + "loss": 0.4467, + "step": 2083 + }, + { + "epoch": 0.6247704703016677, + "grad_norm": 0.24127769470214844, + "learning_rate": 0.00020657657657657656, + "loss": 0.4056, + "step": 2084 + }, + { + "epoch": 0.6250702641933671, + "grad_norm": 0.25930923223495483, + "learning_rate": 0.00020653153153153152, + "loss": 0.4352, + "step": 2085 + }, + { + "epoch": 0.6253700580850665, + "grad_norm": 0.23111720383167267, + "learning_rate": 0.00020648648648648646, + "loss": 0.4035, + "step": 2086 + }, + { + "epoch": 0.625669851976766, + "grad_norm": 0.25705838203430176, + "learning_rate": 0.00020644144144144142, + "loss": 0.4402, + "step": 2087 + }, + { + "epoch": 0.6259696458684655, + "grad_norm": 0.260232150554657, + "learning_rate": 0.0002063963963963964, + "loss": 0.444, + "step": 2088 + }, + { + "epoch": 0.6262694397601649, + "grad_norm": 0.24525994062423706, + "learning_rate": 0.00020635135135135132, + "loss": 0.4461, + "step": 2089 + }, + { + "epoch": 0.6265692336518643, + "grad_norm": 0.2550069987773895, + "learning_rate": 0.00020630630630630631, + "loss": 0.4391, + "step": 2090 + }, + { + "epoch": 0.6268690275435638, + "grad_norm": 0.24561873078346252, + "learning_rate": 0.00020626126126126125, + "loss": 0.424, + "step": 2091 + }, + { + "epoch": 0.6271688214352632, + "grad_norm": 0.2500348687171936, + "learning_rate": 0.0002062162162162162, + "loss": 0.4415, + "step": 2092 + }, + { + "epoch": 0.6274686153269627, + "grad_norm": 0.23684681951999664, + "learning_rate": 0.00020617117117117118, + "loss": 0.4282, + "step": 2093 + }, + { + "epoch": 0.6277684092186622, + "grad_norm": 0.2459501475095749, + "learning_rate": 0.00020612612612612612, + "loss": 0.4229, + "step": 2094 + }, + { + "epoch": 0.6280682031103616, + "grad_norm": 0.23220065236091614, + "learning_rate": 0.00020608108108108105, + "loss": 0.4083, + "step": 2095 + }, + { + "epoch": 0.628367997002061, + "grad_norm": 0.26996707916259766, + "learning_rate": 0.00020603603603603604, + "loss": 0.4581, + "step": 2096 + }, + { + "epoch": 0.6286677908937606, + "grad_norm": 0.2551998496055603, + "learning_rate": 0.00020599099099099098, + "loss": 0.4444, + "step": 2097 + }, + { + "epoch": 0.62896758478546, + "grad_norm": 0.2604631781578064, + "learning_rate": 0.00020594594594594592, + "loss": 0.4367, + "step": 2098 + }, + { + "epoch": 0.6292673786771594, + "grad_norm": 0.25958266854286194, + "learning_rate": 0.00020590090090090085, + "loss": 0.4618, + "step": 2099 + }, + { + "epoch": 0.6295671725688589, + "grad_norm": 0.2714870274066925, + "learning_rate": 0.00020585585585585584, + "loss": 0.4382, + "step": 2100 + }, + { + "epoch": 0.6298669664605584, + "grad_norm": 0.2739773392677307, + "learning_rate": 0.00020581081081081078, + "loss": 0.4752, + "step": 2101 + }, + { + "epoch": 0.6301667603522578, + "grad_norm": 0.24829277396202087, + "learning_rate": 0.00020576576576576575, + "loss": 0.4285, + "step": 2102 + }, + { + "epoch": 0.6304665542439573, + "grad_norm": 0.2551855742931366, + "learning_rate": 0.0002057207207207207, + "loss": 0.4259, + "step": 2103 + }, + { + "epoch": 0.6307663481356567, + "grad_norm": 0.243735671043396, + "learning_rate": 0.00020567567567567565, + "loss": 0.4201, + "step": 2104 + }, + { + "epoch": 0.6310661420273562, + "grad_norm": 0.2511364817619324, + "learning_rate": 0.0002056306306306306, + "loss": 0.4351, + "step": 2105 + }, + { + "epoch": 0.6313659359190557, + "grad_norm": 0.2456447184085846, + "learning_rate": 0.00020558558558558557, + "loss": 0.448, + "step": 2106 + }, + { + "epoch": 0.6316657298107551, + "grad_norm": 0.26450565457344055, + "learning_rate": 0.0002055405405405405, + "loss": 0.4576, + "step": 2107 + }, + { + "epoch": 0.6319655237024545, + "grad_norm": 0.25267186760902405, + "learning_rate": 0.00020549549549549547, + "loss": 0.4511, + "step": 2108 + }, + { + "epoch": 0.632265317594154, + "grad_norm": 0.2436206340789795, + "learning_rate": 0.00020545045045045044, + "loss": 0.4147, + "step": 2109 + }, + { + "epoch": 0.6325651114858535, + "grad_norm": 0.27077367901802063, + "learning_rate": 0.00020540540540540537, + "loss": 0.4687, + "step": 2110 + }, + { + "epoch": 0.6328649053775529, + "grad_norm": 0.25476735830307007, + "learning_rate": 0.00020536036036036034, + "loss": 0.4357, + "step": 2111 + }, + { + "epoch": 0.6331646992692523, + "grad_norm": 0.23889677226543427, + "learning_rate": 0.0002053153153153153, + "loss": 0.4109, + "step": 2112 + }, + { + "epoch": 0.6334644931609519, + "grad_norm": 0.2620011270046234, + "learning_rate": 0.00020527027027027027, + "loss": 0.4418, + "step": 2113 + }, + { + "epoch": 0.6337642870526513, + "grad_norm": 0.24259789288043976, + "learning_rate": 0.0002052252252252252, + "loss": 0.4158, + "step": 2114 + }, + { + "epoch": 0.6340640809443507, + "grad_norm": 0.26212331652641296, + "learning_rate": 0.00020518018018018017, + "loss": 0.4407, + "step": 2115 + }, + { + "epoch": 0.6343638748360502, + "grad_norm": 0.2421627789735794, + "learning_rate": 0.00020513513513513513, + "loss": 0.4162, + "step": 2116 + }, + { + "epoch": 0.6346636687277497, + "grad_norm": 0.25949686765670776, + "learning_rate": 0.00020509009009009007, + "loss": 0.4436, + "step": 2117 + }, + { + "epoch": 0.6349634626194491, + "grad_norm": 0.26797404885292053, + "learning_rate": 0.00020504504504504503, + "loss": 0.4684, + "step": 2118 + }, + { + "epoch": 0.6352632565111486, + "grad_norm": 0.2433563470840454, + "learning_rate": 0.000205, + "loss": 0.4312, + "step": 2119 + }, + { + "epoch": 0.635563050402848, + "grad_norm": 0.25377362966537476, + "learning_rate": 0.00020495495495495493, + "loss": 0.4583, + "step": 2120 + }, + { + "epoch": 0.6358628442945475, + "grad_norm": 0.2583523392677307, + "learning_rate": 0.0002049099099099099, + "loss": 0.4587, + "step": 2121 + }, + { + "epoch": 0.636162638186247, + "grad_norm": 0.2512962520122528, + "learning_rate": 0.00020486486486486486, + "loss": 0.4148, + "step": 2122 + }, + { + "epoch": 0.6364624320779464, + "grad_norm": 0.24238212406635284, + "learning_rate": 0.0002048198198198198, + "loss": 0.4057, + "step": 2123 + }, + { + "epoch": 0.6367622259696458, + "grad_norm": 0.25228351354599, + "learning_rate": 0.00020477477477477473, + "loss": 0.4291, + "step": 2124 + }, + { + "epoch": 0.6370620198613454, + "grad_norm": 0.26437970995903015, + "learning_rate": 0.00020472972972972972, + "loss": 0.4425, + "step": 2125 + }, + { + "epoch": 0.6373618137530448, + "grad_norm": 0.2502360939979553, + "learning_rate": 0.00020468468468468466, + "loss": 0.4102, + "step": 2126 + }, + { + "epoch": 0.6376616076447442, + "grad_norm": 0.24895761907100677, + "learning_rate": 0.0002046396396396396, + "loss": 0.4423, + "step": 2127 + }, + { + "epoch": 0.6379614015364437, + "grad_norm": 0.2674795985221863, + "learning_rate": 0.0002045945945945946, + "loss": 0.4572, + "step": 2128 + }, + { + "epoch": 0.6382611954281432, + "grad_norm": 0.2468012422323227, + "learning_rate": 0.00020454954954954953, + "loss": 0.4525, + "step": 2129 + }, + { + "epoch": 0.6385609893198426, + "grad_norm": 0.2560267448425293, + "learning_rate": 0.00020450450450450446, + "loss": 0.4553, + "step": 2130 + }, + { + "epoch": 0.638860783211542, + "grad_norm": 0.25216519832611084, + "learning_rate": 0.00020445945945945945, + "loss": 0.4319, + "step": 2131 + }, + { + "epoch": 0.6391605771032415, + "grad_norm": 0.22927191853523254, + "learning_rate": 0.0002044144144144144, + "loss": 0.4088, + "step": 2132 + }, + { + "epoch": 0.639460370994941, + "grad_norm": 0.2495734840631485, + "learning_rate": 0.00020436936936936933, + "loss": 0.4301, + "step": 2133 + }, + { + "epoch": 0.6397601648866404, + "grad_norm": 0.2604648470878601, + "learning_rate": 0.00020432432432432432, + "loss": 0.4115, + "step": 2134 + }, + { + "epoch": 0.6400599587783399, + "grad_norm": 0.27185025811195374, + "learning_rate": 0.00020427927927927925, + "loss": 0.4452, + "step": 2135 + }, + { + "epoch": 0.6403597526700393, + "grad_norm": 0.26344043016433716, + "learning_rate": 0.00020423423423423422, + "loss": 0.4178, + "step": 2136 + }, + { + "epoch": 0.6406595465617388, + "grad_norm": 0.26041179895401, + "learning_rate": 0.00020418918918918918, + "loss": 0.4384, + "step": 2137 + }, + { + "epoch": 0.6409593404534383, + "grad_norm": 0.25143757462501526, + "learning_rate": 0.00020414414414414412, + "loss": 0.4486, + "step": 2138 + }, + { + "epoch": 0.6412591343451377, + "grad_norm": 0.2869662642478943, + "learning_rate": 0.00020409909909909908, + "loss": 0.487, + "step": 2139 + }, + { + "epoch": 0.6415589282368371, + "grad_norm": 0.2788044214248657, + "learning_rate": 0.00020405405405405405, + "loss": 0.4255, + "step": 2140 + }, + { + "epoch": 0.6418587221285367, + "grad_norm": 0.27385109663009644, + "learning_rate": 0.00020400900900900898, + "loss": 0.4535, + "step": 2141 + }, + { + "epoch": 0.6421585160202361, + "grad_norm": 0.26231497526168823, + "learning_rate": 0.00020396396396396395, + "loss": 0.4435, + "step": 2142 + }, + { + "epoch": 0.6424583099119355, + "grad_norm": 0.26582983136177063, + "learning_rate": 0.0002039189189189189, + "loss": 0.4394, + "step": 2143 + }, + { + "epoch": 0.642758103803635, + "grad_norm": 0.2665957510471344, + "learning_rate": 0.00020387387387387385, + "loss": 0.4401, + "step": 2144 + }, + { + "epoch": 0.6430578976953345, + "grad_norm": 0.2581312358379364, + "learning_rate": 0.0002038288288288288, + "loss": 0.4302, + "step": 2145 + }, + { + "epoch": 0.6433576915870339, + "grad_norm": 0.25063759088516235, + "learning_rate": 0.00020378378378378375, + "loss": 0.4558, + "step": 2146 + }, + { + "epoch": 0.6436574854787334, + "grad_norm": 0.2655949890613556, + "learning_rate": 0.00020373873873873874, + "loss": 0.4559, + "step": 2147 + }, + { + "epoch": 0.6439572793704328, + "grad_norm": 0.25797712802886963, + "learning_rate": 0.00020369369369369368, + "loss": 0.4127, + "step": 2148 + }, + { + "epoch": 0.6442570732621323, + "grad_norm": 0.2774755358695984, + "learning_rate": 0.00020364864864864861, + "loss": 0.4807, + "step": 2149 + }, + { + "epoch": 0.6445568671538318, + "grad_norm": 0.27283889055252075, + "learning_rate": 0.0002036036036036036, + "loss": 0.4483, + "step": 2150 + }, + { + "epoch": 0.6448566610455312, + "grad_norm": 0.2647114396095276, + "learning_rate": 0.00020355855855855854, + "loss": 0.4629, + "step": 2151 + }, + { + "epoch": 0.6451564549372306, + "grad_norm": 0.2683508098125458, + "learning_rate": 0.00020351351351351348, + "loss": 0.4755, + "step": 2152 + }, + { + "epoch": 0.6454562488289302, + "grad_norm": 0.26375094056129456, + "learning_rate": 0.00020346846846846847, + "loss": 0.4354, + "step": 2153 + }, + { + "epoch": 0.6457560427206296, + "grad_norm": 0.2688734233379364, + "learning_rate": 0.0002034234234234234, + "loss": 0.4515, + "step": 2154 + }, + { + "epoch": 0.646055836612329, + "grad_norm": 0.2545178532600403, + "learning_rate": 0.00020337837837837834, + "loss": 0.4281, + "step": 2155 + }, + { + "epoch": 0.6463556305040284, + "grad_norm": 0.24753254652023315, + "learning_rate": 0.00020333333333333333, + "loss": 0.4332, + "step": 2156 + }, + { + "epoch": 0.646655424395728, + "grad_norm": 0.2619148790836334, + "learning_rate": 0.00020328828828828827, + "loss": 0.4531, + "step": 2157 + }, + { + "epoch": 0.6469552182874274, + "grad_norm": 0.2698518633842468, + "learning_rate": 0.0002032432432432432, + "loss": 0.4807, + "step": 2158 + }, + { + "epoch": 0.6472550121791268, + "grad_norm": 0.2625516355037689, + "learning_rate": 0.0002031981981981982, + "loss": 0.4368, + "step": 2159 + }, + { + "epoch": 0.6475548060708263, + "grad_norm": 0.25750574469566345, + "learning_rate": 0.00020315315315315313, + "loss": 0.4489, + "step": 2160 + }, + { + "epoch": 0.6478545999625258, + "grad_norm": 0.2963887155056, + "learning_rate": 0.00020310810810810807, + "loss": 0.4567, + "step": 2161 + }, + { + "epoch": 0.6481543938542252, + "grad_norm": 0.28631022572517395, + "learning_rate": 0.00020306306306306306, + "loss": 0.457, + "step": 2162 + }, + { + "epoch": 0.6484541877459247, + "grad_norm": 0.2652076184749603, + "learning_rate": 0.000203018018018018, + "loss": 0.4008, + "step": 2163 + }, + { + "epoch": 0.6487539816376241, + "grad_norm": 0.25599247217178345, + "learning_rate": 0.00020297297297297294, + "loss": 0.4236, + "step": 2164 + }, + { + "epoch": 0.6490537755293236, + "grad_norm": 0.2655317783355713, + "learning_rate": 0.00020292792792792793, + "loss": 0.4401, + "step": 2165 + }, + { + "epoch": 0.6493535694210231, + "grad_norm": 0.27608954906463623, + "learning_rate": 0.00020288288288288286, + "loss": 0.4469, + "step": 2166 + }, + { + "epoch": 0.6496533633127225, + "grad_norm": 0.2523987293243408, + "learning_rate": 0.0002028378378378378, + "loss": 0.4134, + "step": 2167 + }, + { + "epoch": 0.6499531572044219, + "grad_norm": 0.26536789536476135, + "learning_rate": 0.0002027927927927928, + "loss": 0.4416, + "step": 2168 + }, + { + "epoch": 0.6502529510961215, + "grad_norm": 0.24977469444274902, + "learning_rate": 0.00020274774774774773, + "loss": 0.4318, + "step": 2169 + }, + { + "epoch": 0.6505527449878209, + "grad_norm": 0.27510321140289307, + "learning_rate": 0.0002027027027027027, + "loss": 0.4535, + "step": 2170 + }, + { + "epoch": 0.6508525388795203, + "grad_norm": 0.24680355191230774, + "learning_rate": 0.00020265765765765763, + "loss": 0.4247, + "step": 2171 + }, + { + "epoch": 0.6511523327712198, + "grad_norm": 0.24580539762973785, + "learning_rate": 0.0002026126126126126, + "loss": 0.4356, + "step": 2172 + }, + { + "epoch": 0.6514521266629193, + "grad_norm": 0.2560003101825714, + "learning_rate": 0.00020256756756756756, + "loss": 0.4019, + "step": 2173 + }, + { + "epoch": 0.6517519205546187, + "grad_norm": 0.2403692603111267, + "learning_rate": 0.0002025225225225225, + "loss": 0.4088, + "step": 2174 + }, + { + "epoch": 0.6520517144463182, + "grad_norm": 0.25952261686325073, + "learning_rate": 0.00020247747747747746, + "loss": 0.4585, + "step": 2175 + }, + { + "epoch": 0.6523515083380176, + "grad_norm": 0.2858710289001465, + "learning_rate": 0.00020243243243243242, + "loss": 0.4616, + "step": 2176 + }, + { + "epoch": 0.6526513022297171, + "grad_norm": 0.2761286795139313, + "learning_rate": 0.00020238738738738736, + "loss": 0.4694, + "step": 2177 + }, + { + "epoch": 0.6529510961214166, + "grad_norm": 0.2620023190975189, + "learning_rate": 0.00020234234234234232, + "loss": 0.4441, + "step": 2178 + }, + { + "epoch": 0.653250890013116, + "grad_norm": 0.2743069529533386, + "learning_rate": 0.00020229729729729729, + "loss": 0.4433, + "step": 2179 + }, + { + "epoch": 0.6535506839048154, + "grad_norm": 0.25842079520225525, + "learning_rate": 0.00020225225225225222, + "loss": 0.475, + "step": 2180 + }, + { + "epoch": 0.653850477796515, + "grad_norm": 0.267971932888031, + "learning_rate": 0.0002022072072072072, + "loss": 0.4384, + "step": 2181 + }, + { + "epoch": 0.6541502716882144, + "grad_norm": 0.26229405403137207, + "learning_rate": 0.00020216216216216215, + "loss": 0.4525, + "step": 2182 + }, + { + "epoch": 0.6544500655799138, + "grad_norm": 0.26667454838752747, + "learning_rate": 0.0002021171171171171, + "loss": 0.4384, + "step": 2183 + }, + { + "epoch": 0.6547498594716132, + "grad_norm": 0.2867937982082367, + "learning_rate": 0.00020207207207207208, + "loss": 0.4575, + "step": 2184 + }, + { + "epoch": 0.6550496533633127, + "grad_norm": 0.2772350013256073, + "learning_rate": 0.00020202702702702701, + "loss": 0.4515, + "step": 2185 + }, + { + "epoch": 0.6553494472550122, + "grad_norm": 0.2599480450153351, + "learning_rate": 0.00020198198198198195, + "loss": 0.4614, + "step": 2186 + }, + { + "epoch": 0.6556492411467116, + "grad_norm": 0.25755733251571655, + "learning_rate": 0.00020193693693693694, + "loss": 0.4543, + "step": 2187 + }, + { + "epoch": 0.6559490350384111, + "grad_norm": 0.2630176246166229, + "learning_rate": 0.00020189189189189188, + "loss": 0.4317, + "step": 2188 + }, + { + "epoch": 0.6562488289301105, + "grad_norm": 0.25234025716781616, + "learning_rate": 0.00020184684684684682, + "loss": 0.4187, + "step": 2189 + }, + { + "epoch": 0.65654862282181, + "grad_norm": 0.2525855302810669, + "learning_rate": 0.0002018018018018018, + "loss": 0.4364, + "step": 2190 + }, + { + "epoch": 0.6568484167135095, + "grad_norm": 0.2502008378505707, + "learning_rate": 0.00020175675675675674, + "loss": 0.4162, + "step": 2191 + }, + { + "epoch": 0.6571482106052089, + "grad_norm": 0.24894630908966064, + "learning_rate": 0.00020171171171171168, + "loss": 0.4287, + "step": 2192 + }, + { + "epoch": 0.6574480044969083, + "grad_norm": 0.24618011713027954, + "learning_rate": 0.00020166666666666667, + "loss": 0.4225, + "step": 2193 + }, + { + "epoch": 0.6577477983886079, + "grad_norm": 0.3233836591243744, + "learning_rate": 0.0002016216216216216, + "loss": 0.5051, + "step": 2194 + }, + { + "epoch": 0.6580475922803073, + "grad_norm": 0.2600880265235901, + "learning_rate": 0.00020157657657657655, + "loss": 0.4437, + "step": 2195 + }, + { + "epoch": 0.6583473861720067, + "grad_norm": 0.2576145529747009, + "learning_rate": 0.0002015315315315315, + "loss": 0.4035, + "step": 2196 + }, + { + "epoch": 0.6586471800637062, + "grad_norm": 0.2555258572101593, + "learning_rate": 0.00020148648648648647, + "loss": 0.409, + "step": 2197 + }, + { + "epoch": 0.6589469739554057, + "grad_norm": 0.2691539525985718, + "learning_rate": 0.0002014414414414414, + "loss": 0.4281, + "step": 2198 + }, + { + "epoch": 0.6592467678471051, + "grad_norm": 0.2896745502948761, + "learning_rate": 0.00020139639639639637, + "loss": 0.4529, + "step": 2199 + }, + { + "epoch": 0.6595465617388045, + "grad_norm": 0.2579841911792755, + "learning_rate": 0.00020135135135135134, + "loss": 0.4479, + "step": 2200 + }, + { + "epoch": 0.659846355630504, + "grad_norm": 0.24643990397453308, + "learning_rate": 0.00020130630630630627, + "loss": 0.4205, + "step": 2201 + }, + { + "epoch": 0.6601461495222035, + "grad_norm": 0.2523176968097687, + "learning_rate": 0.00020126126126126124, + "loss": 0.428, + "step": 2202 + }, + { + "epoch": 0.660445943413903, + "grad_norm": 0.27141350507736206, + "learning_rate": 0.0002012162162162162, + "loss": 0.4345, + "step": 2203 + }, + { + "epoch": 0.6607457373056024, + "grad_norm": 0.2594592571258545, + "learning_rate": 0.00020117117117117117, + "loss": 0.428, + "step": 2204 + }, + { + "epoch": 0.6610455311973018, + "grad_norm": 0.2673409581184387, + "learning_rate": 0.0002011261261261261, + "loss": 0.4229, + "step": 2205 + }, + { + "epoch": 0.6613453250890013, + "grad_norm": 0.26660290360450745, + "learning_rate": 0.00020108108108108107, + "loss": 0.4635, + "step": 2206 + }, + { + "epoch": 0.6616451189807008, + "grad_norm": 0.2599898874759674, + "learning_rate": 0.00020103603603603603, + "loss": 0.4398, + "step": 2207 + }, + { + "epoch": 0.6619449128724002, + "grad_norm": 0.2786708474159241, + "learning_rate": 0.00020099099099099097, + "loss": 0.4221, + "step": 2208 + }, + { + "epoch": 0.6622447067640996, + "grad_norm": 0.2445315271615982, + "learning_rate": 0.00020094594594594593, + "loss": 0.4143, + "step": 2209 + }, + { + "epoch": 0.6625445006557992, + "grad_norm": 0.2615225315093994, + "learning_rate": 0.0002009009009009009, + "loss": 0.4388, + "step": 2210 + }, + { + "epoch": 0.6628442945474986, + "grad_norm": 0.25724494457244873, + "learning_rate": 0.00020085585585585583, + "loss": 0.4359, + "step": 2211 + }, + { + "epoch": 0.663144088439198, + "grad_norm": 0.2562429904937744, + "learning_rate": 0.0002008108108108108, + "loss": 0.4213, + "step": 2212 + }, + { + "epoch": 0.6634438823308975, + "grad_norm": 0.25428321957588196, + "learning_rate": 0.00020076576576576576, + "loss": 0.4398, + "step": 2213 + }, + { + "epoch": 0.663743676222597, + "grad_norm": 0.2503153383731842, + "learning_rate": 0.0002007207207207207, + "loss": 0.4266, + "step": 2214 + }, + { + "epoch": 0.6640434701142964, + "grad_norm": 0.26181626319885254, + "learning_rate": 0.00020067567567567566, + "loss": 0.4398, + "step": 2215 + }, + { + "epoch": 0.6643432640059959, + "grad_norm": 0.26600703597068787, + "learning_rate": 0.00020063063063063062, + "loss": 0.4418, + "step": 2216 + }, + { + "epoch": 0.6646430578976953, + "grad_norm": 0.27476897835731506, + "learning_rate": 0.00020058558558558556, + "loss": 0.4405, + "step": 2217 + }, + { + "epoch": 0.6649428517893948, + "grad_norm": 0.23994912207126617, + "learning_rate": 0.0002005405405405405, + "loss": 0.4171, + "step": 2218 + }, + { + "epoch": 0.6652426456810943, + "grad_norm": 0.2647401988506317, + "learning_rate": 0.0002004954954954955, + "loss": 0.4514, + "step": 2219 + }, + { + "epoch": 0.6655424395727937, + "grad_norm": 0.27040109038352966, + "learning_rate": 0.00020045045045045043, + "loss": 0.4665, + "step": 2220 + }, + { + "epoch": 0.6658422334644931, + "grad_norm": 0.252128005027771, + "learning_rate": 0.00020040540540540536, + "loss": 0.4316, + "step": 2221 + }, + { + "epoch": 0.6661420273561927, + "grad_norm": 0.2605067193508148, + "learning_rate": 0.00020036036036036035, + "loss": 0.4242, + "step": 2222 + }, + { + "epoch": 0.6664418212478921, + "grad_norm": 0.26456427574157715, + "learning_rate": 0.0002003153153153153, + "loss": 0.4379, + "step": 2223 + }, + { + "epoch": 0.6667416151395915, + "grad_norm": 0.24853096902370453, + "learning_rate": 0.00020027027027027023, + "loss": 0.4339, + "step": 2224 + }, + { + "epoch": 0.6670414090312909, + "grad_norm": 0.2573966979980469, + "learning_rate": 0.00020022522522522522, + "loss": 0.4427, + "step": 2225 + }, + { + "epoch": 0.6673412029229905, + "grad_norm": 0.2402806282043457, + "learning_rate": 0.00020018018018018015, + "loss": 0.4195, + "step": 2226 + }, + { + "epoch": 0.6676409968146899, + "grad_norm": 0.2738892138004303, + "learning_rate": 0.00020013513513513512, + "loss": 0.4374, + "step": 2227 + }, + { + "epoch": 0.6679407907063893, + "grad_norm": 0.2548046410083771, + "learning_rate": 0.00020009009009009008, + "loss": 0.4312, + "step": 2228 + }, + { + "epoch": 0.6682405845980888, + "grad_norm": 0.24718345701694489, + "learning_rate": 0.00020004504504504502, + "loss": 0.4533, + "step": 2229 + }, + { + "epoch": 0.6685403784897883, + "grad_norm": 0.2544947862625122, + "learning_rate": 0.00019999999999999998, + "loss": 0.4255, + "step": 2230 + }, + { + "epoch": 0.6688401723814877, + "grad_norm": 0.2689761817455292, + "learning_rate": 0.00019995495495495495, + "loss": 0.451, + "step": 2231 + }, + { + "epoch": 0.6691399662731872, + "grad_norm": 0.2824248969554901, + "learning_rate": 0.00019990990990990988, + "loss": 0.4374, + "step": 2232 + }, + { + "epoch": 0.6694397601648866, + "grad_norm": 0.25299206376075745, + "learning_rate": 0.00019986486486486485, + "loss": 0.4301, + "step": 2233 + }, + { + "epoch": 0.6697395540565861, + "grad_norm": 0.26550740003585815, + "learning_rate": 0.0001998198198198198, + "loss": 0.4332, + "step": 2234 + }, + { + "epoch": 0.6700393479482856, + "grad_norm": 0.2446885108947754, + "learning_rate": 0.00019977477477477475, + "loss": 0.4207, + "step": 2235 + }, + { + "epoch": 0.670339141839985, + "grad_norm": 0.23779337108135223, + "learning_rate": 0.0001997297297297297, + "loss": 0.432, + "step": 2236 + }, + { + "epoch": 0.6706389357316844, + "grad_norm": 0.2619169354438782, + "learning_rate": 0.00019968468468468468, + "loss": 0.4332, + "step": 2237 + }, + { + "epoch": 0.670938729623384, + "grad_norm": 0.258789598941803, + "learning_rate": 0.00019963963963963964, + "loss": 0.4328, + "step": 2238 + }, + { + "epoch": 0.6712385235150834, + "grad_norm": 0.27282267808914185, + "learning_rate": 0.00019959459459459458, + "loss": 0.4423, + "step": 2239 + }, + { + "epoch": 0.6715383174067828, + "grad_norm": 0.2593368887901306, + "learning_rate": 0.00019954954954954954, + "loss": 0.4473, + "step": 2240 + }, + { + "epoch": 0.6718381112984823, + "grad_norm": 0.25508931279182434, + "learning_rate": 0.0001995045045045045, + "loss": 0.4371, + "step": 2241 + }, + { + "epoch": 0.6721379051901818, + "grad_norm": 0.25891220569610596, + "learning_rate": 0.00019945945945945944, + "loss": 0.4143, + "step": 2242 + }, + { + "epoch": 0.6724376990818812, + "grad_norm": 0.2661759853363037, + "learning_rate": 0.00019941441441441438, + "loss": 0.4585, + "step": 2243 + }, + { + "epoch": 0.6727374929735807, + "grad_norm": 0.25630587339401245, + "learning_rate": 0.00019936936936936937, + "loss": 0.4442, + "step": 2244 + }, + { + "epoch": 0.6730372868652801, + "grad_norm": 0.27050697803497314, + "learning_rate": 0.0001993243243243243, + "loss": 0.4437, + "step": 2245 + }, + { + "epoch": 0.6733370807569796, + "grad_norm": 0.2564866244792938, + "learning_rate": 0.00019927927927927924, + "loss": 0.4352, + "step": 2246 + }, + { + "epoch": 0.673636874648679, + "grad_norm": 0.2438693344593048, + "learning_rate": 0.00019923423423423423, + "loss": 0.4138, + "step": 2247 + }, + { + "epoch": 0.6739366685403785, + "grad_norm": 0.26038050651550293, + "learning_rate": 0.00019918918918918917, + "loss": 0.4309, + "step": 2248 + }, + { + "epoch": 0.6742364624320779, + "grad_norm": 0.2429644614458084, + "learning_rate": 0.0001991441441441441, + "loss": 0.4036, + "step": 2249 + }, + { + "epoch": 0.6745362563237775, + "grad_norm": 0.24670763313770294, + "learning_rate": 0.0001990990990990991, + "loss": 0.4054, + "step": 2250 + }, + { + "epoch": 0.6748360502154769, + "grad_norm": 0.2641808092594147, + "learning_rate": 0.00019905405405405403, + "loss": 0.4431, + "step": 2251 + }, + { + "epoch": 0.6751358441071763, + "grad_norm": 0.2601335644721985, + "learning_rate": 0.00019900900900900897, + "loss": 0.403, + "step": 2252 + }, + { + "epoch": 0.6754356379988757, + "grad_norm": 0.2698741555213928, + "learning_rate": 0.00019896396396396396, + "loss": 0.4328, + "step": 2253 + }, + { + "epoch": 0.6757354318905753, + "grad_norm": 0.26246562600135803, + "learning_rate": 0.0001989189189189189, + "loss": 0.4264, + "step": 2254 + }, + { + "epoch": 0.6760352257822747, + "grad_norm": 0.26453712582588196, + "learning_rate": 0.00019887387387387384, + "loss": 0.4552, + "step": 2255 + }, + { + "epoch": 0.6763350196739741, + "grad_norm": 0.24887052178382874, + "learning_rate": 0.00019882882882882883, + "loss": 0.3918, + "step": 2256 + }, + { + "epoch": 0.6766348135656736, + "grad_norm": 0.26789602637290955, + "learning_rate": 0.00019878378378378376, + "loss": 0.4416, + "step": 2257 + }, + { + "epoch": 0.6769346074573731, + "grad_norm": 0.24345183372497559, + "learning_rate": 0.0001987387387387387, + "loss": 0.4355, + "step": 2258 + }, + { + "epoch": 0.6772344013490725, + "grad_norm": 0.2475103735923767, + "learning_rate": 0.0001986936936936937, + "loss": 0.4277, + "step": 2259 + }, + { + "epoch": 0.677534195240772, + "grad_norm": 0.2740587294101715, + "learning_rate": 0.00019864864864864863, + "loss": 0.4536, + "step": 2260 + }, + { + "epoch": 0.6778339891324714, + "grad_norm": 0.23657365143299103, + "learning_rate": 0.0001986036036036036, + "loss": 0.3887, + "step": 2261 + }, + { + "epoch": 0.6781337830241709, + "grad_norm": 0.267630398273468, + "learning_rate": 0.00019855855855855856, + "loss": 0.443, + "step": 2262 + }, + { + "epoch": 0.6784335769158704, + "grad_norm": 0.2708898186683655, + "learning_rate": 0.0001985135135135135, + "loss": 0.4398, + "step": 2263 + }, + { + "epoch": 0.6787333708075698, + "grad_norm": 0.26607415080070496, + "learning_rate": 0.00019846846846846846, + "loss": 0.4249, + "step": 2264 + }, + { + "epoch": 0.6790331646992692, + "grad_norm": 0.2398756742477417, + "learning_rate": 0.00019842342342342342, + "loss": 0.419, + "step": 2265 + }, + { + "epoch": 0.6793329585909688, + "grad_norm": 0.2509295344352722, + "learning_rate": 0.00019837837837837836, + "loss": 0.3858, + "step": 2266 + }, + { + "epoch": 0.6796327524826682, + "grad_norm": 0.30269870162010193, + "learning_rate": 0.00019833333333333332, + "loss": 0.4564, + "step": 2267 + }, + { + "epoch": 0.6799325463743676, + "grad_norm": 0.2576700448989868, + "learning_rate": 0.00019828828828828826, + "loss": 0.4296, + "step": 2268 + }, + { + "epoch": 0.680232340266067, + "grad_norm": 0.29139164090156555, + "learning_rate": 0.00019824324324324322, + "loss": 0.4583, + "step": 2269 + }, + { + "epoch": 0.6805321341577666, + "grad_norm": 0.2578124701976776, + "learning_rate": 0.00019819819819819818, + "loss": 0.4419, + "step": 2270 + }, + { + "epoch": 0.680831928049466, + "grad_norm": 0.2546633780002594, + "learning_rate": 0.00019815315315315312, + "loss": 0.4311, + "step": 2271 + }, + { + "epoch": 0.6811317219411654, + "grad_norm": 0.293409526348114, + "learning_rate": 0.00019810810810810809, + "loss": 0.4756, + "step": 2272 + }, + { + "epoch": 0.6814315158328649, + "grad_norm": 0.249635249376297, + "learning_rate": 0.00019806306306306305, + "loss": 0.4434, + "step": 2273 + }, + { + "epoch": 0.6817313097245644, + "grad_norm": 0.2729664146900177, + "learning_rate": 0.00019801801801801799, + "loss": 0.4721, + "step": 2274 + }, + { + "epoch": 0.6820311036162638, + "grad_norm": 0.24961845576763153, + "learning_rate": 0.00019797297297297298, + "loss": 0.4124, + "step": 2275 + }, + { + "epoch": 0.6823308975079633, + "grad_norm": 0.26508617401123047, + "learning_rate": 0.00019792792792792791, + "loss": 0.4311, + "step": 2276 + }, + { + "epoch": 0.6826306913996627, + "grad_norm": 0.24888217449188232, + "learning_rate": 0.00019788288288288285, + "loss": 0.4334, + "step": 2277 + }, + { + "epoch": 0.6829304852913622, + "grad_norm": 0.2550651431083679, + "learning_rate": 0.00019783783783783784, + "loss": 0.4289, + "step": 2278 + }, + { + "epoch": 0.6832302791830617, + "grad_norm": 0.25816190242767334, + "learning_rate": 0.00019779279279279278, + "loss": 0.4425, + "step": 2279 + }, + { + "epoch": 0.6835300730747611, + "grad_norm": 0.25145018100738525, + "learning_rate": 0.00019774774774774772, + "loss": 0.4123, + "step": 2280 + }, + { + "epoch": 0.6838298669664605, + "grad_norm": 0.24678850173950195, + "learning_rate": 0.0001977027027027027, + "loss": 0.4309, + "step": 2281 + }, + { + "epoch": 0.68412966085816, + "grad_norm": 0.2629925012588501, + "learning_rate": 0.00019765765765765764, + "loss": 0.4184, + "step": 2282 + }, + { + "epoch": 0.6844294547498595, + "grad_norm": 0.2568414807319641, + "learning_rate": 0.00019761261261261258, + "loss": 0.4164, + "step": 2283 + }, + { + "epoch": 0.6847292486415589, + "grad_norm": 0.25906744599342346, + "learning_rate": 0.00019756756756756757, + "loss": 0.4547, + "step": 2284 + }, + { + "epoch": 0.6850290425332584, + "grad_norm": 0.2697434723377228, + "learning_rate": 0.0001975225225225225, + "loss": 0.4444, + "step": 2285 + }, + { + "epoch": 0.6853288364249578, + "grad_norm": 0.2573794424533844, + "learning_rate": 0.00019747747747747744, + "loss": 0.4309, + "step": 2286 + }, + { + "epoch": 0.6856286303166573, + "grad_norm": 0.2532881796360016, + "learning_rate": 0.00019743243243243244, + "loss": 0.4172, + "step": 2287 + }, + { + "epoch": 0.6859284242083568, + "grad_norm": 0.253292977809906, + "learning_rate": 0.00019738738738738737, + "loss": 0.4305, + "step": 2288 + }, + { + "epoch": 0.6862282181000562, + "grad_norm": 0.246769517660141, + "learning_rate": 0.0001973423423423423, + "loss": 0.4106, + "step": 2289 + }, + { + "epoch": 0.6865280119917556, + "grad_norm": 0.2593647539615631, + "learning_rate": 0.0001972972972972973, + "loss": 0.4537, + "step": 2290 + }, + { + "epoch": 0.6868278058834552, + "grad_norm": 0.25611796975135803, + "learning_rate": 0.00019725225225225224, + "loss": 0.43, + "step": 2291 + }, + { + "epoch": 0.6871275997751546, + "grad_norm": 0.25119319558143616, + "learning_rate": 0.00019720720720720717, + "loss": 0.4142, + "step": 2292 + }, + { + "epoch": 0.687427393666854, + "grad_norm": 0.250675231218338, + "learning_rate": 0.00019716216216216214, + "loss": 0.4134, + "step": 2293 + }, + { + "epoch": 0.6877271875585534, + "grad_norm": 0.2680164873600006, + "learning_rate": 0.0001971171171171171, + "loss": 0.4201, + "step": 2294 + }, + { + "epoch": 0.688026981450253, + "grad_norm": 0.26599201560020447, + "learning_rate": 0.00019707207207207206, + "loss": 0.45, + "step": 2295 + }, + { + "epoch": 0.6883267753419524, + "grad_norm": 0.24248278141021729, + "learning_rate": 0.000197027027027027, + "loss": 0.4129, + "step": 2296 + }, + { + "epoch": 0.6886265692336518, + "grad_norm": 0.25668129324913025, + "learning_rate": 0.00019698198198198197, + "loss": 0.4354, + "step": 2297 + }, + { + "epoch": 0.6889263631253513, + "grad_norm": 0.26304370164871216, + "learning_rate": 0.00019693693693693693, + "loss": 0.4423, + "step": 2298 + }, + { + "epoch": 0.6892261570170508, + "grad_norm": 0.2509578466415405, + "learning_rate": 0.00019689189189189187, + "loss": 0.4263, + "step": 2299 + }, + { + "epoch": 0.6895259509087502, + "grad_norm": 0.2629247009754181, + "learning_rate": 0.00019684684684684683, + "loss": 0.4323, + "step": 2300 + }, + { + "epoch": 0.6898257448004497, + "grad_norm": 0.24706493318080902, + "learning_rate": 0.0001968018018018018, + "loss": 0.3913, + "step": 2301 + }, + { + "epoch": 0.6901255386921491, + "grad_norm": 0.29551559686660767, + "learning_rate": 0.00019675675675675673, + "loss": 0.419, + "step": 2302 + }, + { + "epoch": 0.6904253325838486, + "grad_norm": 0.2612929046154022, + "learning_rate": 0.0001967117117117117, + "loss": 0.4223, + "step": 2303 + }, + { + "epoch": 0.6907251264755481, + "grad_norm": 0.28399109840393066, + "learning_rate": 0.00019666666666666666, + "loss": 0.4715, + "step": 2304 + }, + { + "epoch": 0.6910249203672475, + "grad_norm": 0.24555319547653198, + "learning_rate": 0.0001966216216216216, + "loss": 0.4559, + "step": 2305 + }, + { + "epoch": 0.6913247142589469, + "grad_norm": 0.2576359808444977, + "learning_rate": 0.00019657657657657656, + "loss": 0.4314, + "step": 2306 + }, + { + "epoch": 0.6916245081506465, + "grad_norm": 0.25595325231552124, + "learning_rate": 0.00019653153153153152, + "loss": 0.4286, + "step": 2307 + }, + { + "epoch": 0.6919243020423459, + "grad_norm": 0.23903168737888336, + "learning_rate": 0.00019648648648648646, + "loss": 0.4259, + "step": 2308 + }, + { + "epoch": 0.6922240959340453, + "grad_norm": 0.2797984480857849, + "learning_rate": 0.00019644144144144145, + "loss": 0.4621, + "step": 2309 + }, + { + "epoch": 0.6925238898257448, + "grad_norm": 0.25375935435295105, + "learning_rate": 0.0001963963963963964, + "loss": 0.4356, + "step": 2310 + }, + { + "epoch": 0.6928236837174443, + "grad_norm": 0.2765314280986786, + "learning_rate": 0.00019635135135135132, + "loss": 0.4474, + "step": 2311 + }, + { + "epoch": 0.6931234776091437, + "grad_norm": 0.23902222514152527, + "learning_rate": 0.00019630630630630632, + "loss": 0.4176, + "step": 2312 + }, + { + "epoch": 0.6934232715008432, + "grad_norm": 0.278622031211853, + "learning_rate": 0.00019626126126126125, + "loss": 0.4177, + "step": 2313 + }, + { + "epoch": 0.6937230653925426, + "grad_norm": 0.25161993503570557, + "learning_rate": 0.0001962162162162162, + "loss": 0.4235, + "step": 2314 + }, + { + "epoch": 0.6940228592842421, + "grad_norm": 0.28174108266830444, + "learning_rate": 0.00019617117117117113, + "loss": 0.4469, + "step": 2315 + }, + { + "epoch": 0.6943226531759416, + "grad_norm": 0.24297156929969788, + "learning_rate": 0.00019612612612612612, + "loss": 0.4348, + "step": 2316 + }, + { + "epoch": 0.694622447067641, + "grad_norm": 0.2582569122314453, + "learning_rate": 0.00019608108108108105, + "loss": 0.4373, + "step": 2317 + }, + { + "epoch": 0.6949222409593404, + "grad_norm": 0.2808705270290375, + "learning_rate": 0.00019603603603603602, + "loss": 0.4574, + "step": 2318 + }, + { + "epoch": 0.69522203485104, + "grad_norm": 0.27071914076805115, + "learning_rate": 0.00019599099099099098, + "loss": 0.4449, + "step": 2319 + }, + { + "epoch": 0.6955218287427394, + "grad_norm": 0.27735450863838196, + "learning_rate": 0.00019594594594594592, + "loss": 0.4154, + "step": 2320 + }, + { + "epoch": 0.6958216226344388, + "grad_norm": 0.25535905361175537, + "learning_rate": 0.00019590090090090088, + "loss": 0.4432, + "step": 2321 + }, + { + "epoch": 0.6961214165261382, + "grad_norm": 0.24208863079547882, + "learning_rate": 0.00019585585585585585, + "loss": 0.4248, + "step": 2322 + }, + { + "epoch": 0.6964212104178378, + "grad_norm": 0.26040393114089966, + "learning_rate": 0.00019581081081081078, + "loss": 0.4215, + "step": 2323 + }, + { + "epoch": 0.6967210043095372, + "grad_norm": 0.24389687180519104, + "learning_rate": 0.00019576576576576575, + "loss": 0.422, + "step": 2324 + }, + { + "epoch": 0.6970207982012366, + "grad_norm": 0.2545843720436096, + "learning_rate": 0.0001957207207207207, + "loss": 0.4384, + "step": 2325 + }, + { + "epoch": 0.6973205920929361, + "grad_norm": 0.2566373348236084, + "learning_rate": 0.00019567567567567565, + "loss": 0.4383, + "step": 2326 + }, + { + "epoch": 0.6976203859846356, + "grad_norm": 0.2538570463657379, + "learning_rate": 0.0001956306306306306, + "loss": 0.4285, + "step": 2327 + }, + { + "epoch": 0.697920179876335, + "grad_norm": 0.25821006298065186, + "learning_rate": 0.00019558558558558557, + "loss": 0.4304, + "step": 2328 + }, + { + "epoch": 0.6982199737680345, + "grad_norm": 0.26139143109321594, + "learning_rate": 0.0001955405405405405, + "loss": 0.4416, + "step": 2329 + }, + { + "epoch": 0.6985197676597339, + "grad_norm": 0.2557656168937683, + "learning_rate": 0.00019549549549549548, + "loss": 0.4166, + "step": 2330 + }, + { + "epoch": 0.6988195615514334, + "grad_norm": 0.2611480951309204, + "learning_rate": 0.00019545045045045044, + "loss": 0.4463, + "step": 2331 + }, + { + "epoch": 0.6991193554431329, + "grad_norm": 0.24384301900863647, + "learning_rate": 0.0001954054054054054, + "loss": 0.4027, + "step": 2332 + }, + { + "epoch": 0.6994191493348323, + "grad_norm": 0.2693532407283783, + "learning_rate": 0.00019536036036036034, + "loss": 0.4449, + "step": 2333 + }, + { + "epoch": 0.6997189432265317, + "grad_norm": 0.2669787108898163, + "learning_rate": 0.0001953153153153153, + "loss": 0.4266, + "step": 2334 + }, + { + "epoch": 0.7000187371182313, + "grad_norm": 0.23384937644004822, + "learning_rate": 0.00019527027027027027, + "loss": 0.4103, + "step": 2335 + }, + { + "epoch": 0.7003185310099307, + "grad_norm": 0.2738743722438812, + "learning_rate": 0.0001952252252252252, + "loss": 0.4664, + "step": 2336 + }, + { + "epoch": 0.7006183249016301, + "grad_norm": 0.2557884752750397, + "learning_rate": 0.00019518018018018017, + "loss": 0.4121, + "step": 2337 + }, + { + "epoch": 0.7009181187933295, + "grad_norm": 0.24830694496631622, + "learning_rate": 0.00019513513513513513, + "loss": 0.4, + "step": 2338 + }, + { + "epoch": 0.7012179126850291, + "grad_norm": 0.2636083960533142, + "learning_rate": 0.00019509009009009007, + "loss": 0.4418, + "step": 2339 + }, + { + "epoch": 0.7015177065767285, + "grad_norm": 0.252029687166214, + "learning_rate": 0.000195045045045045, + "loss": 0.4205, + "step": 2340 + }, + { + "epoch": 0.701817500468428, + "grad_norm": 0.25448256731033325, + "learning_rate": 0.000195, + "loss": 0.4517, + "step": 2341 + }, + { + "epoch": 0.7021172943601274, + "grad_norm": 0.26609039306640625, + "learning_rate": 0.00019495495495495493, + "loss": 0.4213, + "step": 2342 + }, + { + "epoch": 0.7024170882518269, + "grad_norm": 0.2746337652206421, + "learning_rate": 0.00019490990990990987, + "loss": 0.4183, + "step": 2343 + }, + { + "epoch": 0.7027168821435263, + "grad_norm": 0.2514724135398865, + "learning_rate": 0.00019486486486486486, + "loss": 0.4329, + "step": 2344 + }, + { + "epoch": 0.7030166760352258, + "grad_norm": 0.27683892846107483, + "learning_rate": 0.0001948198198198198, + "loss": 0.4207, + "step": 2345 + }, + { + "epoch": 0.7033164699269252, + "grad_norm": 0.2525181174278259, + "learning_rate": 0.00019477477477477473, + "loss": 0.4193, + "step": 2346 + }, + { + "epoch": 0.7036162638186247, + "grad_norm": 0.2759072482585907, + "learning_rate": 0.00019472972972972973, + "loss": 0.4508, + "step": 2347 + }, + { + "epoch": 0.7039160577103242, + "grad_norm": 0.2594849169254303, + "learning_rate": 0.00019468468468468466, + "loss": 0.4289, + "step": 2348 + }, + { + "epoch": 0.7042158516020236, + "grad_norm": 0.26971113681793213, + "learning_rate": 0.0001946396396396396, + "loss": 0.4514, + "step": 2349 + }, + { + "epoch": 0.704515645493723, + "grad_norm": 0.25291457772254944, + "learning_rate": 0.0001945945945945946, + "loss": 0.4664, + "step": 2350 + }, + { + "epoch": 0.7048154393854226, + "grad_norm": 0.2617851495742798, + "learning_rate": 0.00019454954954954953, + "loss": 0.4301, + "step": 2351 + }, + { + "epoch": 0.705115233277122, + "grad_norm": 0.24216975271701813, + "learning_rate": 0.00019450450450450446, + "loss": 0.4144, + "step": 2352 + }, + { + "epoch": 0.7054150271688214, + "grad_norm": 0.2737904489040375, + "learning_rate": 0.00019445945945945945, + "loss": 0.417, + "step": 2353 + }, + { + "epoch": 0.7057148210605209, + "grad_norm": 0.27587682008743286, + "learning_rate": 0.0001944144144144144, + "loss": 0.4421, + "step": 2354 + }, + { + "epoch": 0.7060146149522204, + "grad_norm": 0.24917447566986084, + "learning_rate": 0.00019436936936936936, + "loss": 0.406, + "step": 2355 + }, + { + "epoch": 0.7063144088439198, + "grad_norm": 0.27958497405052185, + "learning_rate": 0.00019432432432432432, + "loss": 0.4591, + "step": 2356 + }, + { + "epoch": 0.7066142027356193, + "grad_norm": 0.27273818850517273, + "learning_rate": 0.00019427927927927926, + "loss": 0.4219, + "step": 2357 + }, + { + "epoch": 0.7069139966273187, + "grad_norm": 0.24517607688903809, + "learning_rate": 0.00019423423423423422, + "loss": 0.4063, + "step": 2358 + }, + { + "epoch": 0.7072137905190182, + "grad_norm": 0.28854820132255554, + "learning_rate": 0.00019418918918918918, + "loss": 0.4383, + "step": 2359 + }, + { + "epoch": 0.7075135844107177, + "grad_norm": 0.27321329712867737, + "learning_rate": 0.00019414414414414412, + "loss": 0.4479, + "step": 2360 + }, + { + "epoch": 0.7078133783024171, + "grad_norm": 0.2749727666378021, + "learning_rate": 0.00019409909909909908, + "loss": 0.4272, + "step": 2361 + }, + { + "epoch": 0.7081131721941165, + "grad_norm": 0.27384045720100403, + "learning_rate": 0.00019405405405405405, + "loss": 0.444, + "step": 2362 + }, + { + "epoch": 0.708412966085816, + "grad_norm": 0.2604135274887085, + "learning_rate": 0.00019400900900900898, + "loss": 0.4248, + "step": 2363 + }, + { + "epoch": 0.7087127599775155, + "grad_norm": 0.2598932385444641, + "learning_rate": 0.00019396396396396395, + "loss": 0.4401, + "step": 2364 + }, + { + "epoch": 0.7090125538692149, + "grad_norm": 0.253755122423172, + "learning_rate": 0.00019391891891891889, + "loss": 0.4316, + "step": 2365 + }, + { + "epoch": 0.7093123477609143, + "grad_norm": 0.2677047848701477, + "learning_rate": 0.00019387387387387388, + "loss": 0.4264, + "step": 2366 + }, + { + "epoch": 0.7096121416526139, + "grad_norm": 0.24191899597644806, + "learning_rate": 0.0001938288288288288, + "loss": 0.4172, + "step": 2367 + }, + { + "epoch": 0.7099119355443133, + "grad_norm": 0.2684822082519531, + "learning_rate": 0.00019378378378378375, + "loss": 0.4466, + "step": 2368 + }, + { + "epoch": 0.7102117294360127, + "grad_norm": 0.2859225273132324, + "learning_rate": 0.00019373873873873874, + "loss": 0.4348, + "step": 2369 + }, + { + "epoch": 0.7105115233277122, + "grad_norm": 0.2676656246185303, + "learning_rate": 0.00019369369369369368, + "loss": 0.4322, + "step": 2370 + }, + { + "epoch": 0.7108113172194117, + "grad_norm": 0.27145159244537354, + "learning_rate": 0.00019364864864864861, + "loss": 0.4485, + "step": 2371 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.2646559178829193, + "learning_rate": 0.0001936036036036036, + "loss": 0.4255, + "step": 2372 + }, + { + "epoch": 0.7114109050028106, + "grad_norm": 0.26495832204818726, + "learning_rate": 0.00019355855855855854, + "loss": 0.4681, + "step": 2373 + }, + { + "epoch": 0.71171069889451, + "grad_norm": 0.26360276341438293, + "learning_rate": 0.00019351351351351348, + "loss": 0.4219, + "step": 2374 + }, + { + "epoch": 0.7120104927862094, + "grad_norm": 0.25372758507728577, + "learning_rate": 0.00019346846846846847, + "loss": 0.4542, + "step": 2375 + }, + { + "epoch": 0.712310286677909, + "grad_norm": 0.25859972834587097, + "learning_rate": 0.0001934234234234234, + "loss": 0.4121, + "step": 2376 + }, + { + "epoch": 0.7126100805696084, + "grad_norm": 0.2402067929506302, + "learning_rate": 0.00019337837837837834, + "loss": 0.4132, + "step": 2377 + }, + { + "epoch": 0.7129098744613078, + "grad_norm": 0.25183209776878357, + "learning_rate": 0.00019333333333333333, + "loss": 0.4106, + "step": 2378 + }, + { + "epoch": 0.7132096683530073, + "grad_norm": 0.24842600524425507, + "learning_rate": 0.00019328828828828827, + "loss": 0.3895, + "step": 2379 + }, + { + "epoch": 0.7135094622447068, + "grad_norm": 0.2635684609413147, + "learning_rate": 0.0001932432432432432, + "loss": 0.4112, + "step": 2380 + }, + { + "epoch": 0.7138092561364062, + "grad_norm": 0.2578394412994385, + "learning_rate": 0.0001931981981981982, + "loss": 0.4429, + "step": 2381 + }, + { + "epoch": 0.7141090500281057, + "grad_norm": 0.2912173867225647, + "learning_rate": 0.00019315315315315314, + "loss": 0.4256, + "step": 2382 + }, + { + "epoch": 0.7144088439198051, + "grad_norm": 0.2592369019985199, + "learning_rate": 0.00019310810810810807, + "loss": 0.4357, + "step": 2383 + }, + { + "epoch": 0.7147086378115046, + "grad_norm": 0.25853514671325684, + "learning_rate": 0.00019306306306306306, + "loss": 0.3816, + "step": 2384 + }, + { + "epoch": 0.715008431703204, + "grad_norm": 0.26601412892341614, + "learning_rate": 0.000193018018018018, + "loss": 0.4344, + "step": 2385 + }, + { + "epoch": 0.7153082255949035, + "grad_norm": 0.2789902091026306, + "learning_rate": 0.00019297297297297294, + "loss": 0.435, + "step": 2386 + }, + { + "epoch": 0.7156080194866029, + "grad_norm": 0.285659521818161, + "learning_rate": 0.0001929279279279279, + "loss": 0.4194, + "step": 2387 + }, + { + "epoch": 0.7159078133783024, + "grad_norm": 0.24971207976341248, + "learning_rate": 0.00019288288288288286, + "loss": 0.4322, + "step": 2388 + }, + { + "epoch": 0.7162076072700019, + "grad_norm": 0.30142828822135925, + "learning_rate": 0.00019283783783783783, + "loss": 0.415, + "step": 2389 + }, + { + "epoch": 0.7165074011617013, + "grad_norm": 0.2843458950519562, + "learning_rate": 0.00019279279279279277, + "loss": 0.4605, + "step": 2390 + }, + { + "epoch": 0.7168071950534007, + "grad_norm": 0.24795542657375336, + "learning_rate": 0.00019274774774774773, + "loss": 0.4147, + "step": 2391 + }, + { + "epoch": 0.7171069889451003, + "grad_norm": 0.2908228635787964, + "learning_rate": 0.0001927027027027027, + "loss": 0.4573, + "step": 2392 + }, + { + "epoch": 0.7174067828367997, + "grad_norm": 0.2770592272281647, + "learning_rate": 0.00019265765765765763, + "loss": 0.4248, + "step": 2393 + }, + { + "epoch": 0.7177065767284991, + "grad_norm": 0.2638701796531677, + "learning_rate": 0.0001926126126126126, + "loss": 0.4409, + "step": 2394 + }, + { + "epoch": 0.7180063706201986, + "grad_norm": 0.2952423393726349, + "learning_rate": 0.00019256756756756756, + "loss": 0.4651, + "step": 2395 + }, + { + "epoch": 0.7183061645118981, + "grad_norm": 0.2776016294956207, + "learning_rate": 0.0001925225225225225, + "loss": 0.4453, + "step": 2396 + }, + { + "epoch": 0.7186059584035975, + "grad_norm": 0.2560594081878662, + "learning_rate": 0.00019247747747747746, + "loss": 0.4441, + "step": 2397 + }, + { + "epoch": 0.718905752295297, + "grad_norm": 0.2397725135087967, + "learning_rate": 0.00019243243243243242, + "loss": 0.3933, + "step": 2398 + }, + { + "epoch": 0.7192055461869964, + "grad_norm": 0.2630067467689514, + "learning_rate": 0.00019238738738738736, + "loss": 0.4334, + "step": 2399 + }, + { + "epoch": 0.7195053400786959, + "grad_norm": 0.2648056149482727, + "learning_rate": 0.00019234234234234235, + "loss": 0.4456, + "step": 2400 + }, + { + "epoch": 0.7198051339703954, + "grad_norm": 0.27124911546707153, + "learning_rate": 0.0001922972972972973, + "loss": 0.4324, + "step": 2401 + }, + { + "epoch": 0.7201049278620948, + "grad_norm": 0.2809448540210724, + "learning_rate": 0.00019225225225225222, + "loss": 0.4419, + "step": 2402 + }, + { + "epoch": 0.7204047217537942, + "grad_norm": 0.2577565610408783, + "learning_rate": 0.00019220720720720721, + "loss": 0.441, + "step": 2403 + }, + { + "epoch": 0.7207045156454938, + "grad_norm": 0.261111855506897, + "learning_rate": 0.00019216216216216215, + "loss": 0.4234, + "step": 2404 + }, + { + "epoch": 0.7210043095371932, + "grad_norm": 0.2598218321800232, + "learning_rate": 0.0001921171171171171, + "loss": 0.4183, + "step": 2405 + }, + { + "epoch": 0.7213041034288926, + "grad_norm": 0.2712027132511139, + "learning_rate": 0.00019207207207207208, + "loss": 0.4354, + "step": 2406 + }, + { + "epoch": 0.721603897320592, + "grad_norm": 0.27912774682044983, + "learning_rate": 0.00019202702702702702, + "loss": 0.4594, + "step": 2407 + }, + { + "epoch": 0.7219036912122916, + "grad_norm": 0.25328657031059265, + "learning_rate": 0.00019198198198198195, + "loss": 0.408, + "step": 2408 + }, + { + "epoch": 0.722203485103991, + "grad_norm": 0.2425694465637207, + "learning_rate": 0.00019193693693693694, + "loss": 0.4197, + "step": 2409 + }, + { + "epoch": 0.7225032789956904, + "grad_norm": 0.2552039623260498, + "learning_rate": 0.00019189189189189188, + "loss": 0.4383, + "step": 2410 + }, + { + "epoch": 0.7228030728873899, + "grad_norm": 0.26128917932510376, + "learning_rate": 0.00019184684684684682, + "loss": 0.4204, + "step": 2411 + }, + { + "epoch": 0.7231028667790894, + "grad_norm": 0.2713935077190399, + "learning_rate": 0.00019180180180180178, + "loss": 0.4325, + "step": 2412 + }, + { + "epoch": 0.7234026606707888, + "grad_norm": 0.257618248462677, + "learning_rate": 0.00019175675675675674, + "loss": 0.4443, + "step": 2413 + }, + { + "epoch": 0.7237024545624883, + "grad_norm": 0.2597353458404541, + "learning_rate": 0.00019171171171171168, + "loss": 0.4415, + "step": 2414 + }, + { + "epoch": 0.7240022484541877, + "grad_norm": 0.26509061455726624, + "learning_rate": 0.00019166666666666665, + "loss": 0.4711, + "step": 2415 + }, + { + "epoch": 0.7243020423458872, + "grad_norm": 0.26354658603668213, + "learning_rate": 0.0001916216216216216, + "loss": 0.4341, + "step": 2416 + }, + { + "epoch": 0.7246018362375867, + "grad_norm": 0.2364490032196045, + "learning_rate": 0.00019157657657657655, + "loss": 0.4016, + "step": 2417 + }, + { + "epoch": 0.7249016301292861, + "grad_norm": 0.24982942640781403, + "learning_rate": 0.0001915315315315315, + "loss": 0.427, + "step": 2418 + }, + { + "epoch": 0.7252014240209855, + "grad_norm": 0.27166748046875, + "learning_rate": 0.00019148648648648647, + "loss": 0.4807, + "step": 2419 + }, + { + "epoch": 0.7255012179126851, + "grad_norm": 0.24789117276668549, + "learning_rate": 0.0001914414414414414, + "loss": 0.4265, + "step": 2420 + }, + { + "epoch": 0.7258010118043845, + "grad_norm": 0.2433491349220276, + "learning_rate": 0.00019139639639639637, + "loss": 0.4405, + "step": 2421 + }, + { + "epoch": 0.7261008056960839, + "grad_norm": 0.24121679365634918, + "learning_rate": 0.00019135135135135134, + "loss": 0.4215, + "step": 2422 + }, + { + "epoch": 0.7264005995877834, + "grad_norm": 0.25895169377326965, + "learning_rate": 0.0001913063063063063, + "loss": 0.4168, + "step": 2423 + }, + { + "epoch": 0.7267003934794829, + "grad_norm": 0.24981217086315155, + "learning_rate": 0.00019126126126126124, + "loss": 0.4268, + "step": 2424 + }, + { + "epoch": 0.7270001873711823, + "grad_norm": 0.25490307807922363, + "learning_rate": 0.0001912162162162162, + "loss": 0.4464, + "step": 2425 + }, + { + "epoch": 0.7272999812628818, + "grad_norm": 0.2552802562713623, + "learning_rate": 0.00019117117117117117, + "loss": 0.4461, + "step": 2426 + }, + { + "epoch": 0.7275997751545812, + "grad_norm": 0.27454614639282227, + "learning_rate": 0.0001911261261261261, + "loss": 0.4683, + "step": 2427 + }, + { + "epoch": 0.7278995690462807, + "grad_norm": 0.2501683831214905, + "learning_rate": 0.00019108108108108107, + "loss": 0.4244, + "step": 2428 + }, + { + "epoch": 0.7281993629379802, + "grad_norm": 0.24820026755332947, + "learning_rate": 0.00019103603603603603, + "loss": 0.4416, + "step": 2429 + }, + { + "epoch": 0.7284991568296796, + "grad_norm": 0.25755947828292847, + "learning_rate": 0.00019099099099099097, + "loss": 0.4377, + "step": 2430 + }, + { + "epoch": 0.728798950721379, + "grad_norm": 0.268839031457901, + "learning_rate": 0.00019094594594594593, + "loss": 0.46, + "step": 2431 + }, + { + "epoch": 0.7290987446130786, + "grad_norm": 0.2707115113735199, + "learning_rate": 0.0001909009009009009, + "loss": 0.4375, + "step": 2432 + }, + { + "epoch": 0.729398538504778, + "grad_norm": 0.25406280159950256, + "learning_rate": 0.00019085585585585583, + "loss": 0.441, + "step": 2433 + }, + { + "epoch": 0.7296983323964774, + "grad_norm": 0.2569238841533661, + "learning_rate": 0.00019081081081081082, + "loss": 0.4446, + "step": 2434 + }, + { + "epoch": 0.7299981262881768, + "grad_norm": 0.2784389555454254, + "learning_rate": 0.00019076576576576576, + "loss": 0.4396, + "step": 2435 + }, + { + "epoch": 0.7302979201798764, + "grad_norm": 0.25094011425971985, + "learning_rate": 0.0001907207207207207, + "loss": 0.4189, + "step": 2436 + }, + { + "epoch": 0.7305977140715758, + "grad_norm": 0.2696321904659271, + "learning_rate": 0.00019067567567567563, + "loss": 0.4575, + "step": 2437 + }, + { + "epoch": 0.7308975079632752, + "grad_norm": 0.2526487112045288, + "learning_rate": 0.00019063063063063062, + "loss": 0.4081, + "step": 2438 + }, + { + "epoch": 0.7311973018549747, + "grad_norm": 0.2577098608016968, + "learning_rate": 0.00019058558558558556, + "loss": 0.4478, + "step": 2439 + }, + { + "epoch": 0.7314970957466742, + "grad_norm": 0.26381629705429077, + "learning_rate": 0.0001905405405405405, + "loss": 0.4573, + "step": 2440 + }, + { + "epoch": 0.7317968896383736, + "grad_norm": 0.26065248250961304, + "learning_rate": 0.0001904954954954955, + "loss": 0.4413, + "step": 2441 + }, + { + "epoch": 0.7320966835300731, + "grad_norm": 0.24994462728500366, + "learning_rate": 0.00019045045045045043, + "loss": 0.4016, + "step": 2442 + }, + { + "epoch": 0.7323964774217725, + "grad_norm": 0.2742599844932556, + "learning_rate": 0.00019040540540540536, + "loss": 0.4578, + "step": 2443 + }, + { + "epoch": 0.732696271313472, + "grad_norm": 0.23844504356384277, + "learning_rate": 0.00019036036036036035, + "loss": 0.4078, + "step": 2444 + }, + { + "epoch": 0.7329960652051715, + "grad_norm": 0.2562139630317688, + "learning_rate": 0.0001903153153153153, + "loss": 0.4363, + "step": 2445 + }, + { + "epoch": 0.7332958590968709, + "grad_norm": 0.2525213956832886, + "learning_rate": 0.00019027027027027025, + "loss": 0.4585, + "step": 2446 + }, + { + "epoch": 0.7335956529885703, + "grad_norm": 0.2562119662761688, + "learning_rate": 0.00019022522522522522, + "loss": 0.4158, + "step": 2447 + }, + { + "epoch": 0.7338954468802699, + "grad_norm": 0.2771570682525635, + "learning_rate": 0.00019018018018018015, + "loss": 0.4601, + "step": 2448 + }, + { + "epoch": 0.7341952407719693, + "grad_norm": 0.2594900131225586, + "learning_rate": 0.00019013513513513512, + "loss": 0.4252, + "step": 2449 + }, + { + "epoch": 0.7344950346636687, + "grad_norm": 0.27634164690971375, + "learning_rate": 0.00019009009009009008, + "loss": 0.4614, + "step": 2450 + }, + { + "epoch": 0.7347948285553682, + "grad_norm": 0.27118000388145447, + "learning_rate": 0.00019004504504504502, + "loss": 0.425, + "step": 2451 + }, + { + "epoch": 0.7350946224470677, + "grad_norm": 0.26404282450675964, + "learning_rate": 0.00018999999999999998, + "loss": 0.435, + "step": 2452 + }, + { + "epoch": 0.7353944163387671, + "grad_norm": 0.24286137521266937, + "learning_rate": 0.00018995495495495495, + "loss": 0.4099, + "step": 2453 + }, + { + "epoch": 0.7356942102304665, + "grad_norm": 0.2554706335067749, + "learning_rate": 0.00018990990990990988, + "loss": 0.4031, + "step": 2454 + }, + { + "epoch": 0.735994004122166, + "grad_norm": 0.2666279375553131, + "learning_rate": 0.00018986486486486485, + "loss": 0.4397, + "step": 2455 + }, + { + "epoch": 0.7362937980138655, + "grad_norm": 0.24479645490646362, + "learning_rate": 0.0001898198198198198, + "loss": 0.4059, + "step": 2456 + }, + { + "epoch": 0.736593591905565, + "grad_norm": 0.27331724762916565, + "learning_rate": 0.00018977477477477478, + "loss": 0.4337, + "step": 2457 + }, + { + "epoch": 0.7368933857972644, + "grad_norm": 0.2546418309211731, + "learning_rate": 0.0001897297297297297, + "loss": 0.4206, + "step": 2458 + }, + { + "epoch": 0.7371931796889638, + "grad_norm": 0.2593313753604889, + "learning_rate": 0.00018968468468468468, + "loss": 0.39, + "step": 2459 + }, + { + "epoch": 0.7374929735806633, + "grad_norm": 0.2757156789302826, + "learning_rate": 0.00018963963963963964, + "loss": 0.441, + "step": 2460 + }, + { + "epoch": 0.7377927674723628, + "grad_norm": 0.2826617956161499, + "learning_rate": 0.00018959459459459458, + "loss": 0.4497, + "step": 2461 + }, + { + "epoch": 0.7380925613640622, + "grad_norm": 0.26498305797576904, + "learning_rate": 0.00018954954954954951, + "loss": 0.4429, + "step": 2462 + }, + { + "epoch": 0.7383923552557616, + "grad_norm": 0.22784557938575745, + "learning_rate": 0.0001895045045045045, + "loss": 0.4069, + "step": 2463 + }, + { + "epoch": 0.7386921491474612, + "grad_norm": 0.277037113904953, + "learning_rate": 0.00018945945945945944, + "loss": 0.4257, + "step": 2464 + }, + { + "epoch": 0.7389919430391606, + "grad_norm": 0.25758159160614014, + "learning_rate": 0.00018941441441441438, + "loss": 0.4255, + "step": 2465 + }, + { + "epoch": 0.73929173693086, + "grad_norm": 0.24654820561408997, + "learning_rate": 0.00018936936936936937, + "loss": 0.402, + "step": 2466 + }, + { + "epoch": 0.7395915308225595, + "grad_norm": 0.259376585483551, + "learning_rate": 0.0001893243243243243, + "loss": 0.416, + "step": 2467 + }, + { + "epoch": 0.739891324714259, + "grad_norm": 0.28223109245300293, + "learning_rate": 0.00018927927927927924, + "loss": 0.4675, + "step": 2468 + }, + { + "epoch": 0.7401911186059584, + "grad_norm": 0.2680475413799286, + "learning_rate": 0.00018923423423423423, + "loss": 0.4147, + "step": 2469 + }, + { + "epoch": 0.7404909124976579, + "grad_norm": 0.2528432309627533, + "learning_rate": 0.00018918918918918917, + "loss": 0.4374, + "step": 2470 + }, + { + "epoch": 0.7407907063893573, + "grad_norm": 0.26637372374534607, + "learning_rate": 0.0001891441441441441, + "loss": 0.4189, + "step": 2471 + }, + { + "epoch": 0.7410905002810567, + "grad_norm": 0.2570081055164337, + "learning_rate": 0.0001890990990990991, + "loss": 0.4388, + "step": 2472 + }, + { + "epoch": 0.7413902941727563, + "grad_norm": 0.27075570821762085, + "learning_rate": 0.00018905405405405403, + "loss": 0.4599, + "step": 2473 + }, + { + "epoch": 0.7416900880644557, + "grad_norm": 0.2676197290420532, + "learning_rate": 0.00018900900900900897, + "loss": 0.4105, + "step": 2474 + }, + { + "epoch": 0.7419898819561551, + "grad_norm": 0.24458040297031403, + "learning_rate": 0.00018896396396396396, + "loss": 0.4152, + "step": 2475 + }, + { + "epoch": 0.7422896758478545, + "grad_norm": 0.2793339788913727, + "learning_rate": 0.0001889189189189189, + "loss": 0.4468, + "step": 2476 + }, + { + "epoch": 0.7425894697395541, + "grad_norm": 0.25252237915992737, + "learning_rate": 0.00018887387387387384, + "loss": 0.4215, + "step": 2477 + }, + { + "epoch": 0.7428892636312535, + "grad_norm": 0.27801933884620667, + "learning_rate": 0.00018882882882882883, + "loss": 0.4362, + "step": 2478 + }, + { + "epoch": 0.743189057522953, + "grad_norm": 0.26056137681007385, + "learning_rate": 0.00018878378378378376, + "loss": 0.4342, + "step": 2479 + }, + { + "epoch": 0.7434888514146524, + "grad_norm": 0.26250821352005005, + "learning_rate": 0.00018873873873873873, + "loss": 0.4196, + "step": 2480 + }, + { + "epoch": 0.7437886453063519, + "grad_norm": 0.2682492733001709, + "learning_rate": 0.0001886936936936937, + "loss": 0.4368, + "step": 2481 + }, + { + "epoch": 0.7440884391980513, + "grad_norm": 0.2572811245918274, + "learning_rate": 0.00018864864864864863, + "loss": 0.4363, + "step": 2482 + }, + { + "epoch": 0.7443882330897508, + "grad_norm": 0.25746074318885803, + "learning_rate": 0.0001886036036036036, + "loss": 0.4045, + "step": 2483 + }, + { + "epoch": 0.7446880269814502, + "grad_norm": 0.25470736622810364, + "learning_rate": 0.00018855855855855853, + "loss": 0.4062, + "step": 2484 + }, + { + "epoch": 0.7449878208731497, + "grad_norm": 0.2766227722167969, + "learning_rate": 0.0001885135135135135, + "loss": 0.4605, + "step": 2485 + }, + { + "epoch": 0.7452876147648492, + "grad_norm": 0.28737902641296387, + "learning_rate": 0.00018846846846846846, + "loss": 0.4481, + "step": 2486 + }, + { + "epoch": 0.7455874086565486, + "grad_norm": 0.2646963894367218, + "learning_rate": 0.0001884234234234234, + "loss": 0.4212, + "step": 2487 + }, + { + "epoch": 0.745887202548248, + "grad_norm": 0.2569124698638916, + "learning_rate": 0.00018837837837837836, + "loss": 0.4268, + "step": 2488 + }, + { + "epoch": 0.7461869964399476, + "grad_norm": 0.25343701243400574, + "learning_rate": 0.00018833333333333332, + "loss": 0.4285, + "step": 2489 + }, + { + "epoch": 0.746486790331647, + "grad_norm": 0.27101901173591614, + "learning_rate": 0.00018828828828828826, + "loss": 0.4426, + "step": 2490 + }, + { + "epoch": 0.7467865842233464, + "grad_norm": 0.2594289779663086, + "learning_rate": 0.00018824324324324325, + "loss": 0.4333, + "step": 2491 + }, + { + "epoch": 0.7470863781150459, + "grad_norm": 0.2643277049064636, + "learning_rate": 0.00018819819819819819, + "loss": 0.4407, + "step": 2492 + }, + { + "epoch": 0.7473861720067454, + "grad_norm": 0.267240047454834, + "learning_rate": 0.00018815315315315312, + "loss": 0.4448, + "step": 2493 + }, + { + "epoch": 0.7476859658984448, + "grad_norm": 0.24963083863258362, + "learning_rate": 0.00018810810810810811, + "loss": 0.4313, + "step": 2494 + }, + { + "epoch": 0.7479857597901443, + "grad_norm": 0.2673603892326355, + "learning_rate": 0.00018806306306306305, + "loss": 0.4327, + "step": 2495 + }, + { + "epoch": 0.7482855536818437, + "grad_norm": 0.25436538457870483, + "learning_rate": 0.000188018018018018, + "loss": 0.4251, + "step": 2496 + }, + { + "epoch": 0.7485853475735432, + "grad_norm": 0.25511813163757324, + "learning_rate": 0.00018797297297297298, + "loss": 0.441, + "step": 2497 + }, + { + "epoch": 0.7488851414652427, + "grad_norm": 0.26634255051612854, + "learning_rate": 0.00018792792792792791, + "loss": 0.4142, + "step": 2498 + }, + { + "epoch": 0.7491849353569421, + "grad_norm": 0.2738245129585266, + "learning_rate": 0.00018788288288288285, + "loss": 0.446, + "step": 2499 + }, + { + "epoch": 0.7494847292486415, + "grad_norm": 0.2478281557559967, + "learning_rate": 0.00018783783783783784, + "loss": 0.4355, + "step": 2500 + }, + { + "epoch": 0.7494847292486415, + "eval_loss": 0.43330296874046326, + "eval_runtime": 567.7062, + "eval_samples_per_second": 3.803, + "eval_steps_per_second": 0.476, + "step": 2500 + }, + { + "epoch": 0.749784523140341, + "grad_norm": 0.24915559589862823, + "learning_rate": 0.00018779279279279278, + "loss": 0.431, + "step": 2501 + }, + { + "epoch": 0.7500843170320405, + "grad_norm": 0.25979626178741455, + "learning_rate": 0.00018774774774774772, + "loss": 0.4336, + "step": 2502 + }, + { + "epoch": 0.7503841109237399, + "grad_norm": 0.2514503002166748, + "learning_rate": 0.0001877027027027027, + "loss": 0.4104, + "step": 2503 + }, + { + "epoch": 0.7506839048154393, + "grad_norm": 0.2693893015384674, + "learning_rate": 0.00018765765765765764, + "loss": 0.4706, + "step": 2504 + }, + { + "epoch": 0.7509836987071389, + "grad_norm": 0.2881157696247101, + "learning_rate": 0.00018761261261261258, + "loss": 0.4346, + "step": 2505 + }, + { + "epoch": 0.7512834925988383, + "grad_norm": 0.232576385140419, + "learning_rate": 0.00018756756756756757, + "loss": 0.42, + "step": 2506 + }, + { + "epoch": 0.7515832864905377, + "grad_norm": 0.30108994245529175, + "learning_rate": 0.0001875225225225225, + "loss": 0.425, + "step": 2507 + }, + { + "epoch": 0.7518830803822372, + "grad_norm": 0.28761547803878784, + "learning_rate": 0.00018747747747747745, + "loss": 0.47, + "step": 2508 + }, + { + "epoch": 0.7521828742739367, + "grad_norm": 0.2556571662425995, + "learning_rate": 0.0001874324324324324, + "loss": 0.4236, + "step": 2509 + }, + { + "epoch": 0.7524826681656361, + "grad_norm": 0.27593177556991577, + "learning_rate": 0.00018738738738738737, + "loss": 0.4209, + "step": 2510 + }, + { + "epoch": 0.7527824620573356, + "grad_norm": 0.26506245136260986, + "learning_rate": 0.0001873423423423423, + "loss": 0.4182, + "step": 2511 + }, + { + "epoch": 0.753082255949035, + "grad_norm": 0.31767213344573975, + "learning_rate": 0.00018729729729729727, + "loss": 0.4574, + "step": 2512 + }, + { + "epoch": 0.7533820498407345, + "grad_norm": 0.25470229983329773, + "learning_rate": 0.00018725225225225224, + "loss": 0.4064, + "step": 2513 + }, + { + "epoch": 0.753681843732434, + "grad_norm": 0.25668561458587646, + "learning_rate": 0.0001872072072072072, + "loss": 0.4247, + "step": 2514 + }, + { + "epoch": 0.7539816376241334, + "grad_norm": 0.29275453090667725, + "learning_rate": 0.00018716216216216214, + "loss": 0.4232, + "step": 2515 + }, + { + "epoch": 0.7542814315158328, + "grad_norm": 0.2815520167350769, + "learning_rate": 0.0001871171171171171, + "loss": 0.4402, + "step": 2516 + }, + { + "epoch": 0.7545812254075324, + "grad_norm": 0.24637946486473083, + "learning_rate": 0.00018707207207207207, + "loss": 0.4209, + "step": 2517 + }, + { + "epoch": 0.7548810192992318, + "grad_norm": 0.27894946932792664, + "learning_rate": 0.000187027027027027, + "loss": 0.453, + "step": 2518 + }, + { + "epoch": 0.7551808131909312, + "grad_norm": 0.3131442070007324, + "learning_rate": 0.00018698198198198197, + "loss": 0.4728, + "step": 2519 + }, + { + "epoch": 0.7554806070826307, + "grad_norm": 0.2698810398578644, + "learning_rate": 0.00018693693693693693, + "loss": 0.4387, + "step": 2520 + }, + { + "epoch": 0.7557804009743302, + "grad_norm": 0.24818141758441925, + "learning_rate": 0.00018689189189189187, + "loss": 0.4065, + "step": 2521 + }, + { + "epoch": 0.7560801948660296, + "grad_norm": 0.2903098464012146, + "learning_rate": 0.00018684684684684683, + "loss": 0.426, + "step": 2522 + }, + { + "epoch": 0.756379988757729, + "grad_norm": 0.2602495551109314, + "learning_rate": 0.0001868018018018018, + "loss": 0.4144, + "step": 2523 + }, + { + "epoch": 0.7566797826494285, + "grad_norm": 0.2648625373840332, + "learning_rate": 0.00018675675675675673, + "loss": 0.4476, + "step": 2524 + }, + { + "epoch": 0.756979576541128, + "grad_norm": 0.26782098412513733, + "learning_rate": 0.0001867117117117117, + "loss": 0.4238, + "step": 2525 + }, + { + "epoch": 0.7572793704328274, + "grad_norm": 0.27317121624946594, + "learning_rate": 0.00018666666666666666, + "loss": 0.4251, + "step": 2526 + }, + { + "epoch": 0.7575791643245269, + "grad_norm": 0.2720593214035034, + "learning_rate": 0.0001866216216216216, + "loss": 0.4667, + "step": 2527 + }, + { + "epoch": 0.7578789582162263, + "grad_norm": 0.2541276812553406, + "learning_rate": 0.0001865765765765766, + "loss": 0.431, + "step": 2528 + }, + { + "epoch": 0.7581787521079258, + "grad_norm": 0.27258971333503723, + "learning_rate": 0.00018653153153153152, + "loss": 0.4263, + "step": 2529 + }, + { + "epoch": 0.7584785459996253, + "grad_norm": 0.28021714091300964, + "learning_rate": 0.00018648648648648646, + "loss": 0.4276, + "step": 2530 + }, + { + "epoch": 0.7587783398913247, + "grad_norm": 0.2505019009113312, + "learning_rate": 0.00018644144144144145, + "loss": 0.4105, + "step": 2531 + }, + { + "epoch": 0.7590781337830241, + "grad_norm": 0.28030917048454285, + "learning_rate": 0.0001863963963963964, + "loss": 0.4428, + "step": 2532 + }, + { + "epoch": 0.7593779276747237, + "grad_norm": 0.27447059750556946, + "learning_rate": 0.00018635135135135133, + "loss": 0.4551, + "step": 2533 + }, + { + "epoch": 0.7596777215664231, + "grad_norm": 0.26824313402175903, + "learning_rate": 0.00018630630630630626, + "loss": 0.4445, + "step": 2534 + }, + { + "epoch": 0.7599775154581225, + "grad_norm": 0.2674945294857025, + "learning_rate": 0.00018626126126126125, + "loss": 0.4513, + "step": 2535 + }, + { + "epoch": 0.760277309349822, + "grad_norm": 0.2604798972606659, + "learning_rate": 0.0001862162162162162, + "loss": 0.4381, + "step": 2536 + }, + { + "epoch": 0.7605771032415215, + "grad_norm": 0.27609342336654663, + "learning_rate": 0.00018617117117117115, + "loss": 0.4441, + "step": 2537 + }, + { + "epoch": 0.7608768971332209, + "grad_norm": 0.2614879012107849, + "learning_rate": 0.00018612612612612612, + "loss": 0.435, + "step": 2538 + }, + { + "epoch": 0.7611766910249204, + "grad_norm": 0.25386688113212585, + "learning_rate": 0.00018608108108108105, + "loss": 0.4434, + "step": 2539 + }, + { + "epoch": 0.7614764849166198, + "grad_norm": 0.24181143939495087, + "learning_rate": 0.00018603603603603602, + "loss": 0.4175, + "step": 2540 + }, + { + "epoch": 0.7617762788083193, + "grad_norm": 0.2645350694656372, + "learning_rate": 0.00018599099099099098, + "loss": 0.4136, + "step": 2541 + }, + { + "epoch": 0.7620760727000188, + "grad_norm": 0.2677913010120392, + "learning_rate": 0.00018594594594594592, + "loss": 0.4436, + "step": 2542 + }, + { + "epoch": 0.7623758665917182, + "grad_norm": 0.2717260420322418, + "learning_rate": 0.00018590090090090088, + "loss": 0.4565, + "step": 2543 + }, + { + "epoch": 0.7626756604834176, + "grad_norm": 0.25026705861091614, + "learning_rate": 0.00018585585585585585, + "loss": 0.4119, + "step": 2544 + }, + { + "epoch": 0.7629754543751172, + "grad_norm": 0.24770689010620117, + "learning_rate": 0.00018581081081081078, + "loss": 0.4097, + "step": 2545 + }, + { + "epoch": 0.7632752482668166, + "grad_norm": 0.27625271677970886, + "learning_rate": 0.00018576576576576575, + "loss": 0.4269, + "step": 2546 + }, + { + "epoch": 0.763575042158516, + "grad_norm": 0.27056175470352173, + "learning_rate": 0.0001857207207207207, + "loss": 0.4499, + "step": 2547 + }, + { + "epoch": 0.7638748360502154, + "grad_norm": 0.2812648415565491, + "learning_rate": 0.00018567567567567567, + "loss": 0.4736, + "step": 2548 + }, + { + "epoch": 0.764174629941915, + "grad_norm": 0.26717478036880493, + "learning_rate": 0.0001856306306306306, + "loss": 0.4072, + "step": 2549 + }, + { + "epoch": 0.7644744238336144, + "grad_norm": 0.2870055139064789, + "learning_rate": 0.00018558558558558558, + "loss": 0.4229, + "step": 2550 + }, + { + "epoch": 0.7647742177253138, + "grad_norm": 0.2580265700817108, + "learning_rate": 0.00018554054054054054, + "loss": 0.4068, + "step": 2551 + }, + { + "epoch": 0.7650740116170133, + "grad_norm": 0.28002214431762695, + "learning_rate": 0.00018549549549549548, + "loss": 0.432, + "step": 2552 + }, + { + "epoch": 0.7653738055087128, + "grad_norm": 0.27384141087532043, + "learning_rate": 0.00018545045045045044, + "loss": 0.4363, + "step": 2553 + }, + { + "epoch": 0.7656735994004122, + "grad_norm": 0.2627524137496948, + "learning_rate": 0.0001854054054054054, + "loss": 0.4197, + "step": 2554 + }, + { + "epoch": 0.7659733932921117, + "grad_norm": 0.2666347324848175, + "learning_rate": 0.00018536036036036034, + "loss": 0.3912, + "step": 2555 + }, + { + "epoch": 0.7662731871838111, + "grad_norm": 0.2756651043891907, + "learning_rate": 0.00018531531531531528, + "loss": 0.4364, + "step": 2556 + }, + { + "epoch": 0.7665729810755106, + "grad_norm": 0.2617150545120239, + "learning_rate": 0.00018527027027027027, + "loss": 0.4468, + "step": 2557 + }, + { + "epoch": 0.7668727749672101, + "grad_norm": 0.27227911353111267, + "learning_rate": 0.0001852252252252252, + "loss": 0.4246, + "step": 2558 + }, + { + "epoch": 0.7671725688589095, + "grad_norm": 0.2841823697090149, + "learning_rate": 0.00018518018018018014, + "loss": 0.4363, + "step": 2559 + }, + { + "epoch": 0.7674723627506089, + "grad_norm": 0.253366082906723, + "learning_rate": 0.00018513513513513513, + "loss": 0.419, + "step": 2560 + }, + { + "epoch": 0.7677721566423085, + "grad_norm": 0.2522357106208801, + "learning_rate": 0.00018509009009009007, + "loss": 0.4124, + "step": 2561 + }, + { + "epoch": 0.7680719505340079, + "grad_norm": 0.2550141215324402, + "learning_rate": 0.000185045045045045, + "loss": 0.4256, + "step": 2562 + }, + { + "epoch": 0.7683717444257073, + "grad_norm": 0.27578258514404297, + "learning_rate": 0.000185, + "loss": 0.4447, + "step": 2563 + }, + { + "epoch": 0.7686715383174068, + "grad_norm": 0.2517780661582947, + "learning_rate": 0.00018495495495495493, + "loss": 0.4167, + "step": 2564 + }, + { + "epoch": 0.7689713322091062, + "grad_norm": 0.2627197802066803, + "learning_rate": 0.00018490990990990987, + "loss": 0.421, + "step": 2565 + }, + { + "epoch": 0.7692711261008057, + "grad_norm": 0.2572929263114929, + "learning_rate": 0.00018486486486486486, + "loss": 0.4295, + "step": 2566 + }, + { + "epoch": 0.7695709199925052, + "grad_norm": 0.2549370229244232, + "learning_rate": 0.0001848198198198198, + "loss": 0.4353, + "step": 2567 + }, + { + "epoch": 0.7698707138842046, + "grad_norm": 0.25990357995033264, + "learning_rate": 0.00018477477477477474, + "loss": 0.4598, + "step": 2568 + }, + { + "epoch": 0.770170507775904, + "grad_norm": 0.26102861762046814, + "learning_rate": 0.00018472972972972973, + "loss": 0.4342, + "step": 2569 + }, + { + "epoch": 0.7704703016676036, + "grad_norm": 0.26292112469673157, + "learning_rate": 0.00018468468468468466, + "loss": 0.4217, + "step": 2570 + }, + { + "epoch": 0.770770095559303, + "grad_norm": 0.24879471957683563, + "learning_rate": 0.00018463963963963963, + "loss": 0.4034, + "step": 2571 + }, + { + "epoch": 0.7710698894510024, + "grad_norm": 0.249162495136261, + "learning_rate": 0.0001845945945945946, + "loss": 0.4213, + "step": 2572 + }, + { + "epoch": 0.7713696833427018, + "grad_norm": 0.25036314129829407, + "learning_rate": 0.00018454954954954953, + "loss": 0.449, + "step": 2573 + }, + { + "epoch": 0.7716694772344014, + "grad_norm": 0.2511482238769531, + "learning_rate": 0.0001845045045045045, + "loss": 0.4285, + "step": 2574 + }, + { + "epoch": 0.7719692711261008, + "grad_norm": 0.25358885526657104, + "learning_rate": 0.00018445945945945946, + "loss": 0.4392, + "step": 2575 + }, + { + "epoch": 0.7722690650178002, + "grad_norm": 0.25731760263442993, + "learning_rate": 0.0001844144144144144, + "loss": 0.4326, + "step": 2576 + }, + { + "epoch": 0.7725688589094997, + "grad_norm": 0.24017149209976196, + "learning_rate": 0.00018436936936936936, + "loss": 0.4285, + "step": 2577 + }, + { + "epoch": 0.7728686528011992, + "grad_norm": 0.24697363376617432, + "learning_rate": 0.00018432432432432432, + "loss": 0.4326, + "step": 2578 + }, + { + "epoch": 0.7731684466928986, + "grad_norm": 0.2622368335723877, + "learning_rate": 0.00018427927927927926, + "loss": 0.4051, + "step": 2579 + }, + { + "epoch": 0.7734682405845981, + "grad_norm": 0.26079848408699036, + "learning_rate": 0.00018423423423423422, + "loss": 0.4296, + "step": 2580 + }, + { + "epoch": 0.7737680344762975, + "grad_norm": 0.26790016889572144, + "learning_rate": 0.00018418918918918916, + "loss": 0.4487, + "step": 2581 + }, + { + "epoch": 0.774067828367997, + "grad_norm": 0.26801207661628723, + "learning_rate": 0.00018414414414414412, + "loss": 0.441, + "step": 2582 + }, + { + "epoch": 0.7743676222596965, + "grad_norm": 0.2615436911582947, + "learning_rate": 0.00018409909909909909, + "loss": 0.4356, + "step": 2583 + }, + { + "epoch": 0.7746674161513959, + "grad_norm": 0.26157858967781067, + "learning_rate": 0.00018405405405405402, + "loss": 0.4624, + "step": 2584 + }, + { + "epoch": 0.7749672100430953, + "grad_norm": 0.2570144832134247, + "learning_rate": 0.000184009009009009, + "loss": 0.4159, + "step": 2585 + }, + { + "epoch": 0.7752670039347949, + "grad_norm": 0.25635403394699097, + "learning_rate": 0.00018396396396396395, + "loss": 0.4479, + "step": 2586 + }, + { + "epoch": 0.7755667978264943, + "grad_norm": 0.24715913832187653, + "learning_rate": 0.00018391891891891889, + "loss": 0.4258, + "step": 2587 + }, + { + "epoch": 0.7758665917181937, + "grad_norm": 0.2577861547470093, + "learning_rate": 0.00018387387387387388, + "loss": 0.4107, + "step": 2588 + }, + { + "epoch": 0.7761663856098932, + "grad_norm": 0.24768322706222534, + "learning_rate": 0.00018382882882882881, + "loss": 0.4208, + "step": 2589 + }, + { + "epoch": 0.7764661795015927, + "grad_norm": 0.24486133456230164, + "learning_rate": 0.00018378378378378375, + "loss": 0.4128, + "step": 2590 + }, + { + "epoch": 0.7767659733932921, + "grad_norm": 0.2598220109939575, + "learning_rate": 0.00018373873873873874, + "loss": 0.4398, + "step": 2591 + }, + { + "epoch": 0.7770657672849915, + "grad_norm": 0.2616111636161804, + "learning_rate": 0.00018369369369369368, + "loss": 0.4472, + "step": 2592 + }, + { + "epoch": 0.777365561176691, + "grad_norm": 0.2481420487165451, + "learning_rate": 0.00018364864864864862, + "loss": 0.4073, + "step": 2593 + }, + { + "epoch": 0.7776653550683905, + "grad_norm": 0.26911380887031555, + "learning_rate": 0.0001836036036036036, + "loss": 0.4584, + "step": 2594 + }, + { + "epoch": 0.77796514896009, + "grad_norm": 0.2654714584350586, + "learning_rate": 0.00018355855855855854, + "loss": 0.4541, + "step": 2595 + }, + { + "epoch": 0.7782649428517894, + "grad_norm": 0.2782737612724304, + "learning_rate": 0.00018351351351351348, + "loss": 0.4496, + "step": 2596 + }, + { + "epoch": 0.7785647367434888, + "grad_norm": 0.24328523874282837, + "learning_rate": 0.00018346846846846847, + "loss": 0.4198, + "step": 2597 + }, + { + "epoch": 0.7788645306351883, + "grad_norm": 0.2627139985561371, + "learning_rate": 0.0001834234234234234, + "loss": 0.4383, + "step": 2598 + }, + { + "epoch": 0.7791643245268878, + "grad_norm": 0.25555041432380676, + "learning_rate": 0.00018337837837837834, + "loss": 0.4097, + "step": 2599 + }, + { + "epoch": 0.7794641184185872, + "grad_norm": 0.24429571628570557, + "learning_rate": 0.00018333333333333334, + "loss": 0.3932, + "step": 2600 + }, + { + "epoch": 0.7797639123102866, + "grad_norm": 0.25940433144569397, + "learning_rate": 0.00018328828828828827, + "loss": 0.4142, + "step": 2601 + }, + { + "epoch": 0.7800637062019862, + "grad_norm": 0.25724172592163086, + "learning_rate": 0.0001832432432432432, + "loss": 0.4316, + "step": 2602 + }, + { + "epoch": 0.7803635000936856, + "grad_norm": 0.2488878071308136, + "learning_rate": 0.0001831981981981982, + "loss": 0.4259, + "step": 2603 + }, + { + "epoch": 0.780663293985385, + "grad_norm": 0.25847765803337097, + "learning_rate": 0.00018315315315315314, + "loss": 0.4015, + "step": 2604 + }, + { + "epoch": 0.7809630878770845, + "grad_norm": 0.2530449628829956, + "learning_rate": 0.0001831081081081081, + "loss": 0.4015, + "step": 2605 + }, + { + "epoch": 0.781262881768784, + "grad_norm": 0.2667160630226135, + "learning_rate": 0.00018306306306306304, + "loss": 0.4333, + "step": 2606 + }, + { + "epoch": 0.7815626756604834, + "grad_norm": 0.27646440267562866, + "learning_rate": 0.000183018018018018, + "loss": 0.4585, + "step": 2607 + }, + { + "epoch": 0.7818624695521829, + "grad_norm": 0.2871904969215393, + "learning_rate": 0.00018297297297297297, + "loss": 0.4443, + "step": 2608 + }, + { + "epoch": 0.7821622634438823, + "grad_norm": 0.2631723880767822, + "learning_rate": 0.0001829279279279279, + "loss": 0.4312, + "step": 2609 + }, + { + "epoch": 0.7824620573355818, + "grad_norm": 0.27293872833251953, + "learning_rate": 0.00018288288288288287, + "loss": 0.4121, + "step": 2610 + }, + { + "epoch": 0.7827618512272813, + "grad_norm": 0.2640814781188965, + "learning_rate": 0.00018283783783783783, + "loss": 0.4253, + "step": 2611 + }, + { + "epoch": 0.7830616451189807, + "grad_norm": 0.25551357865333557, + "learning_rate": 0.00018279279279279277, + "loss": 0.4207, + "step": 2612 + }, + { + "epoch": 0.7833614390106801, + "grad_norm": 0.27317047119140625, + "learning_rate": 0.00018274774774774773, + "loss": 0.4144, + "step": 2613 + }, + { + "epoch": 0.7836612329023797, + "grad_norm": 0.2567535638809204, + "learning_rate": 0.0001827027027027027, + "loss": 0.4238, + "step": 2614 + }, + { + "epoch": 0.7839610267940791, + "grad_norm": 0.2705351710319519, + "learning_rate": 0.00018265765765765763, + "loss": 0.4431, + "step": 2615 + }, + { + "epoch": 0.7842608206857785, + "grad_norm": 0.2697280943393707, + "learning_rate": 0.0001826126126126126, + "loss": 0.4153, + "step": 2616 + }, + { + "epoch": 0.7845606145774779, + "grad_norm": 0.2661687731742859, + "learning_rate": 0.00018256756756756756, + "loss": 0.4213, + "step": 2617 + }, + { + "epoch": 0.7848604084691775, + "grad_norm": 0.2607208788394928, + "learning_rate": 0.0001825225225225225, + "loss": 0.4248, + "step": 2618 + }, + { + "epoch": 0.7851602023608769, + "grad_norm": 0.250765860080719, + "learning_rate": 0.00018247747747747749, + "loss": 0.4208, + "step": 2619 + }, + { + "epoch": 0.7854599962525763, + "grad_norm": 0.29620206356048584, + "learning_rate": 0.00018243243243243242, + "loss": 0.4231, + "step": 2620 + }, + { + "epoch": 0.7857597901442758, + "grad_norm": 0.26590675115585327, + "learning_rate": 0.00018238738738738736, + "loss": 0.4526, + "step": 2621 + }, + { + "epoch": 0.7860595840359753, + "grad_norm": 0.2643924057483673, + "learning_rate": 0.00018234234234234235, + "loss": 0.453, + "step": 2622 + }, + { + "epoch": 0.7863593779276747, + "grad_norm": 0.25070053339004517, + "learning_rate": 0.0001822972972972973, + "loss": 0.4142, + "step": 2623 + }, + { + "epoch": 0.7866591718193742, + "grad_norm": 0.2581370770931244, + "learning_rate": 0.00018225225225225222, + "loss": 0.4266, + "step": 2624 + }, + { + "epoch": 0.7869589657110736, + "grad_norm": 0.2647317945957184, + "learning_rate": 0.00018220720720720722, + "loss": 0.4648, + "step": 2625 + }, + { + "epoch": 0.7872587596027731, + "grad_norm": 0.25011420249938965, + "learning_rate": 0.00018216216216216215, + "loss": 0.441, + "step": 2626 + }, + { + "epoch": 0.7875585534944726, + "grad_norm": 0.25477978587150574, + "learning_rate": 0.0001821171171171171, + "loss": 0.4305, + "step": 2627 + }, + { + "epoch": 0.787858347386172, + "grad_norm": 0.24877221882343292, + "learning_rate": 0.00018207207207207208, + "loss": 0.4232, + "step": 2628 + }, + { + "epoch": 0.7881581412778714, + "grad_norm": 0.26181760430336, + "learning_rate": 0.00018202702702702702, + "loss": 0.4417, + "step": 2629 + }, + { + "epoch": 0.788457935169571, + "grad_norm": 0.2556387782096863, + "learning_rate": 0.00018198198198198195, + "loss": 0.4213, + "step": 2630 + }, + { + "epoch": 0.7887577290612704, + "grad_norm": 0.2542903423309326, + "learning_rate": 0.00018193693693693692, + "loss": 0.4179, + "step": 2631 + }, + { + "epoch": 0.7890575229529698, + "grad_norm": 0.2722475528717041, + "learning_rate": 0.00018189189189189188, + "loss": 0.4424, + "step": 2632 + }, + { + "epoch": 0.7893573168446693, + "grad_norm": 0.26245227456092834, + "learning_rate": 0.00018184684684684682, + "loss": 0.4263, + "step": 2633 + }, + { + "epoch": 0.7896571107363688, + "grad_norm": 0.27598729729652405, + "learning_rate": 0.00018180180180180178, + "loss": 0.438, + "step": 2634 + }, + { + "epoch": 0.7899569046280682, + "grad_norm": 0.25007325410842896, + "learning_rate": 0.00018175675675675675, + "loss": 0.4519, + "step": 2635 + }, + { + "epoch": 0.7902566985197677, + "grad_norm": 0.25102177262306213, + "learning_rate": 0.00018171171171171168, + "loss": 0.3859, + "step": 2636 + }, + { + "epoch": 0.7905564924114671, + "grad_norm": 0.26916658878326416, + "learning_rate": 0.00018166666666666665, + "loss": 0.4643, + "step": 2637 + }, + { + "epoch": 0.7908562863031666, + "grad_norm": 0.2508098781108856, + "learning_rate": 0.0001816216216216216, + "loss": 0.4211, + "step": 2638 + }, + { + "epoch": 0.791156080194866, + "grad_norm": 0.24109424650669098, + "learning_rate": 0.00018157657657657655, + "loss": 0.418, + "step": 2639 + }, + { + "epoch": 0.7914558740865655, + "grad_norm": 0.24954918026924133, + "learning_rate": 0.0001815315315315315, + "loss": 0.4219, + "step": 2640 + }, + { + "epoch": 0.7917556679782649, + "grad_norm": 0.24477514624595642, + "learning_rate": 0.00018148648648648647, + "loss": 0.4165, + "step": 2641 + }, + { + "epoch": 0.7920554618699645, + "grad_norm": 0.24159999191761017, + "learning_rate": 0.00018144144144144144, + "loss": 0.4327, + "step": 2642 + }, + { + "epoch": 0.7923552557616639, + "grad_norm": 0.2405584752559662, + "learning_rate": 0.00018139639639639638, + "loss": 0.4041, + "step": 2643 + }, + { + "epoch": 0.7926550496533633, + "grad_norm": 0.2744334638118744, + "learning_rate": 0.00018135135135135134, + "loss": 0.4128, + "step": 2644 + }, + { + "epoch": 0.7929548435450627, + "grad_norm": 0.24337726831436157, + "learning_rate": 0.0001813063063063063, + "loss": 0.4254, + "step": 2645 + }, + { + "epoch": 0.7932546374367623, + "grad_norm": 0.24543915688991547, + "learning_rate": 0.00018126126126126124, + "loss": 0.4296, + "step": 2646 + }, + { + "epoch": 0.7935544313284617, + "grad_norm": 0.2545223832130432, + "learning_rate": 0.0001812162162162162, + "loss": 0.4446, + "step": 2647 + }, + { + "epoch": 0.7938542252201611, + "grad_norm": 0.2627246379852295, + "learning_rate": 0.00018117117117117117, + "loss": 0.4304, + "step": 2648 + }, + { + "epoch": 0.7941540191118606, + "grad_norm": 0.269690603017807, + "learning_rate": 0.0001811261261261261, + "loss": 0.4417, + "step": 2649 + }, + { + "epoch": 0.7944538130035601, + "grad_norm": 0.26495659351348877, + "learning_rate": 0.00018108108108108107, + "loss": 0.4274, + "step": 2650 + }, + { + "epoch": 0.7947536068952595, + "grad_norm": 0.24386748671531677, + "learning_rate": 0.00018103603603603603, + "loss": 0.402, + "step": 2651 + }, + { + "epoch": 0.795053400786959, + "grad_norm": 0.2787957489490509, + "learning_rate": 0.00018099099099099097, + "loss": 0.4285, + "step": 2652 + }, + { + "epoch": 0.7953531946786584, + "grad_norm": 0.28965434432029724, + "learning_rate": 0.0001809459459459459, + "loss": 0.4296, + "step": 2653 + }, + { + "epoch": 0.7956529885703579, + "grad_norm": 0.27486535906791687, + "learning_rate": 0.0001809009009009009, + "loss": 0.4383, + "step": 2654 + }, + { + "epoch": 0.7959527824620574, + "grad_norm": 0.29988306760787964, + "learning_rate": 0.00018085585585585583, + "loss": 0.4326, + "step": 2655 + }, + { + "epoch": 0.7962525763537568, + "grad_norm": 0.2822316884994507, + "learning_rate": 0.00018081081081081077, + "loss": 0.4497, + "step": 2656 + }, + { + "epoch": 0.7965523702454562, + "grad_norm": 0.27767157554626465, + "learning_rate": 0.00018076576576576576, + "loss": 0.4461, + "step": 2657 + }, + { + "epoch": 0.7968521641371558, + "grad_norm": 0.27223673462867737, + "learning_rate": 0.0001807207207207207, + "loss": 0.4338, + "step": 2658 + }, + { + "epoch": 0.7971519580288552, + "grad_norm": 0.26280853152275085, + "learning_rate": 0.00018067567567567563, + "loss": 0.4556, + "step": 2659 + }, + { + "epoch": 0.7974517519205546, + "grad_norm": 0.29061955213546753, + "learning_rate": 0.00018063063063063063, + "loss": 0.4494, + "step": 2660 + }, + { + "epoch": 0.797751545812254, + "grad_norm": 0.2875960171222687, + "learning_rate": 0.00018058558558558556, + "loss": 0.4286, + "step": 2661 + }, + { + "epoch": 0.7980513397039535, + "grad_norm": 0.26884058117866516, + "learning_rate": 0.0001805405405405405, + "loss": 0.4196, + "step": 2662 + }, + { + "epoch": 0.798351133595653, + "grad_norm": 0.25288286805152893, + "learning_rate": 0.0001804954954954955, + "loss": 0.4313, + "step": 2663 + }, + { + "epoch": 0.7986509274873524, + "grad_norm": 0.2743682861328125, + "learning_rate": 0.00018045045045045043, + "loss": 0.4016, + "step": 2664 + }, + { + "epoch": 0.7989507213790519, + "grad_norm": 0.2604522109031677, + "learning_rate": 0.0001804054054054054, + "loss": 0.4199, + "step": 2665 + }, + { + "epoch": 0.7992505152707513, + "grad_norm": 0.2633124589920044, + "learning_rate": 0.00018036036036036035, + "loss": 0.4512, + "step": 2666 + }, + { + "epoch": 0.7995503091624508, + "grad_norm": 0.2687436640262604, + "learning_rate": 0.0001803153153153153, + "loss": 0.4512, + "step": 2667 + }, + { + "epoch": 0.7998501030541503, + "grad_norm": 0.26707741618156433, + "learning_rate": 0.00018027027027027026, + "loss": 0.4096, + "step": 2668 + }, + { + "epoch": 0.8001498969458497, + "grad_norm": 0.25287461280822754, + "learning_rate": 0.00018022522522522522, + "loss": 0.4235, + "step": 2669 + }, + { + "epoch": 0.8004496908375491, + "grad_norm": 0.2665085792541504, + "learning_rate": 0.00018018018018018016, + "loss": 0.4421, + "step": 2670 + }, + { + "epoch": 0.8007494847292487, + "grad_norm": 0.2533876895904541, + "learning_rate": 0.00018013513513513512, + "loss": 0.4031, + "step": 2671 + }, + { + "epoch": 0.8010492786209481, + "grad_norm": 0.27074354887008667, + "learning_rate": 0.00018009009009009008, + "loss": 0.4811, + "step": 2672 + }, + { + "epoch": 0.8013490725126475, + "grad_norm": 0.24895958602428436, + "learning_rate": 0.00018004504504504502, + "loss": 0.4346, + "step": 2673 + }, + { + "epoch": 0.801648866404347, + "grad_norm": 0.2514555752277374, + "learning_rate": 0.00017999999999999998, + "loss": 0.4188, + "step": 2674 + }, + { + "epoch": 0.8019486602960465, + "grad_norm": 0.25880131125450134, + "learning_rate": 0.00017995495495495495, + "loss": 0.4125, + "step": 2675 + }, + { + "epoch": 0.8022484541877459, + "grad_norm": 0.2755208909511566, + "learning_rate": 0.0001799099099099099, + "loss": 0.4448, + "step": 2676 + }, + { + "epoch": 0.8025482480794454, + "grad_norm": 0.251765638589859, + "learning_rate": 0.00017986486486486485, + "loss": 0.4216, + "step": 2677 + }, + { + "epoch": 0.8028480419711448, + "grad_norm": 0.27061545848846436, + "learning_rate": 0.00017981981981981979, + "loss": 0.4392, + "step": 2678 + }, + { + "epoch": 0.8031478358628443, + "grad_norm": 0.2883546054363251, + "learning_rate": 0.00017977477477477478, + "loss": 0.4486, + "step": 2679 + }, + { + "epoch": 0.8034476297545438, + "grad_norm": 0.2492683082818985, + "learning_rate": 0.0001797297297297297, + "loss": 0.441, + "step": 2680 + }, + { + "epoch": 0.8037474236462432, + "grad_norm": 0.24431340396404266, + "learning_rate": 0.00017968468468468465, + "loss": 0.4181, + "step": 2681 + }, + { + "epoch": 0.8040472175379426, + "grad_norm": 0.24715867638587952, + "learning_rate": 0.00017963963963963964, + "loss": 0.4137, + "step": 2682 + }, + { + "epoch": 0.8043470114296422, + "grad_norm": 0.27124032378196716, + "learning_rate": 0.00017959459459459458, + "loss": 0.4308, + "step": 2683 + }, + { + "epoch": 0.8046468053213416, + "grad_norm": 0.25120750069618225, + "learning_rate": 0.00017954954954954951, + "loss": 0.4183, + "step": 2684 + }, + { + "epoch": 0.804946599213041, + "grad_norm": 0.2443256676197052, + "learning_rate": 0.0001795045045045045, + "loss": 0.3974, + "step": 2685 + }, + { + "epoch": 0.8052463931047404, + "grad_norm": 0.2588818669319153, + "learning_rate": 0.00017945945945945944, + "loss": 0.4281, + "step": 2686 + }, + { + "epoch": 0.80554618699644, + "grad_norm": 0.26433560252189636, + "learning_rate": 0.00017941441441441438, + "loss": 0.4313, + "step": 2687 + }, + { + "epoch": 0.8058459808881394, + "grad_norm": 0.2529260814189911, + "learning_rate": 0.00017936936936936937, + "loss": 0.4398, + "step": 2688 + }, + { + "epoch": 0.8061457747798388, + "grad_norm": 0.25349390506744385, + "learning_rate": 0.0001793243243243243, + "loss": 0.4388, + "step": 2689 + }, + { + "epoch": 0.8064455686715383, + "grad_norm": 0.2616881728172302, + "learning_rate": 0.00017927927927927924, + "loss": 0.4135, + "step": 2690 + }, + { + "epoch": 0.8067453625632378, + "grad_norm": 0.24046184122562408, + "learning_rate": 0.00017923423423423423, + "loss": 0.4276, + "step": 2691 + }, + { + "epoch": 0.8070451564549372, + "grad_norm": 0.2653987407684326, + "learning_rate": 0.00017918918918918917, + "loss": 0.4515, + "step": 2692 + }, + { + "epoch": 0.8073449503466367, + "grad_norm": 0.27617138624191284, + "learning_rate": 0.0001791441441441441, + "loss": 0.4767, + "step": 2693 + }, + { + "epoch": 0.8076447442383361, + "grad_norm": 0.24791288375854492, + "learning_rate": 0.0001790990990990991, + "loss": 0.4203, + "step": 2694 + }, + { + "epoch": 0.8079445381300356, + "grad_norm": 0.2514311671257019, + "learning_rate": 0.00017905405405405404, + "loss": 0.4157, + "step": 2695 + }, + { + "epoch": 0.8082443320217351, + "grad_norm": 0.25074705481529236, + "learning_rate": 0.00017900900900900897, + "loss": 0.4327, + "step": 2696 + }, + { + "epoch": 0.8085441259134345, + "grad_norm": 0.2415165901184082, + "learning_rate": 0.00017896396396396396, + "loss": 0.4204, + "step": 2697 + }, + { + "epoch": 0.8088439198051339, + "grad_norm": 0.23986560106277466, + "learning_rate": 0.0001789189189189189, + "loss": 0.4143, + "step": 2698 + }, + { + "epoch": 0.8091437136968335, + "grad_norm": 0.24843257665634155, + "learning_rate": 0.00017887387387387386, + "loss": 0.4406, + "step": 2699 + }, + { + "epoch": 0.8094435075885329, + "grad_norm": 0.2549048960208893, + "learning_rate": 0.00017882882882882883, + "loss": 0.4293, + "step": 2700 + }, + { + "epoch": 0.8097433014802323, + "grad_norm": 0.2509189248085022, + "learning_rate": 0.00017878378378378376, + "loss": 0.4236, + "step": 2701 + }, + { + "epoch": 0.8100430953719318, + "grad_norm": 0.2555721402168274, + "learning_rate": 0.00017873873873873873, + "loss": 0.4387, + "step": 2702 + }, + { + "epoch": 0.8103428892636313, + "grad_norm": 0.2514118552207947, + "learning_rate": 0.00017869369369369367, + "loss": 0.4182, + "step": 2703 + }, + { + "epoch": 0.8106426831553307, + "grad_norm": 0.2508332133293152, + "learning_rate": 0.00017864864864864863, + "loss": 0.4283, + "step": 2704 + }, + { + "epoch": 0.8109424770470302, + "grad_norm": 0.2642100155353546, + "learning_rate": 0.0001786036036036036, + "loss": 0.4069, + "step": 2705 + }, + { + "epoch": 0.8112422709387296, + "grad_norm": 0.24579380452632904, + "learning_rate": 0.00017855855855855853, + "loss": 0.4229, + "step": 2706 + }, + { + "epoch": 0.8115420648304291, + "grad_norm": 0.25910264253616333, + "learning_rate": 0.0001785135135135135, + "loss": 0.422, + "step": 2707 + }, + { + "epoch": 0.8118418587221286, + "grad_norm": 0.256245493888855, + "learning_rate": 0.00017846846846846846, + "loss": 0.4223, + "step": 2708 + }, + { + "epoch": 0.812141652613828, + "grad_norm": 0.2674013376235962, + "learning_rate": 0.0001784234234234234, + "loss": 0.4256, + "step": 2709 + }, + { + "epoch": 0.8124414465055274, + "grad_norm": 0.25418075919151306, + "learning_rate": 0.00017837837837837839, + "loss": 0.4173, + "step": 2710 + }, + { + "epoch": 0.812741240397227, + "grad_norm": 0.25557002425193787, + "learning_rate": 0.00017833333333333332, + "loss": 0.4434, + "step": 2711 + }, + { + "epoch": 0.8130410342889264, + "grad_norm": 0.25232791900634766, + "learning_rate": 0.00017828828828828826, + "loss": 0.4366, + "step": 2712 + }, + { + "epoch": 0.8133408281806258, + "grad_norm": 0.24985311925411224, + "learning_rate": 0.00017824324324324325, + "loss": 0.4352, + "step": 2713 + }, + { + "epoch": 0.8136406220723252, + "grad_norm": 0.27299973368644714, + "learning_rate": 0.0001781981981981982, + "loss": 0.4017, + "step": 2714 + }, + { + "epoch": 0.8139404159640248, + "grad_norm": 0.24529805779457092, + "learning_rate": 0.00017815315315315312, + "loss": 0.4089, + "step": 2715 + }, + { + "epoch": 0.8142402098557242, + "grad_norm": 0.28326576948165894, + "learning_rate": 0.00017810810810810811, + "loss": 0.427, + "step": 2716 + }, + { + "epoch": 0.8145400037474236, + "grad_norm": 0.25834745168685913, + "learning_rate": 0.00017806306306306305, + "loss": 0.4257, + "step": 2717 + }, + { + "epoch": 0.8148397976391231, + "grad_norm": 0.31774160265922546, + "learning_rate": 0.000178018018018018, + "loss": 0.4321, + "step": 2718 + }, + { + "epoch": 0.8151395915308226, + "grad_norm": 0.26752525568008423, + "learning_rate": 0.00017797297297297298, + "loss": 0.4263, + "step": 2719 + }, + { + "epoch": 0.815439385422522, + "grad_norm": 0.26342013478279114, + "learning_rate": 0.00017792792792792792, + "loss": 0.4214, + "step": 2720 + }, + { + "epoch": 0.8157391793142215, + "grad_norm": 0.2852500081062317, + "learning_rate": 0.00017788288288288285, + "loss": 0.4123, + "step": 2721 + }, + { + "epoch": 0.8160389732059209, + "grad_norm": 0.26281121373176575, + "learning_rate": 0.00017783783783783784, + "loss": 0.4199, + "step": 2722 + }, + { + "epoch": 0.8163387670976204, + "grad_norm": 0.2701656222343445, + "learning_rate": 0.00017779279279279278, + "loss": 0.4002, + "step": 2723 + }, + { + "epoch": 0.8166385609893199, + "grad_norm": 0.28216591477394104, + "learning_rate": 0.00017774774774774772, + "loss": 0.4383, + "step": 2724 + }, + { + "epoch": 0.8169383548810193, + "grad_norm": 0.2807302176952362, + "learning_rate": 0.00017770270270270268, + "loss": 0.4481, + "step": 2725 + }, + { + "epoch": 0.8172381487727187, + "grad_norm": 0.27764102816581726, + "learning_rate": 0.00017765765765765764, + "loss": 0.4168, + "step": 2726 + }, + { + "epoch": 0.8175379426644183, + "grad_norm": 0.25099071860313416, + "learning_rate": 0.00017761261261261258, + "loss": 0.4052, + "step": 2727 + }, + { + "epoch": 0.8178377365561177, + "grad_norm": 0.24437499046325684, + "learning_rate": 0.00017756756756756755, + "loss": 0.4181, + "step": 2728 + }, + { + "epoch": 0.8181375304478171, + "grad_norm": 0.2668071389198303, + "learning_rate": 0.0001775225225225225, + "loss": 0.415, + "step": 2729 + }, + { + "epoch": 0.8184373243395165, + "grad_norm": 0.24040313065052032, + "learning_rate": 0.00017747747747747745, + "loss": 0.4251, + "step": 2730 + }, + { + "epoch": 0.8187371182312161, + "grad_norm": 0.2401929497718811, + "learning_rate": 0.0001774324324324324, + "loss": 0.4153, + "step": 2731 + }, + { + "epoch": 0.8190369121229155, + "grad_norm": 0.2651936709880829, + "learning_rate": 0.00017738738738738737, + "loss": 0.4327, + "step": 2732 + }, + { + "epoch": 0.819336706014615, + "grad_norm": 0.24846699833869934, + "learning_rate": 0.00017734234234234234, + "loss": 0.4213, + "step": 2733 + }, + { + "epoch": 0.8196364999063144, + "grad_norm": 0.24691832065582275, + "learning_rate": 0.00017729729729729727, + "loss": 0.4218, + "step": 2734 + }, + { + "epoch": 0.8199362937980139, + "grad_norm": 0.25850343704223633, + "learning_rate": 0.00017725225225225224, + "loss": 0.4363, + "step": 2735 + }, + { + "epoch": 0.8202360876897133, + "grad_norm": 0.27458709478378296, + "learning_rate": 0.0001772072072072072, + "loss": 0.4353, + "step": 2736 + }, + { + "epoch": 0.8205358815814128, + "grad_norm": 0.26524078845977783, + "learning_rate": 0.00017716216216216214, + "loss": 0.4367, + "step": 2737 + }, + { + "epoch": 0.8208356754731122, + "grad_norm": 0.24760177731513977, + "learning_rate": 0.0001771171171171171, + "loss": 0.4272, + "step": 2738 + }, + { + "epoch": 0.8211354693648117, + "grad_norm": 0.2548927068710327, + "learning_rate": 0.00017707207207207207, + "loss": 0.4247, + "step": 2739 + }, + { + "epoch": 0.8214352632565112, + "grad_norm": 0.2804696559906006, + "learning_rate": 0.000177027027027027, + "loss": 0.4296, + "step": 2740 + }, + { + "epoch": 0.8217350571482106, + "grad_norm": 0.27337226271629333, + "learning_rate": 0.00017698198198198197, + "loss": 0.4573, + "step": 2741 + }, + { + "epoch": 0.82203485103991, + "grad_norm": 0.26369425654411316, + "learning_rate": 0.00017693693693693693, + "loss": 0.4569, + "step": 2742 + }, + { + "epoch": 0.8223346449316096, + "grad_norm": 0.2607530653476715, + "learning_rate": 0.00017689189189189187, + "loss": 0.43, + "step": 2743 + }, + { + "epoch": 0.822634438823309, + "grad_norm": 0.2663254737854004, + "learning_rate": 0.00017684684684684686, + "loss": 0.4144, + "step": 2744 + }, + { + "epoch": 0.8229342327150084, + "grad_norm": 0.25760191679000854, + "learning_rate": 0.0001768018018018018, + "loss": 0.4257, + "step": 2745 + }, + { + "epoch": 0.8232340266067079, + "grad_norm": 0.27796119451522827, + "learning_rate": 0.00017675675675675673, + "loss": 0.4204, + "step": 2746 + }, + { + "epoch": 0.8235338204984074, + "grad_norm": 0.27072879672050476, + "learning_rate": 0.00017671171171171172, + "loss": 0.4296, + "step": 2747 + }, + { + "epoch": 0.8238336143901068, + "grad_norm": 0.2645798623561859, + "learning_rate": 0.00017666666666666666, + "loss": 0.4181, + "step": 2748 + }, + { + "epoch": 0.8241334082818063, + "grad_norm": 0.2641856372356415, + "learning_rate": 0.0001766216216216216, + "loss": 0.4341, + "step": 2749 + }, + { + "epoch": 0.8244332021735057, + "grad_norm": 0.26364126801490784, + "learning_rate": 0.00017657657657657653, + "loss": 0.3884, + "step": 2750 + }, + { + "epoch": 0.8247329960652052, + "grad_norm": 0.23896346986293793, + "learning_rate": 0.00017653153153153152, + "loss": 0.4193, + "step": 2751 + }, + { + "epoch": 0.8250327899569047, + "grad_norm": 0.2746429443359375, + "learning_rate": 0.00017648648648648646, + "loss": 0.4248, + "step": 2752 + }, + { + "epoch": 0.8253325838486041, + "grad_norm": 0.251149445772171, + "learning_rate": 0.0001764414414414414, + "loss": 0.4223, + "step": 2753 + }, + { + "epoch": 0.8256323777403035, + "grad_norm": 0.26169952750205994, + "learning_rate": 0.0001763963963963964, + "loss": 0.4257, + "step": 2754 + }, + { + "epoch": 0.8259321716320029, + "grad_norm": 0.2569408416748047, + "learning_rate": 0.00017635135135135133, + "loss": 0.4032, + "step": 2755 + }, + { + "epoch": 0.8262319655237025, + "grad_norm": 0.24922534823417664, + "learning_rate": 0.0001763063063063063, + "loss": 0.414, + "step": 2756 + }, + { + "epoch": 0.8265317594154019, + "grad_norm": 0.26194071769714355, + "learning_rate": 0.00017626126126126125, + "loss": 0.3991, + "step": 2757 + }, + { + "epoch": 0.8268315533071013, + "grad_norm": 0.2403927892446518, + "learning_rate": 0.0001762162162162162, + "loss": 0.3935, + "step": 2758 + }, + { + "epoch": 0.8271313471988008, + "grad_norm": 0.25983864068984985, + "learning_rate": 0.00017617117117117115, + "loss": 0.4249, + "step": 2759 + }, + { + "epoch": 0.8274311410905003, + "grad_norm": 0.2580634355545044, + "learning_rate": 0.00017612612612612612, + "loss": 0.4209, + "step": 2760 + }, + { + "epoch": 0.8277309349821997, + "grad_norm": 0.2543208599090576, + "learning_rate": 0.00017608108108108106, + "loss": 0.4126, + "step": 2761 + }, + { + "epoch": 0.8280307288738992, + "grad_norm": 0.27977827191352844, + "learning_rate": 0.00017603603603603602, + "loss": 0.4161, + "step": 2762 + }, + { + "epoch": 0.8283305227655986, + "grad_norm": 0.27077874541282654, + "learning_rate": 0.00017599099099099098, + "loss": 0.4341, + "step": 2763 + }, + { + "epoch": 0.8286303166572981, + "grad_norm": 0.2763903737068176, + "learning_rate": 0.00017594594594594592, + "loss": 0.4444, + "step": 2764 + }, + { + "epoch": 0.8289301105489976, + "grad_norm": 0.26092103123664856, + "learning_rate": 0.00017590090090090088, + "loss": 0.4542, + "step": 2765 + }, + { + "epoch": 0.829229904440697, + "grad_norm": 0.25859686732292175, + "learning_rate": 0.00017585585585585585, + "loss": 0.443, + "step": 2766 + }, + { + "epoch": 0.8295296983323964, + "grad_norm": 0.26898616552352905, + "learning_rate": 0.0001758108108108108, + "loss": 0.421, + "step": 2767 + }, + { + "epoch": 0.829829492224096, + "grad_norm": 0.2678453028202057, + "learning_rate": 0.00017576576576576575, + "loss": 0.4431, + "step": 2768 + }, + { + "epoch": 0.8301292861157954, + "grad_norm": 0.27072227001190186, + "learning_rate": 0.0001757207207207207, + "loss": 0.4353, + "step": 2769 + }, + { + "epoch": 0.8304290800074948, + "grad_norm": 0.2593313455581665, + "learning_rate": 0.00017567567567567568, + "loss": 0.4102, + "step": 2770 + }, + { + "epoch": 0.8307288738991943, + "grad_norm": 0.2503952085971832, + "learning_rate": 0.0001756306306306306, + "loss": 0.4102, + "step": 2771 + }, + { + "epoch": 0.8310286677908938, + "grad_norm": 0.2745724618434906, + "learning_rate": 0.00017558558558558558, + "loss": 0.4364, + "step": 2772 + }, + { + "epoch": 0.8313284616825932, + "grad_norm": 0.27861306071281433, + "learning_rate": 0.00017554054054054054, + "loss": 0.4501, + "step": 2773 + }, + { + "epoch": 0.8316282555742927, + "grad_norm": 0.2669075131416321, + "learning_rate": 0.00017549549549549548, + "loss": 0.4388, + "step": 2774 + }, + { + "epoch": 0.8319280494659921, + "grad_norm": 0.25937843322753906, + "learning_rate": 0.00017545045045045041, + "loss": 0.4528, + "step": 2775 + }, + { + "epoch": 0.8322278433576916, + "grad_norm": 0.26331016421318054, + "learning_rate": 0.0001754054054054054, + "loss": 0.4609, + "step": 2776 + }, + { + "epoch": 0.832527637249391, + "grad_norm": 0.27154049277305603, + "learning_rate": 0.00017536036036036034, + "loss": 0.4908, + "step": 2777 + }, + { + "epoch": 0.8328274311410905, + "grad_norm": 0.27023017406463623, + "learning_rate": 0.00017531531531531528, + "loss": 0.4294, + "step": 2778 + }, + { + "epoch": 0.8331272250327899, + "grad_norm": 0.26888102293014526, + "learning_rate": 0.00017527027027027027, + "loss": 0.4371, + "step": 2779 + }, + { + "epoch": 0.8334270189244894, + "grad_norm": 0.24405723810195923, + "learning_rate": 0.0001752252252252252, + "loss": 0.4035, + "step": 2780 + }, + { + "epoch": 0.8337268128161889, + "grad_norm": 0.26839005947113037, + "learning_rate": 0.00017518018018018014, + "loss": 0.4109, + "step": 2781 + }, + { + "epoch": 0.8340266067078883, + "grad_norm": 0.26782795786857605, + "learning_rate": 0.00017513513513513513, + "loss": 0.4072, + "step": 2782 + }, + { + "epoch": 0.8343264005995877, + "grad_norm": 0.2640438973903656, + "learning_rate": 0.00017509009009009007, + "loss": 0.4183, + "step": 2783 + }, + { + "epoch": 0.8346261944912873, + "grad_norm": 0.26554253697395325, + "learning_rate": 0.000175045045045045, + "loss": 0.3979, + "step": 2784 + }, + { + "epoch": 0.8349259883829867, + "grad_norm": 0.25945335626602173, + "learning_rate": 0.000175, + "loss": 0.4196, + "step": 2785 + }, + { + "epoch": 0.8352257822746861, + "grad_norm": 0.29560789465904236, + "learning_rate": 0.00017495495495495493, + "loss": 0.4584, + "step": 2786 + }, + { + "epoch": 0.8355255761663856, + "grad_norm": 0.2612930238246918, + "learning_rate": 0.00017490990990990987, + "loss": 0.4172, + "step": 2787 + }, + { + "epoch": 0.8358253700580851, + "grad_norm": 0.23529918491840363, + "learning_rate": 0.00017486486486486486, + "loss": 0.3909, + "step": 2788 + }, + { + "epoch": 0.8361251639497845, + "grad_norm": 0.2858099341392517, + "learning_rate": 0.0001748198198198198, + "loss": 0.444, + "step": 2789 + }, + { + "epoch": 0.836424957841484, + "grad_norm": 0.26430973410606384, + "learning_rate": 0.00017477477477477476, + "loss": 0.4329, + "step": 2790 + }, + { + "epoch": 0.8367247517331834, + "grad_norm": 0.256142795085907, + "learning_rate": 0.00017472972972972973, + "loss": 0.4299, + "step": 2791 + }, + { + "epoch": 0.8370245456248829, + "grad_norm": 0.26407790184020996, + "learning_rate": 0.00017468468468468466, + "loss": 0.4296, + "step": 2792 + }, + { + "epoch": 0.8373243395165824, + "grad_norm": 0.2579902708530426, + "learning_rate": 0.00017463963963963963, + "loss": 0.4243, + "step": 2793 + }, + { + "epoch": 0.8376241334082818, + "grad_norm": 0.25428980588912964, + "learning_rate": 0.0001745945945945946, + "loss": 0.4091, + "step": 2794 + }, + { + "epoch": 0.8379239272999812, + "grad_norm": 0.2652067542076111, + "learning_rate": 0.00017454954954954953, + "loss": 0.4341, + "step": 2795 + }, + { + "epoch": 0.8382237211916808, + "grad_norm": 0.2717462182044983, + "learning_rate": 0.0001745045045045045, + "loss": 0.4426, + "step": 2796 + }, + { + "epoch": 0.8385235150833802, + "grad_norm": 0.24994242191314697, + "learning_rate": 0.00017445945945945946, + "loss": 0.3897, + "step": 2797 + }, + { + "epoch": 0.8388233089750796, + "grad_norm": 0.26126357913017273, + "learning_rate": 0.0001744144144144144, + "loss": 0.4238, + "step": 2798 + }, + { + "epoch": 0.839123102866779, + "grad_norm": 0.25549226999282837, + "learning_rate": 0.00017436936936936936, + "loss": 0.4342, + "step": 2799 + }, + { + "epoch": 0.8394228967584786, + "grad_norm": 0.26869869232177734, + "learning_rate": 0.0001743243243243243, + "loss": 0.436, + "step": 2800 + }, + { + "epoch": 0.839722690650178, + "grad_norm": 0.2841474115848541, + "learning_rate": 0.00017427927927927928, + "loss": 0.4325, + "step": 2801 + }, + { + "epoch": 0.8400224845418774, + "grad_norm": 0.25196996331214905, + "learning_rate": 0.00017423423423423422, + "loss": 0.3917, + "step": 2802 + }, + { + "epoch": 0.8403222784335769, + "grad_norm": 0.26642677187919617, + "learning_rate": 0.00017418918918918916, + "loss": 0.4429, + "step": 2803 + }, + { + "epoch": 0.8406220723252764, + "grad_norm": 0.27364763617515564, + "learning_rate": 0.00017414414414414415, + "loss": 0.4326, + "step": 2804 + }, + { + "epoch": 0.8409218662169758, + "grad_norm": 0.28387343883514404, + "learning_rate": 0.00017409909909909909, + "loss": 0.465, + "step": 2805 + }, + { + "epoch": 0.8412216601086753, + "grad_norm": 0.2638298273086548, + "learning_rate": 0.00017405405405405402, + "loss": 0.4394, + "step": 2806 + }, + { + "epoch": 0.8415214540003747, + "grad_norm": 0.23615007102489471, + "learning_rate": 0.00017400900900900901, + "loss": 0.4126, + "step": 2807 + }, + { + "epoch": 0.8418212478920742, + "grad_norm": 0.2662603259086609, + "learning_rate": 0.00017396396396396395, + "loss": 0.4601, + "step": 2808 + }, + { + "epoch": 0.8421210417837737, + "grad_norm": 0.2668648660182953, + "learning_rate": 0.0001739189189189189, + "loss": 0.4152, + "step": 2809 + }, + { + "epoch": 0.8424208356754731, + "grad_norm": 0.23364293575286865, + "learning_rate": 0.00017387387387387388, + "loss": 0.3843, + "step": 2810 + }, + { + "epoch": 0.8427206295671725, + "grad_norm": 0.2743973731994629, + "learning_rate": 0.00017382882882882881, + "loss": 0.4041, + "step": 2811 + }, + { + "epoch": 0.8430204234588721, + "grad_norm": 0.25221478939056396, + "learning_rate": 0.00017378378378378375, + "loss": 0.4082, + "step": 2812 + }, + { + "epoch": 0.8433202173505715, + "grad_norm": 0.27902311086654663, + "learning_rate": 0.00017373873873873874, + "loss": 0.4437, + "step": 2813 + }, + { + "epoch": 0.8436200112422709, + "grad_norm": 0.26741036772727966, + "learning_rate": 0.00017369369369369368, + "loss": 0.4305, + "step": 2814 + }, + { + "epoch": 0.8439198051339704, + "grad_norm": 0.2413070648908615, + "learning_rate": 0.00017364864864864862, + "loss": 0.3938, + "step": 2815 + }, + { + "epoch": 0.8442195990256699, + "grad_norm": 0.2679976224899292, + "learning_rate": 0.0001736036036036036, + "loss": 0.4505, + "step": 2816 + }, + { + "epoch": 0.8445193929173693, + "grad_norm": 0.25923097133636475, + "learning_rate": 0.00017355855855855854, + "loss": 0.4025, + "step": 2817 + }, + { + "epoch": 0.8448191868090688, + "grad_norm": 0.2735726833343506, + "learning_rate": 0.00017351351351351348, + "loss": 0.4189, + "step": 2818 + }, + { + "epoch": 0.8451189807007682, + "grad_norm": 0.24195829033851624, + "learning_rate": 0.00017346846846846847, + "loss": 0.4193, + "step": 2819 + }, + { + "epoch": 0.8454187745924677, + "grad_norm": 0.26585015654563904, + "learning_rate": 0.0001734234234234234, + "loss": 0.4529, + "step": 2820 + }, + { + "epoch": 0.8457185684841672, + "grad_norm": 0.27688056230545044, + "learning_rate": 0.00017337837837837835, + "loss": 0.4234, + "step": 2821 + }, + { + "epoch": 0.8460183623758666, + "grad_norm": 0.2597983181476593, + "learning_rate": 0.0001733333333333333, + "loss": 0.4262, + "step": 2822 + }, + { + "epoch": 0.846318156267566, + "grad_norm": 0.26929470896720886, + "learning_rate": 0.00017328828828828827, + "loss": 0.4063, + "step": 2823 + }, + { + "epoch": 0.8466179501592656, + "grad_norm": 0.2761951982975006, + "learning_rate": 0.00017324324324324324, + "loss": 0.4005, + "step": 2824 + }, + { + "epoch": 0.846917744050965, + "grad_norm": 0.24759140610694885, + "learning_rate": 0.00017319819819819817, + "loss": 0.4171, + "step": 2825 + }, + { + "epoch": 0.8472175379426644, + "grad_norm": 0.2795017659664154, + "learning_rate": 0.00017315315315315314, + "loss": 0.3981, + "step": 2826 + }, + { + "epoch": 0.8475173318343638, + "grad_norm": 0.2652096450328827, + "learning_rate": 0.0001731081081081081, + "loss": 0.4013, + "step": 2827 + }, + { + "epoch": 0.8478171257260634, + "grad_norm": 0.2893044650554657, + "learning_rate": 0.00017306306306306304, + "loss": 0.4317, + "step": 2828 + }, + { + "epoch": 0.8481169196177628, + "grad_norm": 0.28209424018859863, + "learning_rate": 0.000173018018018018, + "loss": 0.4563, + "step": 2829 + }, + { + "epoch": 0.8484167135094622, + "grad_norm": 0.2647794485092163, + "learning_rate": 0.00017297297297297297, + "loss": 0.4261, + "step": 2830 + }, + { + "epoch": 0.8487165074011617, + "grad_norm": 0.25158509612083435, + "learning_rate": 0.0001729279279279279, + "loss": 0.4109, + "step": 2831 + }, + { + "epoch": 0.8490163012928612, + "grad_norm": 0.26878440380096436, + "learning_rate": 0.00017288288288288287, + "loss": 0.4322, + "step": 2832 + }, + { + "epoch": 0.8493160951845606, + "grad_norm": 0.25768351554870605, + "learning_rate": 0.00017283783783783783, + "loss": 0.4266, + "step": 2833 + }, + { + "epoch": 0.8496158890762601, + "grad_norm": 0.26001647114753723, + "learning_rate": 0.00017279279279279277, + "loss": 0.4299, + "step": 2834 + }, + { + "epoch": 0.8499156829679595, + "grad_norm": 0.250027060508728, + "learning_rate": 0.00017274774774774773, + "loss": 0.4119, + "step": 2835 + }, + { + "epoch": 0.850215476859659, + "grad_norm": 0.23465260863304138, + "learning_rate": 0.0001727027027027027, + "loss": 0.3822, + "step": 2836 + }, + { + "epoch": 0.8505152707513585, + "grad_norm": 0.23945166170597076, + "learning_rate": 0.00017265765765765763, + "loss": 0.4063, + "step": 2837 + }, + { + "epoch": 0.8508150646430579, + "grad_norm": 0.2597178518772125, + "learning_rate": 0.00017261261261261262, + "loss": 0.4011, + "step": 2838 + }, + { + "epoch": 0.8511148585347573, + "grad_norm": 0.2437753677368164, + "learning_rate": 0.00017256756756756756, + "loss": 0.4199, + "step": 2839 + }, + { + "epoch": 0.8514146524264569, + "grad_norm": 0.26032543182373047, + "learning_rate": 0.0001725225225225225, + "loss": 0.4666, + "step": 2840 + }, + { + "epoch": 0.8517144463181563, + "grad_norm": 0.2443908005952835, + "learning_rate": 0.0001724774774774775, + "loss": 0.4109, + "step": 2841 + }, + { + "epoch": 0.8520142402098557, + "grad_norm": 0.25420573353767395, + "learning_rate": 0.00017243243243243242, + "loss": 0.4357, + "step": 2842 + }, + { + "epoch": 0.8523140341015552, + "grad_norm": 0.2527938783168793, + "learning_rate": 0.00017238738738738736, + "loss": 0.4299, + "step": 2843 + }, + { + "epoch": 0.8526138279932547, + "grad_norm": 0.24754786491394043, + "learning_rate": 0.00017234234234234235, + "loss": 0.4161, + "step": 2844 + }, + { + "epoch": 0.8529136218849541, + "grad_norm": 0.2448507398366928, + "learning_rate": 0.0001722972972972973, + "loss": 0.4238, + "step": 2845 + }, + { + "epoch": 0.8532134157766535, + "grad_norm": 0.2616099715232849, + "learning_rate": 0.00017225225225225223, + "loss": 0.4328, + "step": 2846 + }, + { + "epoch": 0.853513209668353, + "grad_norm": 0.263560950756073, + "learning_rate": 0.0001722072072072072, + "loss": 0.4204, + "step": 2847 + }, + { + "epoch": 0.8538130035600525, + "grad_norm": 0.25410616397857666, + "learning_rate": 0.00017216216216216215, + "loss": 0.4333, + "step": 2848 + }, + { + "epoch": 0.854112797451752, + "grad_norm": 0.2678585946559906, + "learning_rate": 0.0001721171171171171, + "loss": 0.4498, + "step": 2849 + }, + { + "epoch": 0.8544125913434514, + "grad_norm": 0.2663200795650482, + "learning_rate": 0.00017207207207207205, + "loss": 0.455, + "step": 2850 + }, + { + "epoch": 0.8547123852351508, + "grad_norm": 0.25283655524253845, + "learning_rate": 0.00017202702702702702, + "loss": 0.4167, + "step": 2851 + }, + { + "epoch": 0.8550121791268502, + "grad_norm": 0.25023430585861206, + "learning_rate": 0.00017198198198198195, + "loss": 0.3981, + "step": 2852 + }, + { + "epoch": 0.8553119730185498, + "grad_norm": 0.23520569503307343, + "learning_rate": 0.00017193693693693692, + "loss": 0.3895, + "step": 2853 + }, + { + "epoch": 0.8556117669102492, + "grad_norm": 0.26464787125587463, + "learning_rate": 0.00017189189189189188, + "loss": 0.4639, + "step": 2854 + }, + { + "epoch": 0.8559115608019486, + "grad_norm": 0.24785351753234863, + "learning_rate": 0.00017184684684684682, + "loss": 0.4202, + "step": 2855 + }, + { + "epoch": 0.8562113546936481, + "grad_norm": 0.2626986503601074, + "learning_rate": 0.00017180180180180178, + "loss": 0.4222, + "step": 2856 + }, + { + "epoch": 0.8565111485853476, + "grad_norm": 0.2554967403411865, + "learning_rate": 0.00017175675675675675, + "loss": 0.4182, + "step": 2857 + }, + { + "epoch": 0.856810942477047, + "grad_norm": 0.2597710192203522, + "learning_rate": 0.0001717117117117117, + "loss": 0.4296, + "step": 2858 + }, + { + "epoch": 0.8571107363687465, + "grad_norm": 0.26038360595703125, + "learning_rate": 0.00017166666666666665, + "loss": 0.4275, + "step": 2859 + }, + { + "epoch": 0.8574105302604459, + "grad_norm": 0.24397097527980804, + "learning_rate": 0.0001716216216216216, + "loss": 0.3991, + "step": 2860 + }, + { + "epoch": 0.8577103241521454, + "grad_norm": 0.2713565528392792, + "learning_rate": 0.00017157657657657657, + "loss": 0.4211, + "step": 2861 + }, + { + "epoch": 0.8580101180438449, + "grad_norm": 0.24685804545879364, + "learning_rate": 0.0001715315315315315, + "loss": 0.4008, + "step": 2862 + }, + { + "epoch": 0.8583099119355443, + "grad_norm": 0.2701510488986969, + "learning_rate": 0.00017148648648648648, + "loss": 0.422, + "step": 2863 + }, + { + "epoch": 0.8586097058272437, + "grad_norm": 0.2619568705558777, + "learning_rate": 0.00017144144144144144, + "loss": 0.4271, + "step": 2864 + }, + { + "epoch": 0.8589094997189433, + "grad_norm": 0.2503330707550049, + "learning_rate": 0.00017139639639639638, + "loss": 0.4517, + "step": 2865 + }, + { + "epoch": 0.8592092936106427, + "grad_norm": 0.2657839059829712, + "learning_rate": 0.00017135135135135134, + "loss": 0.431, + "step": 2866 + }, + { + "epoch": 0.8595090875023421, + "grad_norm": 0.24868468940258026, + "learning_rate": 0.0001713063063063063, + "loss": 0.415, + "step": 2867 + }, + { + "epoch": 0.8598088813940415, + "grad_norm": 0.2581307888031006, + "learning_rate": 0.00017126126126126124, + "loss": 0.4465, + "step": 2868 + }, + { + "epoch": 0.8601086752857411, + "grad_norm": 0.24836871027946472, + "learning_rate": 0.0001712162162162162, + "loss": 0.418, + "step": 2869 + }, + { + "epoch": 0.8604084691774405, + "grad_norm": 0.2541236877441406, + "learning_rate": 0.00017117117117117117, + "loss": 0.3917, + "step": 2870 + }, + { + "epoch": 0.86070826306914, + "grad_norm": 0.24935103952884674, + "learning_rate": 0.0001711261261261261, + "loss": 0.4002, + "step": 2871 + }, + { + "epoch": 0.8610080569608394, + "grad_norm": 0.2818658649921417, + "learning_rate": 0.00017108108108108104, + "loss": 0.441, + "step": 2872 + }, + { + "epoch": 0.8613078508525389, + "grad_norm": 0.2605019509792328, + "learning_rate": 0.00017103603603603603, + "loss": 0.4324, + "step": 2873 + }, + { + "epoch": 0.8616076447442383, + "grad_norm": 0.26078200340270996, + "learning_rate": 0.00017099099099099097, + "loss": 0.4206, + "step": 2874 + }, + { + "epoch": 0.8619074386359378, + "grad_norm": 0.26391538977622986, + "learning_rate": 0.0001709459459459459, + "loss": 0.3951, + "step": 2875 + }, + { + "epoch": 0.8622072325276372, + "grad_norm": 0.28473007678985596, + "learning_rate": 0.0001709009009009009, + "loss": 0.4655, + "step": 2876 + }, + { + "epoch": 0.8625070264193367, + "grad_norm": 0.24676088988780975, + "learning_rate": 0.00017085585585585583, + "loss": 0.4196, + "step": 2877 + }, + { + "epoch": 0.8628068203110362, + "grad_norm": 0.26333364844322205, + "learning_rate": 0.00017081081081081077, + "loss": 0.4204, + "step": 2878 + }, + { + "epoch": 0.8631066142027356, + "grad_norm": 0.24308471381664276, + "learning_rate": 0.00017076576576576576, + "loss": 0.4047, + "step": 2879 + }, + { + "epoch": 0.863406408094435, + "grad_norm": 0.24952733516693115, + "learning_rate": 0.0001707207207207207, + "loss": 0.417, + "step": 2880 + }, + { + "epoch": 0.8637062019861346, + "grad_norm": 0.26905426383018494, + "learning_rate": 0.00017067567567567566, + "loss": 0.432, + "step": 2881 + }, + { + "epoch": 0.864005995877834, + "grad_norm": 0.2668468654155731, + "learning_rate": 0.00017063063063063063, + "loss": 0.4278, + "step": 2882 + }, + { + "epoch": 0.8643057897695334, + "grad_norm": 0.2533245384693146, + "learning_rate": 0.00017058558558558556, + "loss": 0.4228, + "step": 2883 + }, + { + "epoch": 0.8646055836612329, + "grad_norm": 0.25289660692214966, + "learning_rate": 0.00017054054054054053, + "loss": 0.4136, + "step": 2884 + }, + { + "epoch": 0.8649053775529324, + "grad_norm": 0.25045090913772583, + "learning_rate": 0.0001704954954954955, + "loss": 0.425, + "step": 2885 + }, + { + "epoch": 0.8652051714446318, + "grad_norm": 0.2620987296104431, + "learning_rate": 0.00017045045045045043, + "loss": 0.4115, + "step": 2886 + }, + { + "epoch": 0.8655049653363313, + "grad_norm": 0.2613299489021301, + "learning_rate": 0.0001704054054054054, + "loss": 0.4311, + "step": 2887 + }, + { + "epoch": 0.8658047592280307, + "grad_norm": 0.25921955704689026, + "learning_rate": 0.00017036036036036036, + "loss": 0.4254, + "step": 2888 + }, + { + "epoch": 0.8661045531197302, + "grad_norm": 0.24337226152420044, + "learning_rate": 0.0001703153153153153, + "loss": 0.4282, + "step": 2889 + }, + { + "epoch": 0.8664043470114297, + "grad_norm": 0.25295788049697876, + "learning_rate": 0.00017027027027027026, + "loss": 0.3921, + "step": 2890 + }, + { + "epoch": 0.8667041409031291, + "grad_norm": 0.26155343651771545, + "learning_rate": 0.00017022522522522522, + "loss": 0.4379, + "step": 2891 + }, + { + "epoch": 0.8670039347948285, + "grad_norm": 0.25492122769355774, + "learning_rate": 0.00017018018018018016, + "loss": 0.4225, + "step": 2892 + }, + { + "epoch": 0.867303728686528, + "grad_norm": 0.27565327286720276, + "learning_rate": 0.00017013513513513512, + "loss": 0.4254, + "step": 2893 + }, + { + "epoch": 0.8676035225782275, + "grad_norm": 0.24523693323135376, + "learning_rate": 0.00017009009009009006, + "loss": 0.4103, + "step": 2894 + }, + { + "epoch": 0.8679033164699269, + "grad_norm": 0.29129156470298767, + "learning_rate": 0.00017004504504504505, + "loss": 0.4366, + "step": 2895 + }, + { + "epoch": 0.8682031103616263, + "grad_norm": 0.2691640555858612, + "learning_rate": 0.00016999999999999999, + "loss": 0.4152, + "step": 2896 + }, + { + "epoch": 0.8685029042533259, + "grad_norm": 0.2639869153499603, + "learning_rate": 0.00016995495495495492, + "loss": 0.3987, + "step": 2897 + }, + { + "epoch": 0.8688026981450253, + "grad_norm": 0.254861056804657, + "learning_rate": 0.0001699099099099099, + "loss": 0.387, + "step": 2898 + }, + { + "epoch": 0.8691024920367247, + "grad_norm": 0.2625315189361572, + "learning_rate": 0.00016986486486486485, + "loss": 0.4306, + "step": 2899 + }, + { + "epoch": 0.8694022859284242, + "grad_norm": 0.26464176177978516, + "learning_rate": 0.00016981981981981979, + "loss": 0.4035, + "step": 2900 + }, + { + "epoch": 0.8697020798201237, + "grad_norm": 0.2657167613506317, + "learning_rate": 0.00016977477477477478, + "loss": 0.4321, + "step": 2901 + }, + { + "epoch": 0.8700018737118231, + "grad_norm": 0.268087238073349, + "learning_rate": 0.00016972972972972971, + "loss": 0.4187, + "step": 2902 + }, + { + "epoch": 0.8703016676035226, + "grad_norm": 0.2585284411907196, + "learning_rate": 0.00016968468468468465, + "loss": 0.4213, + "step": 2903 + }, + { + "epoch": 0.870601461495222, + "grad_norm": 0.26495012640953064, + "learning_rate": 0.00016963963963963964, + "loss": 0.42, + "step": 2904 + }, + { + "epoch": 0.8709012553869215, + "grad_norm": 0.27982714772224426, + "learning_rate": 0.00016959459459459458, + "loss": 0.402, + "step": 2905 + }, + { + "epoch": 0.871201049278621, + "grad_norm": 0.2580246925354004, + "learning_rate": 0.00016954954954954952, + "loss": 0.4028, + "step": 2906 + }, + { + "epoch": 0.8715008431703204, + "grad_norm": 0.254881888628006, + "learning_rate": 0.0001695045045045045, + "loss": 0.4186, + "step": 2907 + }, + { + "epoch": 0.8718006370620198, + "grad_norm": 0.26736170053482056, + "learning_rate": 0.00016945945945945944, + "loss": 0.4322, + "step": 2908 + }, + { + "epoch": 0.8721004309537194, + "grad_norm": 0.27539244294166565, + "learning_rate": 0.00016941441441441438, + "loss": 0.4362, + "step": 2909 + }, + { + "epoch": 0.8724002248454188, + "grad_norm": 0.2490614801645279, + "learning_rate": 0.00016936936936936937, + "loss": 0.4193, + "step": 2910 + }, + { + "epoch": 0.8727000187371182, + "grad_norm": 0.25294286012649536, + "learning_rate": 0.0001693243243243243, + "loss": 0.4202, + "step": 2911 + }, + { + "epoch": 0.8729998126288177, + "grad_norm": 0.2698332965373993, + "learning_rate": 0.00016927927927927924, + "loss": 0.4393, + "step": 2912 + }, + { + "epoch": 0.8732996065205172, + "grad_norm": 0.2698095738887787, + "learning_rate": 0.00016923423423423424, + "loss": 0.4265, + "step": 2913 + }, + { + "epoch": 0.8735994004122166, + "grad_norm": 0.25060606002807617, + "learning_rate": 0.00016918918918918917, + "loss": 0.4201, + "step": 2914 + }, + { + "epoch": 0.873899194303916, + "grad_norm": 0.2536526918411255, + "learning_rate": 0.00016914414414414414, + "loss": 0.4038, + "step": 2915 + }, + { + "epoch": 0.8741989881956155, + "grad_norm": 0.25597628951072693, + "learning_rate": 0.0001690990990990991, + "loss": 0.422, + "step": 2916 + }, + { + "epoch": 0.874498782087315, + "grad_norm": 0.25855180621147156, + "learning_rate": 0.00016905405405405404, + "loss": 0.4017, + "step": 2917 + }, + { + "epoch": 0.8747985759790144, + "grad_norm": 0.23959973454475403, + "learning_rate": 0.000169009009009009, + "loss": 0.3882, + "step": 2918 + }, + { + "epoch": 0.8750983698707139, + "grad_norm": 0.24938194453716278, + "learning_rate": 0.00016896396396396394, + "loss": 0.435, + "step": 2919 + }, + { + "epoch": 0.8753981637624133, + "grad_norm": 0.268046110868454, + "learning_rate": 0.0001689189189189189, + "loss": 0.4323, + "step": 2920 + }, + { + "epoch": 0.8756979576541128, + "grad_norm": 0.28519049286842346, + "learning_rate": 0.00016887387387387387, + "loss": 0.446, + "step": 2921 + }, + { + "epoch": 0.8759977515458123, + "grad_norm": 0.2485249787569046, + "learning_rate": 0.0001688288288288288, + "loss": 0.4328, + "step": 2922 + }, + { + "epoch": 0.8762975454375117, + "grad_norm": 0.28231462836265564, + "learning_rate": 0.00016878378378378377, + "loss": 0.4201, + "step": 2923 + }, + { + "epoch": 0.8765973393292111, + "grad_norm": 0.2693428695201874, + "learning_rate": 0.00016873873873873873, + "loss": 0.451, + "step": 2924 + }, + { + "epoch": 0.8768971332209107, + "grad_norm": 0.2595715820789337, + "learning_rate": 0.00016869369369369367, + "loss": 0.4319, + "step": 2925 + }, + { + "epoch": 0.8771969271126101, + "grad_norm": 0.2597898840904236, + "learning_rate": 0.00016864864864864863, + "loss": 0.4056, + "step": 2926 + }, + { + "epoch": 0.8774967210043095, + "grad_norm": 0.2534911632537842, + "learning_rate": 0.0001686036036036036, + "loss": 0.4325, + "step": 2927 + }, + { + "epoch": 0.877796514896009, + "grad_norm": 0.2742319107055664, + "learning_rate": 0.00016855855855855853, + "loss": 0.456, + "step": 2928 + }, + { + "epoch": 0.8780963087877085, + "grad_norm": 0.25808632373809814, + "learning_rate": 0.00016851351351351352, + "loss": 0.4419, + "step": 2929 + }, + { + "epoch": 0.8783961026794079, + "grad_norm": 0.2860279083251953, + "learning_rate": 0.00016846846846846846, + "loss": 0.4534, + "step": 2930 + }, + { + "epoch": 0.8786958965711074, + "grad_norm": 0.26722362637519836, + "learning_rate": 0.0001684234234234234, + "loss": 0.445, + "step": 2931 + }, + { + "epoch": 0.8789956904628068, + "grad_norm": 0.2761351764202118, + "learning_rate": 0.00016837837837837839, + "loss": 0.4175, + "step": 2932 + }, + { + "epoch": 0.8792954843545063, + "grad_norm": 0.2346809059381485, + "learning_rate": 0.00016833333333333332, + "loss": 0.4107, + "step": 2933 + }, + { + "epoch": 0.8795952782462058, + "grad_norm": 0.24212133884429932, + "learning_rate": 0.00016828828828828826, + "loss": 0.393, + "step": 2934 + }, + { + "epoch": 0.8798950721379052, + "grad_norm": 0.252184122800827, + "learning_rate": 0.00016824324324324325, + "loss": 0.4069, + "step": 2935 + }, + { + "epoch": 0.8801948660296046, + "grad_norm": 0.26833096146583557, + "learning_rate": 0.0001681981981981982, + "loss": 0.4455, + "step": 2936 + }, + { + "epoch": 0.8804946599213042, + "grad_norm": 0.2804100811481476, + "learning_rate": 0.00016815315315315312, + "loss": 0.4348, + "step": 2937 + }, + { + "epoch": 0.8807944538130036, + "grad_norm": 0.2611362338066101, + "learning_rate": 0.00016810810810810812, + "loss": 0.4407, + "step": 2938 + }, + { + "epoch": 0.881094247704703, + "grad_norm": 0.2497948855161667, + "learning_rate": 0.00016806306306306305, + "loss": 0.437, + "step": 2939 + }, + { + "epoch": 0.8813940415964024, + "grad_norm": 0.2518768906593323, + "learning_rate": 0.000168018018018018, + "loss": 0.4279, + "step": 2940 + }, + { + "epoch": 0.881693835488102, + "grad_norm": 0.2658750116825104, + "learning_rate": 0.00016797297297297298, + "loss": 0.4444, + "step": 2941 + }, + { + "epoch": 0.8819936293798014, + "grad_norm": 0.25899848341941833, + "learning_rate": 0.00016792792792792792, + "loss": 0.4145, + "step": 2942 + }, + { + "epoch": 0.8822934232715008, + "grad_norm": 0.2500343322753906, + "learning_rate": 0.00016788288288288285, + "loss": 0.4142, + "step": 2943 + }, + { + "epoch": 0.8825932171632003, + "grad_norm": 0.2614184319972992, + "learning_rate": 0.00016783783783783782, + "loss": 0.4125, + "step": 2944 + }, + { + "epoch": 0.8828930110548997, + "grad_norm": 0.2568380832672119, + "learning_rate": 0.00016779279279279278, + "loss": 0.4184, + "step": 2945 + }, + { + "epoch": 0.8831928049465992, + "grad_norm": 0.2491275817155838, + "learning_rate": 0.00016774774774774772, + "loss": 0.4386, + "step": 2946 + }, + { + "epoch": 0.8834925988382987, + "grad_norm": 0.2676237225532532, + "learning_rate": 0.00016770270270270268, + "loss": 0.4395, + "step": 2947 + }, + { + "epoch": 0.8837923927299981, + "grad_norm": 0.26965218782424927, + "learning_rate": 0.00016765765765765765, + "loss": 0.4235, + "step": 2948 + }, + { + "epoch": 0.8840921866216975, + "grad_norm": 0.26237690448760986, + "learning_rate": 0.00016761261261261258, + "loss": 0.4246, + "step": 2949 + }, + { + "epoch": 0.8843919805133971, + "grad_norm": 0.2397497445344925, + "learning_rate": 0.00016756756756756755, + "loss": 0.4092, + "step": 2950 + }, + { + "epoch": 0.8846917744050965, + "grad_norm": 0.25894200801849365, + "learning_rate": 0.0001675225225225225, + "loss": 0.4355, + "step": 2951 + }, + { + "epoch": 0.8849915682967959, + "grad_norm": 0.24452635645866394, + "learning_rate": 0.00016747747747747747, + "loss": 0.4062, + "step": 2952 + }, + { + "epoch": 0.8852913621884954, + "grad_norm": 0.25013741850852966, + "learning_rate": 0.0001674324324324324, + "loss": 0.4195, + "step": 2953 + }, + { + "epoch": 0.8855911560801949, + "grad_norm": 0.2652290463447571, + "learning_rate": 0.00016738738738738737, + "loss": 0.4442, + "step": 2954 + }, + { + "epoch": 0.8858909499718943, + "grad_norm": 0.25907742977142334, + "learning_rate": 0.00016734234234234234, + "loss": 0.4353, + "step": 2955 + }, + { + "epoch": 0.8861907438635938, + "grad_norm": 0.26591479778289795, + "learning_rate": 0.00016729729729729728, + "loss": 0.4427, + "step": 2956 + }, + { + "epoch": 0.8864905377552932, + "grad_norm": 0.25649645924568176, + "learning_rate": 0.00016725225225225224, + "loss": 0.4194, + "step": 2957 + }, + { + "epoch": 0.8867903316469927, + "grad_norm": 0.23913215100765228, + "learning_rate": 0.0001672072072072072, + "loss": 0.4114, + "step": 2958 + }, + { + "epoch": 0.8870901255386922, + "grad_norm": 0.2712022364139557, + "learning_rate": 0.00016716216216216214, + "loss": 0.4358, + "step": 2959 + }, + { + "epoch": 0.8873899194303916, + "grad_norm": 0.2589244544506073, + "learning_rate": 0.0001671171171171171, + "loss": 0.4325, + "step": 2960 + }, + { + "epoch": 0.887689713322091, + "grad_norm": 0.2385113388299942, + "learning_rate": 0.00016707207207207207, + "loss": 0.4057, + "step": 2961 + }, + { + "epoch": 0.8879895072137906, + "grad_norm": 0.2598143219947815, + "learning_rate": 0.000167027027027027, + "loss": 0.436, + "step": 2962 + }, + { + "epoch": 0.88828930110549, + "grad_norm": 0.24032726883888245, + "learning_rate": 0.000166981981981982, + "loss": 0.3991, + "step": 2963 + }, + { + "epoch": 0.8885890949971894, + "grad_norm": 0.25453847646713257, + "learning_rate": 0.00016693693693693693, + "loss": 0.3913, + "step": 2964 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.24269893765449524, + "learning_rate": 0.00016689189189189187, + "loss": 0.4134, + "step": 2965 + }, + { + "epoch": 0.8891886827805884, + "grad_norm": 0.25419285893440247, + "learning_rate": 0.0001668468468468468, + "loss": 0.4218, + "step": 2966 + }, + { + "epoch": 0.8894884766722878, + "grad_norm": 0.2587622106075287, + "learning_rate": 0.0001668018018018018, + "loss": 0.4269, + "step": 2967 + }, + { + "epoch": 0.8897882705639872, + "grad_norm": 0.25632867217063904, + "learning_rate": 0.00016675675675675673, + "loss": 0.4379, + "step": 2968 + }, + { + "epoch": 0.8900880644556867, + "grad_norm": 0.2640867531299591, + "learning_rate": 0.00016671171171171167, + "loss": 0.4345, + "step": 2969 + }, + { + "epoch": 0.8903878583473862, + "grad_norm": 0.24803832173347473, + "learning_rate": 0.00016666666666666666, + "loss": 0.4126, + "step": 2970 + }, + { + "epoch": 0.8906876522390856, + "grad_norm": 0.24913029372692108, + "learning_rate": 0.0001666216216216216, + "loss": 0.3953, + "step": 2971 + }, + { + "epoch": 0.8909874461307851, + "grad_norm": 0.2572920620441437, + "learning_rate": 0.00016657657657657653, + "loss": 0.4184, + "step": 2972 + }, + { + "epoch": 0.8912872400224845, + "grad_norm": 0.26253217458724976, + "learning_rate": 0.00016653153153153153, + "loss": 0.4509, + "step": 2973 + }, + { + "epoch": 0.891587033914184, + "grad_norm": 0.24650625884532928, + "learning_rate": 0.00016648648648648646, + "loss": 0.4371, + "step": 2974 + }, + { + "epoch": 0.8918868278058835, + "grad_norm": 0.26514360308647156, + "learning_rate": 0.00016644144144144143, + "loss": 0.4547, + "step": 2975 + }, + { + "epoch": 0.8921866216975829, + "grad_norm": 0.2522089183330536, + "learning_rate": 0.0001663963963963964, + "loss": 0.4141, + "step": 2976 + }, + { + "epoch": 0.8924864155892823, + "grad_norm": 0.2426896095275879, + "learning_rate": 0.00016635135135135133, + "loss": 0.4124, + "step": 2977 + }, + { + "epoch": 0.8927862094809819, + "grad_norm": 0.24892792105674744, + "learning_rate": 0.0001663063063063063, + "loss": 0.3971, + "step": 2978 + }, + { + "epoch": 0.8930860033726813, + "grad_norm": 0.25551697611808777, + "learning_rate": 0.00016626126126126125, + "loss": 0.4191, + "step": 2979 + }, + { + "epoch": 0.8933857972643807, + "grad_norm": 0.2599818706512451, + "learning_rate": 0.0001662162162162162, + "loss": 0.4616, + "step": 2980 + }, + { + "epoch": 0.8936855911560802, + "grad_norm": 0.24369800090789795, + "learning_rate": 0.00016617117117117116, + "loss": 0.4092, + "step": 2981 + }, + { + "epoch": 0.8939853850477797, + "grad_norm": 0.24979013204574585, + "learning_rate": 0.00016612612612612612, + "loss": 0.4184, + "step": 2982 + }, + { + "epoch": 0.8942851789394791, + "grad_norm": 0.24617372453212738, + "learning_rate": 0.00016608108108108106, + "loss": 0.4022, + "step": 2983 + }, + { + "epoch": 0.8945849728311785, + "grad_norm": 0.25377964973449707, + "learning_rate": 0.00016603603603603602, + "loss": 0.4246, + "step": 2984 + }, + { + "epoch": 0.894884766722878, + "grad_norm": 0.25180289149284363, + "learning_rate": 0.00016599099099099098, + "loss": 0.4262, + "step": 2985 + }, + { + "epoch": 0.8951845606145775, + "grad_norm": 0.26268377900123596, + "learning_rate": 0.00016594594594594595, + "loss": 0.4083, + "step": 2986 + }, + { + "epoch": 0.895484354506277, + "grad_norm": 0.2539224624633789, + "learning_rate": 0.00016590090090090088, + "loss": 0.4328, + "step": 2987 + }, + { + "epoch": 0.8957841483979764, + "grad_norm": 0.2591584324836731, + "learning_rate": 0.00016585585585585585, + "loss": 0.4312, + "step": 2988 + }, + { + "epoch": 0.8960839422896758, + "grad_norm": 0.246408611536026, + "learning_rate": 0.0001658108108108108, + "loss": 0.3991, + "step": 2989 + }, + { + "epoch": 0.8963837361813753, + "grad_norm": 0.25761210918426514, + "learning_rate": 0.00016576576576576575, + "loss": 0.4225, + "step": 2990 + }, + { + "epoch": 0.8966835300730748, + "grad_norm": 0.2380562722682953, + "learning_rate": 0.00016572072072072069, + "loss": 0.4174, + "step": 2991 + }, + { + "epoch": 0.8969833239647742, + "grad_norm": 0.2515964210033417, + "learning_rate": 0.00016567567567567568, + "loss": 0.4347, + "step": 2992 + }, + { + "epoch": 0.8972831178564736, + "grad_norm": 0.24080772697925568, + "learning_rate": 0.0001656306306306306, + "loss": 0.4331, + "step": 2993 + }, + { + "epoch": 0.8975829117481732, + "grad_norm": 0.25413286685943604, + "learning_rate": 0.00016558558558558555, + "loss": 0.4101, + "step": 2994 + }, + { + "epoch": 0.8978827056398726, + "grad_norm": 0.2696084678173065, + "learning_rate": 0.00016554054054054054, + "loss": 0.4446, + "step": 2995 + }, + { + "epoch": 0.898182499531572, + "grad_norm": 0.24439360201358795, + "learning_rate": 0.00016549549549549548, + "loss": 0.4136, + "step": 2996 + }, + { + "epoch": 0.8984822934232715, + "grad_norm": 0.25744253396987915, + "learning_rate": 0.00016545045045045041, + "loss": 0.4321, + "step": 2997 + }, + { + "epoch": 0.898782087314971, + "grad_norm": 0.26146844029426575, + "learning_rate": 0.0001654054054054054, + "loss": 0.4214, + "step": 2998 + }, + { + "epoch": 0.8990818812066704, + "grad_norm": 0.24922147393226624, + "learning_rate": 0.00016536036036036034, + "loss": 0.3922, + "step": 2999 + }, + { + "epoch": 0.8993816750983699, + "grad_norm": 0.24045661091804504, + "learning_rate": 0.00016531531531531528, + "loss": 0.423, + "step": 3000 + }, + { + "epoch": 0.8993816750983699, + "eval_loss": 0.4257502257823944, + "eval_runtime": 565.3599, + "eval_samples_per_second": 3.819, + "eval_steps_per_second": 0.478, + "step": 3000 + }, + { + "epoch": 0.8996814689900693, + "grad_norm": 0.2597513198852539, + "learning_rate": 0.00016527027027027027, + "loss": 0.4506, + "step": 3001 + }, + { + "epoch": 0.8999812628817688, + "grad_norm": 0.24458399415016174, + "learning_rate": 0.0001652252252252252, + "loss": 0.419, + "step": 3002 + }, + { + "epoch": 0.9002810567734683, + "grad_norm": 0.24708233773708344, + "learning_rate": 0.00016518018018018014, + "loss": 0.4016, + "step": 3003 + }, + { + "epoch": 0.9005808506651677, + "grad_norm": 0.25798776745796204, + "learning_rate": 0.00016513513513513513, + "loss": 0.4394, + "step": 3004 + }, + { + "epoch": 0.9008806445568671, + "grad_norm": 0.27492186427116394, + "learning_rate": 0.00016509009009009007, + "loss": 0.4262, + "step": 3005 + }, + { + "epoch": 0.9011804384485667, + "grad_norm": 0.2706008553504944, + "learning_rate": 0.000165045045045045, + "loss": 0.4416, + "step": 3006 + }, + { + "epoch": 0.9014802323402661, + "grad_norm": 0.269711971282959, + "learning_rate": 0.000165, + "loss": 0.4568, + "step": 3007 + }, + { + "epoch": 0.9017800262319655, + "grad_norm": 0.2955131530761719, + "learning_rate": 0.00016495495495495494, + "loss": 0.4043, + "step": 3008 + }, + { + "epoch": 0.9020798201236649, + "grad_norm": 0.2553322911262512, + "learning_rate": 0.0001649099099099099, + "loss": 0.4135, + "step": 3009 + }, + { + "epoch": 0.9023796140153645, + "grad_norm": 0.2554074227809906, + "learning_rate": 0.00016486486486486486, + "loss": 0.4277, + "step": 3010 + }, + { + "epoch": 0.9026794079070639, + "grad_norm": 0.2844032645225525, + "learning_rate": 0.0001648198198198198, + "loss": 0.4327, + "step": 3011 + }, + { + "epoch": 0.9029792017987633, + "grad_norm": 0.28705543279647827, + "learning_rate": 0.00016477477477477476, + "loss": 0.426, + "step": 3012 + }, + { + "epoch": 0.9032789956904628, + "grad_norm": 0.2566993832588196, + "learning_rate": 0.00016472972972972973, + "loss": 0.4094, + "step": 3013 + }, + { + "epoch": 0.9035787895821623, + "grad_norm": 0.24438917636871338, + "learning_rate": 0.00016468468468468466, + "loss": 0.4009, + "step": 3014 + }, + { + "epoch": 0.9038785834738617, + "grad_norm": 0.2831672430038452, + "learning_rate": 0.00016463963963963963, + "loss": 0.4533, + "step": 3015 + }, + { + "epoch": 0.9041783773655612, + "grad_norm": 0.24414128065109253, + "learning_rate": 0.00016459459459459457, + "loss": 0.4161, + "step": 3016 + }, + { + "epoch": 0.9044781712572606, + "grad_norm": 0.27103421092033386, + "learning_rate": 0.00016454954954954953, + "loss": 0.396, + "step": 3017 + }, + { + "epoch": 0.9047779651489601, + "grad_norm": 0.2692296504974365, + "learning_rate": 0.0001645045045045045, + "loss": 0.4136, + "step": 3018 + }, + { + "epoch": 0.9050777590406596, + "grad_norm": 0.267905056476593, + "learning_rate": 0.00016445945945945943, + "loss": 0.4126, + "step": 3019 + }, + { + "epoch": 0.905377552932359, + "grad_norm": 0.286996990442276, + "learning_rate": 0.00016441441441441442, + "loss": 0.4518, + "step": 3020 + }, + { + "epoch": 0.9056773468240584, + "grad_norm": 0.2529492676258087, + "learning_rate": 0.00016436936936936936, + "loss": 0.422, + "step": 3021 + }, + { + "epoch": 0.905977140715758, + "grad_norm": 0.23985780775547028, + "learning_rate": 0.0001643243243243243, + "loss": 0.3948, + "step": 3022 + }, + { + "epoch": 0.9062769346074574, + "grad_norm": 0.2592794895172119, + "learning_rate": 0.00016427927927927929, + "loss": 0.426, + "step": 3023 + }, + { + "epoch": 0.9065767284991568, + "grad_norm": 0.2730545401573181, + "learning_rate": 0.00016423423423423422, + "loss": 0.4524, + "step": 3024 + }, + { + "epoch": 0.9068765223908563, + "grad_norm": 0.2558513581752777, + "learning_rate": 0.00016418918918918916, + "loss": 0.4151, + "step": 3025 + }, + { + "epoch": 0.9071763162825558, + "grad_norm": 0.26334816217422485, + "learning_rate": 0.00016414414414414415, + "loss": 0.4069, + "step": 3026 + }, + { + "epoch": 0.9074761101742552, + "grad_norm": 0.25048863887786865, + "learning_rate": 0.0001640990990990991, + "loss": 0.4251, + "step": 3027 + }, + { + "epoch": 0.9077759040659547, + "grad_norm": 0.24627207219600677, + "learning_rate": 0.00016405405405405402, + "loss": 0.4181, + "step": 3028 + }, + { + "epoch": 0.9080756979576541, + "grad_norm": 0.2565348744392395, + "learning_rate": 0.00016400900900900901, + "loss": 0.4088, + "step": 3029 + }, + { + "epoch": 0.9083754918493536, + "grad_norm": 0.23633405566215515, + "learning_rate": 0.00016396396396396395, + "loss": 0.3953, + "step": 3030 + }, + { + "epoch": 0.908675285741053, + "grad_norm": 0.25836828351020813, + "learning_rate": 0.0001639189189189189, + "loss": 0.4177, + "step": 3031 + }, + { + "epoch": 0.9089750796327525, + "grad_norm": 0.2472517192363739, + "learning_rate": 0.00016387387387387388, + "loss": 0.401, + "step": 3032 + }, + { + "epoch": 0.9092748735244519, + "grad_norm": 0.2629941403865814, + "learning_rate": 0.00016382882882882882, + "loss": 0.4367, + "step": 3033 + }, + { + "epoch": 0.9095746674161515, + "grad_norm": 0.2588827908039093, + "learning_rate": 0.00016378378378378375, + "loss": 0.4569, + "step": 3034 + }, + { + "epoch": 0.9098744613078509, + "grad_norm": 0.2503022253513336, + "learning_rate": 0.00016373873873873874, + "loss": 0.4193, + "step": 3035 + }, + { + "epoch": 0.9101742551995503, + "grad_norm": 0.2534733712673187, + "learning_rate": 0.00016369369369369368, + "loss": 0.4277, + "step": 3036 + }, + { + "epoch": 0.9104740490912497, + "grad_norm": 0.2703090310096741, + "learning_rate": 0.00016364864864864862, + "loss": 0.4363, + "step": 3037 + }, + { + "epoch": 0.9107738429829493, + "grad_norm": 0.25499293208122253, + "learning_rate": 0.0001636036036036036, + "loss": 0.4213, + "step": 3038 + }, + { + "epoch": 0.9110736368746487, + "grad_norm": 0.25429412722587585, + "learning_rate": 0.00016355855855855854, + "loss": 0.4242, + "step": 3039 + }, + { + "epoch": 0.9113734307663481, + "grad_norm": 0.2607458829879761, + "learning_rate": 0.00016351351351351348, + "loss": 0.4148, + "step": 3040 + }, + { + "epoch": 0.9116732246580476, + "grad_norm": 0.2532136142253876, + "learning_rate": 0.00016346846846846845, + "loss": 0.4274, + "step": 3041 + }, + { + "epoch": 0.911973018549747, + "grad_norm": 0.25032955408096313, + "learning_rate": 0.0001634234234234234, + "loss": 0.4325, + "step": 3042 + }, + { + "epoch": 0.9122728124414465, + "grad_norm": 0.23373675346374512, + "learning_rate": 0.00016337837837837837, + "loss": 0.4207, + "step": 3043 + }, + { + "epoch": 0.912572606333146, + "grad_norm": 0.2524081766605377, + "learning_rate": 0.0001633333333333333, + "loss": 0.4308, + "step": 3044 + }, + { + "epoch": 0.9128724002248454, + "grad_norm": 0.2704564034938812, + "learning_rate": 0.00016328828828828827, + "loss": 0.4296, + "step": 3045 + }, + { + "epoch": 0.9131721941165448, + "grad_norm": 0.241916686296463, + "learning_rate": 0.00016324324324324324, + "loss": 0.4113, + "step": 3046 + }, + { + "epoch": 0.9134719880082444, + "grad_norm": 0.29074499011039734, + "learning_rate": 0.00016319819819819817, + "loss": 0.4772, + "step": 3047 + }, + { + "epoch": 0.9137717818999438, + "grad_norm": 0.2655307948589325, + "learning_rate": 0.00016315315315315314, + "loss": 0.4378, + "step": 3048 + }, + { + "epoch": 0.9140715757916432, + "grad_norm": 0.23870833218097687, + "learning_rate": 0.0001631081081081081, + "loss": 0.3933, + "step": 3049 + }, + { + "epoch": 0.9143713696833426, + "grad_norm": 0.28048643469810486, + "learning_rate": 0.00016306306306306304, + "loss": 0.4366, + "step": 3050 + }, + { + "epoch": 0.9146711635750422, + "grad_norm": 0.2783832550048828, + "learning_rate": 0.000163018018018018, + "loss": 0.4435, + "step": 3051 + }, + { + "epoch": 0.9149709574667416, + "grad_norm": 0.25969600677490234, + "learning_rate": 0.00016297297297297297, + "loss": 0.4226, + "step": 3052 + }, + { + "epoch": 0.915270751358441, + "grad_norm": 0.2698688209056854, + "learning_rate": 0.0001629279279279279, + "loss": 0.4153, + "step": 3053 + }, + { + "epoch": 0.9155705452501405, + "grad_norm": 0.26543521881103516, + "learning_rate": 0.0001628828828828829, + "loss": 0.4439, + "step": 3054 + }, + { + "epoch": 0.91587033914184, + "grad_norm": 0.2702194154262543, + "learning_rate": 0.00016283783783783783, + "loss": 0.4416, + "step": 3055 + }, + { + "epoch": 0.9161701330335394, + "grad_norm": 0.2544569671154022, + "learning_rate": 0.00016279279279279277, + "loss": 0.4245, + "step": 3056 + }, + { + "epoch": 0.9164699269252389, + "grad_norm": 0.2640305161476135, + "learning_rate": 0.00016274774774774776, + "loss": 0.4342, + "step": 3057 + }, + { + "epoch": 0.9167697208169383, + "grad_norm": 0.26513299345970154, + "learning_rate": 0.0001627027027027027, + "loss": 0.448, + "step": 3058 + }, + { + "epoch": 0.9170695147086378, + "grad_norm": 0.25756070017814636, + "learning_rate": 0.00016265765765765763, + "loss": 0.4196, + "step": 3059 + }, + { + "epoch": 0.9173693086003373, + "grad_norm": 0.26089876890182495, + "learning_rate": 0.00016261261261261262, + "loss": 0.359, + "step": 3060 + }, + { + "epoch": 0.9176691024920367, + "grad_norm": 0.28472572565078735, + "learning_rate": 0.00016256756756756756, + "loss": 0.454, + "step": 3061 + }, + { + "epoch": 0.9179688963837361, + "grad_norm": 0.26528477668762207, + "learning_rate": 0.0001625225225225225, + "loss": 0.4422, + "step": 3062 + }, + { + "epoch": 0.9182686902754357, + "grad_norm": 0.26731163263320923, + "learning_rate": 0.00016247747747747743, + "loss": 0.4211, + "step": 3063 + }, + { + "epoch": 0.9185684841671351, + "grad_norm": 0.26028740406036377, + "learning_rate": 0.00016243243243243242, + "loss": 0.4338, + "step": 3064 + }, + { + "epoch": 0.9188682780588345, + "grad_norm": 0.25657540559768677, + "learning_rate": 0.00016238738738738736, + "loss": 0.4262, + "step": 3065 + }, + { + "epoch": 0.919168071950534, + "grad_norm": 0.2732202410697937, + "learning_rate": 0.00016234234234234233, + "loss": 0.441, + "step": 3066 + }, + { + "epoch": 0.9194678658422335, + "grad_norm": 0.25451597571372986, + "learning_rate": 0.0001622972972972973, + "loss": 0.4412, + "step": 3067 + }, + { + "epoch": 0.9197676597339329, + "grad_norm": 0.2619662582874298, + "learning_rate": 0.00016225225225225223, + "loss": 0.4642, + "step": 3068 + }, + { + "epoch": 0.9200674536256324, + "grad_norm": 0.25545254349708557, + "learning_rate": 0.0001622072072072072, + "loss": 0.4091, + "step": 3069 + }, + { + "epoch": 0.9203672475173318, + "grad_norm": 0.25901320576667786, + "learning_rate": 0.00016216216216216215, + "loss": 0.3968, + "step": 3070 + }, + { + "epoch": 0.9206670414090313, + "grad_norm": 0.26680001616477966, + "learning_rate": 0.0001621171171171171, + "loss": 0.4391, + "step": 3071 + }, + { + "epoch": 0.9209668353007308, + "grad_norm": 0.2471819370985031, + "learning_rate": 0.00016207207207207205, + "loss": 0.4167, + "step": 3072 + }, + { + "epoch": 0.9212666291924302, + "grad_norm": 0.25497692823410034, + "learning_rate": 0.00016202702702702702, + "loss": 0.4061, + "step": 3073 + }, + { + "epoch": 0.9215664230841296, + "grad_norm": 0.25427013635635376, + "learning_rate": 0.00016198198198198196, + "loss": 0.4223, + "step": 3074 + }, + { + "epoch": 0.9218662169758292, + "grad_norm": 0.24736852943897247, + "learning_rate": 0.00016193693693693692, + "loss": 0.415, + "step": 3075 + }, + { + "epoch": 0.9221660108675286, + "grad_norm": 0.26432523131370544, + "learning_rate": 0.00016189189189189188, + "loss": 0.4289, + "step": 3076 + }, + { + "epoch": 0.922465804759228, + "grad_norm": 0.2562811076641083, + "learning_rate": 0.00016184684684684685, + "loss": 0.4291, + "step": 3077 + }, + { + "epoch": 0.9227655986509274, + "grad_norm": 0.25394096970558167, + "learning_rate": 0.00016180180180180178, + "loss": 0.4386, + "step": 3078 + }, + { + "epoch": 0.923065392542627, + "grad_norm": 0.23978635668754578, + "learning_rate": 0.00016175675675675675, + "loss": 0.4207, + "step": 3079 + }, + { + "epoch": 0.9233651864343264, + "grad_norm": 0.26115700602531433, + "learning_rate": 0.0001617117117117117, + "loss": 0.4368, + "step": 3080 + }, + { + "epoch": 0.9236649803260258, + "grad_norm": 0.2565377652645111, + "learning_rate": 0.00016166666666666665, + "loss": 0.4203, + "step": 3081 + }, + { + "epoch": 0.9239647742177253, + "grad_norm": 0.24304890632629395, + "learning_rate": 0.0001616216216216216, + "loss": 0.4194, + "step": 3082 + }, + { + "epoch": 0.9242645681094248, + "grad_norm": 0.25062617659568787, + "learning_rate": 0.00016157657657657658, + "loss": 0.4465, + "step": 3083 + }, + { + "epoch": 0.9245643620011242, + "grad_norm": 0.24126526713371277, + "learning_rate": 0.0001615315315315315, + "loss": 0.4245, + "step": 3084 + }, + { + "epoch": 0.9248641558928237, + "grad_norm": 0.2625589072704315, + "learning_rate": 0.00016148648648648648, + "loss": 0.4211, + "step": 3085 + }, + { + "epoch": 0.9251639497845231, + "grad_norm": 0.25375208258628845, + "learning_rate": 0.00016144144144144144, + "loss": 0.3876, + "step": 3086 + }, + { + "epoch": 0.9254637436762226, + "grad_norm": 0.25839680433273315, + "learning_rate": 0.00016139639639639638, + "loss": 0.4405, + "step": 3087 + }, + { + "epoch": 0.9257635375679221, + "grad_norm": 0.248674213886261, + "learning_rate": 0.00016135135135135131, + "loss": 0.4173, + "step": 3088 + }, + { + "epoch": 0.9260633314596215, + "grad_norm": 0.2578020989894867, + "learning_rate": 0.0001613063063063063, + "loss": 0.4059, + "step": 3089 + }, + { + "epoch": 0.9263631253513209, + "grad_norm": 0.24993066489696503, + "learning_rate": 0.00016126126126126124, + "loss": 0.4385, + "step": 3090 + }, + { + "epoch": 0.9266629192430205, + "grad_norm": 0.24710343778133392, + "learning_rate": 0.00016121621621621618, + "loss": 0.4163, + "step": 3091 + }, + { + "epoch": 0.9269627131347199, + "grad_norm": 0.26110345125198364, + "learning_rate": 0.00016117117117117117, + "loss": 0.464, + "step": 3092 + }, + { + "epoch": 0.9272625070264193, + "grad_norm": 0.22636494040489197, + "learning_rate": 0.0001611261261261261, + "loss": 0.3878, + "step": 3093 + }, + { + "epoch": 0.9275623009181188, + "grad_norm": 0.2674306333065033, + "learning_rate": 0.00016108108108108104, + "loss": 0.4009, + "step": 3094 + }, + { + "epoch": 0.9278620948098183, + "grad_norm": 0.2749866843223572, + "learning_rate": 0.00016103603603603603, + "loss": 0.4189, + "step": 3095 + }, + { + "epoch": 0.9281618887015177, + "grad_norm": 0.24979329109191895, + "learning_rate": 0.00016099099099099097, + "loss": 0.4296, + "step": 3096 + }, + { + "epoch": 0.9284616825932172, + "grad_norm": 0.2526085376739502, + "learning_rate": 0.0001609459459459459, + "loss": 0.4158, + "step": 3097 + }, + { + "epoch": 0.9287614764849166, + "grad_norm": 0.274911493062973, + "learning_rate": 0.0001609009009009009, + "loss": 0.4162, + "step": 3098 + }, + { + "epoch": 0.9290612703766161, + "grad_norm": 0.25422587990760803, + "learning_rate": 0.00016085585585585584, + "loss": 0.4305, + "step": 3099 + }, + { + "epoch": 0.9293610642683156, + "grad_norm": 0.26116907596588135, + "learning_rate": 0.0001608108108108108, + "loss": 0.4612, + "step": 3100 + }, + { + "epoch": 0.929660858160015, + "grad_norm": 0.24997106194496155, + "learning_rate": 0.00016076576576576576, + "loss": 0.4039, + "step": 3101 + }, + { + "epoch": 0.9299606520517144, + "grad_norm": 0.24362730979919434, + "learning_rate": 0.0001607207207207207, + "loss": 0.3995, + "step": 3102 + }, + { + "epoch": 0.930260445943414, + "grad_norm": 0.2644561529159546, + "learning_rate": 0.00016067567567567566, + "loss": 0.4282, + "step": 3103 + }, + { + "epoch": 0.9305602398351134, + "grad_norm": 0.2574602961540222, + "learning_rate": 0.00016063063063063063, + "loss": 0.4243, + "step": 3104 + }, + { + "epoch": 0.9308600337268128, + "grad_norm": 0.27288708090782166, + "learning_rate": 0.00016058558558558556, + "loss": 0.4057, + "step": 3105 + }, + { + "epoch": 0.9311598276185122, + "grad_norm": 0.2651262879371643, + "learning_rate": 0.00016054054054054053, + "loss": 0.407, + "step": 3106 + }, + { + "epoch": 0.9314596215102118, + "grad_norm": 0.26550403237342834, + "learning_rate": 0.0001604954954954955, + "loss": 0.4294, + "step": 3107 + }, + { + "epoch": 0.9317594154019112, + "grad_norm": 0.25612547993659973, + "learning_rate": 0.00016045045045045043, + "loss": 0.4224, + "step": 3108 + }, + { + "epoch": 0.9320592092936106, + "grad_norm": 0.25533682107925415, + "learning_rate": 0.0001604054054054054, + "loss": 0.4134, + "step": 3109 + }, + { + "epoch": 0.9323590031853101, + "grad_norm": 0.24888603389263153, + "learning_rate": 0.00016036036036036036, + "loss": 0.3807, + "step": 3110 + }, + { + "epoch": 0.9326587970770096, + "grad_norm": 0.25240257382392883, + "learning_rate": 0.00016031531531531532, + "loss": 0.406, + "step": 3111 + }, + { + "epoch": 0.932958590968709, + "grad_norm": 0.25701963901519775, + "learning_rate": 0.00016027027027027026, + "loss": 0.4319, + "step": 3112 + }, + { + "epoch": 0.9332583848604085, + "grad_norm": 0.2428182065486908, + "learning_rate": 0.0001602252252252252, + "loss": 0.4, + "step": 3113 + }, + { + "epoch": 0.9335581787521079, + "grad_norm": 0.2379184514284134, + "learning_rate": 0.00016018018018018018, + "loss": 0.4209, + "step": 3114 + }, + { + "epoch": 0.9338579726438074, + "grad_norm": 0.28138455748558044, + "learning_rate": 0.00016013513513513512, + "loss": 0.446, + "step": 3115 + }, + { + "epoch": 0.9341577665355069, + "grad_norm": 0.2478276938199997, + "learning_rate": 0.00016009009009009006, + "loss": 0.384, + "step": 3116 + }, + { + "epoch": 0.9344575604272063, + "grad_norm": 0.2511236369609833, + "learning_rate": 0.00016004504504504505, + "loss": 0.4289, + "step": 3117 + }, + { + "epoch": 0.9347573543189057, + "grad_norm": 0.24487215280532837, + "learning_rate": 0.00015999999999999999, + "loss": 0.4271, + "step": 3118 + }, + { + "epoch": 0.9350571482106053, + "grad_norm": 0.27695828676223755, + "learning_rate": 0.00015995495495495492, + "loss": 0.4384, + "step": 3119 + }, + { + "epoch": 0.9353569421023047, + "grad_norm": 0.24011778831481934, + "learning_rate": 0.00015990990990990991, + "loss": 0.3903, + "step": 3120 + }, + { + "epoch": 0.9356567359940041, + "grad_norm": 0.2450532615184784, + "learning_rate": 0.00015986486486486485, + "loss": 0.4258, + "step": 3121 + }, + { + "epoch": 0.9359565298857035, + "grad_norm": 0.26375943422317505, + "learning_rate": 0.0001598198198198198, + "loss": 0.4395, + "step": 3122 + }, + { + "epoch": 0.9362563237774031, + "grad_norm": 0.2335769087076187, + "learning_rate": 0.00015977477477477478, + "loss": 0.3922, + "step": 3123 + }, + { + "epoch": 0.9365561176691025, + "grad_norm": 0.2576378285884857, + "learning_rate": 0.00015972972972972972, + "loss": 0.412, + "step": 3124 + }, + { + "epoch": 0.936855911560802, + "grad_norm": 0.24350136518478394, + "learning_rate": 0.00015968468468468465, + "loss": 0.4207, + "step": 3125 + }, + { + "epoch": 0.9371557054525014, + "grad_norm": 0.24622796475887299, + "learning_rate": 0.00015963963963963964, + "loss": 0.4137, + "step": 3126 + }, + { + "epoch": 0.9374554993442009, + "grad_norm": 0.24007757008075714, + "learning_rate": 0.00015959459459459458, + "loss": 0.4297, + "step": 3127 + }, + { + "epoch": 0.9377552932359003, + "grad_norm": 0.2597469091415405, + "learning_rate": 0.00015954954954954952, + "loss": 0.4262, + "step": 3128 + }, + { + "epoch": 0.9380550871275998, + "grad_norm": 0.25026780366897583, + "learning_rate": 0.0001595045045045045, + "loss": 0.4206, + "step": 3129 + }, + { + "epoch": 0.9383548810192992, + "grad_norm": 0.2386496216058731, + "learning_rate": 0.00015945945945945944, + "loss": 0.4266, + "step": 3130 + }, + { + "epoch": 0.9386546749109987, + "grad_norm": 0.25480231642723083, + "learning_rate": 0.00015941441441441438, + "loss": 0.4373, + "step": 3131 + }, + { + "epoch": 0.9389544688026982, + "grad_norm": 0.23915326595306396, + "learning_rate": 0.00015936936936936937, + "loss": 0.411, + "step": 3132 + }, + { + "epoch": 0.9392542626943976, + "grad_norm": 0.2645804286003113, + "learning_rate": 0.0001593243243243243, + "loss": 0.4334, + "step": 3133 + }, + { + "epoch": 0.939554056586097, + "grad_norm": 0.26099300384521484, + "learning_rate": 0.00015927927927927927, + "loss": 0.4264, + "step": 3134 + }, + { + "epoch": 0.9398538504777965, + "grad_norm": 0.2576426863670349, + "learning_rate": 0.0001592342342342342, + "loss": 0.4276, + "step": 3135 + }, + { + "epoch": 0.940153644369496, + "grad_norm": 0.2772998511791229, + "learning_rate": 0.00015918918918918917, + "loss": 0.4317, + "step": 3136 + }, + { + "epoch": 0.9404534382611954, + "grad_norm": 0.2620845437049866, + "learning_rate": 0.00015914414414414414, + "loss": 0.4139, + "step": 3137 + }, + { + "epoch": 0.9407532321528949, + "grad_norm": 0.24328507483005524, + "learning_rate": 0.00015909909909909907, + "loss": 0.3952, + "step": 3138 + }, + { + "epoch": 0.9410530260445943, + "grad_norm": 0.25061240792274475, + "learning_rate": 0.00015905405405405404, + "loss": 0.3669, + "step": 3139 + }, + { + "epoch": 0.9413528199362938, + "grad_norm": 0.24149778485298157, + "learning_rate": 0.000159009009009009, + "loss": 0.416, + "step": 3140 + }, + { + "epoch": 0.9416526138279933, + "grad_norm": 0.2483665943145752, + "learning_rate": 0.00015896396396396394, + "loss": 0.3935, + "step": 3141 + }, + { + "epoch": 0.9419524077196927, + "grad_norm": 0.270732045173645, + "learning_rate": 0.0001589189189189189, + "loss": 0.3991, + "step": 3142 + }, + { + "epoch": 0.9422522016113921, + "grad_norm": 0.2482568472623825, + "learning_rate": 0.00015887387387387387, + "loss": 0.417, + "step": 3143 + }, + { + "epoch": 0.9425519955030917, + "grad_norm": 0.241281196475029, + "learning_rate": 0.0001588288288288288, + "loss": 0.4144, + "step": 3144 + }, + { + "epoch": 0.9428517893947911, + "grad_norm": 0.2527291774749756, + "learning_rate": 0.00015878378378378377, + "loss": 0.4318, + "step": 3145 + }, + { + "epoch": 0.9431515832864905, + "grad_norm": 0.25932496786117554, + "learning_rate": 0.00015873873873873873, + "loss": 0.4305, + "step": 3146 + }, + { + "epoch": 0.9434513771781899, + "grad_norm": 0.2633473873138428, + "learning_rate": 0.00015869369369369367, + "loss": 0.4395, + "step": 3147 + }, + { + "epoch": 0.9437511710698895, + "grad_norm": 0.23622636497020721, + "learning_rate": 0.00015864864864864866, + "loss": 0.3921, + "step": 3148 + }, + { + "epoch": 0.9440509649615889, + "grad_norm": 0.2766309976577759, + "learning_rate": 0.0001586036036036036, + "loss": 0.477, + "step": 3149 + }, + { + "epoch": 0.9443507588532883, + "grad_norm": 0.24325770139694214, + "learning_rate": 0.00015855855855855853, + "loss": 0.3976, + "step": 3150 + }, + { + "epoch": 0.9446505527449878, + "grad_norm": 0.25284042954444885, + "learning_rate": 0.00015851351351351352, + "loss": 0.4077, + "step": 3151 + }, + { + "epoch": 0.9449503466366873, + "grad_norm": 0.2570466995239258, + "learning_rate": 0.00015846846846846846, + "loss": 0.4361, + "step": 3152 + }, + { + "epoch": 0.9452501405283867, + "grad_norm": 0.2617407441139221, + "learning_rate": 0.0001584234234234234, + "loss": 0.4358, + "step": 3153 + }, + { + "epoch": 0.9455499344200862, + "grad_norm": 0.25398409366607666, + "learning_rate": 0.0001583783783783784, + "loss": 0.4148, + "step": 3154 + }, + { + "epoch": 0.9458497283117856, + "grad_norm": 0.23910127580165863, + "learning_rate": 0.00015833333333333332, + "loss": 0.4149, + "step": 3155 + }, + { + "epoch": 0.9461495222034851, + "grad_norm": 0.24659039080142975, + "learning_rate": 0.00015828828828828826, + "loss": 0.4026, + "step": 3156 + }, + { + "epoch": 0.9464493160951846, + "grad_norm": 0.23952017724514008, + "learning_rate": 0.00015824324324324325, + "loss": 0.3825, + "step": 3157 + }, + { + "epoch": 0.946749109986884, + "grad_norm": 0.2537391185760498, + "learning_rate": 0.0001581981981981982, + "loss": 0.4131, + "step": 3158 + }, + { + "epoch": 0.9470489038785834, + "grad_norm": 0.25354787707328796, + "learning_rate": 0.00015815315315315313, + "loss": 0.4318, + "step": 3159 + }, + { + "epoch": 0.947348697770283, + "grad_norm": 0.23185314238071442, + "learning_rate": 0.0001581081081081081, + "loss": 0.4062, + "step": 3160 + }, + { + "epoch": 0.9476484916619824, + "grad_norm": 0.23187117278575897, + "learning_rate": 0.00015806306306306305, + "loss": 0.4265, + "step": 3161 + }, + { + "epoch": 0.9479482855536818, + "grad_norm": 0.2618250548839569, + "learning_rate": 0.000158018018018018, + "loss": 0.3997, + "step": 3162 + }, + { + "epoch": 0.9482480794453813, + "grad_norm": 0.2492150217294693, + "learning_rate": 0.00015797297297297295, + "loss": 0.4418, + "step": 3163 + }, + { + "epoch": 0.9485478733370808, + "grad_norm": 0.23397988080978394, + "learning_rate": 0.00015792792792792792, + "loss": 0.4056, + "step": 3164 + }, + { + "epoch": 0.9488476672287802, + "grad_norm": 0.2404291331768036, + "learning_rate": 0.00015788288288288285, + "loss": 0.4234, + "step": 3165 + }, + { + "epoch": 0.9491474611204797, + "grad_norm": 0.25739723443984985, + "learning_rate": 0.00015783783783783782, + "loss": 0.4376, + "step": 3166 + }, + { + "epoch": 0.9494472550121791, + "grad_norm": 0.2516269385814667, + "learning_rate": 0.00015779279279279278, + "loss": 0.4371, + "step": 3167 + }, + { + "epoch": 0.9497470489038786, + "grad_norm": 0.2476969063282013, + "learning_rate": 0.00015774774774774775, + "loss": 0.4042, + "step": 3168 + }, + { + "epoch": 0.950046842795578, + "grad_norm": 0.2627060115337372, + "learning_rate": 0.00015770270270270268, + "loss": 0.4651, + "step": 3169 + }, + { + "epoch": 0.9503466366872775, + "grad_norm": 0.2544974982738495, + "learning_rate": 0.00015765765765765765, + "loss": 0.439, + "step": 3170 + }, + { + "epoch": 0.9506464305789769, + "grad_norm": 0.2589476406574249, + "learning_rate": 0.0001576126126126126, + "loss": 0.4093, + "step": 3171 + }, + { + "epoch": 0.9509462244706764, + "grad_norm": 0.24550233781337738, + "learning_rate": 0.00015756756756756755, + "loss": 0.4176, + "step": 3172 + }, + { + "epoch": 0.9512460183623759, + "grad_norm": 0.2545708119869232, + "learning_rate": 0.0001575225225225225, + "loss": 0.4308, + "step": 3173 + }, + { + "epoch": 0.9515458122540753, + "grad_norm": 0.24934959411621094, + "learning_rate": 0.00015747747747747747, + "loss": 0.3848, + "step": 3174 + }, + { + "epoch": 0.9518456061457747, + "grad_norm": 0.2537474036216736, + "learning_rate": 0.0001574324324324324, + "loss": 0.4149, + "step": 3175 + }, + { + "epoch": 0.9521454000374743, + "grad_norm": 0.2342180609703064, + "learning_rate": 0.00015738738738738738, + "loss": 0.3815, + "step": 3176 + }, + { + "epoch": 0.9524451939291737, + "grad_norm": 0.2533189356327057, + "learning_rate": 0.00015734234234234234, + "loss": 0.3993, + "step": 3177 + }, + { + "epoch": 0.9527449878208731, + "grad_norm": 0.2543899714946747, + "learning_rate": 0.00015729729729729728, + "loss": 0.4062, + "step": 3178 + }, + { + "epoch": 0.9530447817125726, + "grad_norm": 0.2690375745296478, + "learning_rate": 0.00015725225225225224, + "loss": 0.4109, + "step": 3179 + }, + { + "epoch": 0.9533445756042721, + "grad_norm": 0.2458389550447464, + "learning_rate": 0.0001572072072072072, + "loss": 0.4056, + "step": 3180 + }, + { + "epoch": 0.9536443694959715, + "grad_norm": 0.2488129585981369, + "learning_rate": 0.00015716216216216214, + "loss": 0.4312, + "step": 3181 + }, + { + "epoch": 0.953944163387671, + "grad_norm": 0.23376654088497162, + "learning_rate": 0.00015711711711711713, + "loss": 0.4254, + "step": 3182 + }, + { + "epoch": 0.9542439572793704, + "grad_norm": 0.2395961731672287, + "learning_rate": 0.00015707207207207207, + "loss": 0.4169, + "step": 3183 + }, + { + "epoch": 0.9545437511710699, + "grad_norm": 0.2538315951824188, + "learning_rate": 0.000157027027027027, + "loss": 0.4418, + "step": 3184 + }, + { + "epoch": 0.9548435450627694, + "grad_norm": 0.25304052233695984, + "learning_rate": 0.00015698198198198194, + "loss": 0.4082, + "step": 3185 + }, + { + "epoch": 0.9551433389544688, + "grad_norm": 0.25075477361679077, + "learning_rate": 0.00015693693693693693, + "loss": 0.4288, + "step": 3186 + }, + { + "epoch": 0.9554431328461682, + "grad_norm": 0.2536257803440094, + "learning_rate": 0.00015689189189189187, + "loss": 0.4326, + "step": 3187 + }, + { + "epoch": 0.9557429267378678, + "grad_norm": 0.24698510766029358, + "learning_rate": 0.0001568468468468468, + "loss": 0.4245, + "step": 3188 + }, + { + "epoch": 0.9560427206295672, + "grad_norm": 0.2647172212600708, + "learning_rate": 0.0001568018018018018, + "loss": 0.4018, + "step": 3189 + }, + { + "epoch": 0.9563425145212666, + "grad_norm": 0.2701962888240814, + "learning_rate": 0.00015675675675675673, + "loss": 0.4289, + "step": 3190 + }, + { + "epoch": 0.956642308412966, + "grad_norm": 0.24717958271503448, + "learning_rate": 0.0001567117117117117, + "loss": 0.4202, + "step": 3191 + }, + { + "epoch": 0.9569421023046656, + "grad_norm": 0.24269205331802368, + "learning_rate": 0.00015666666666666666, + "loss": 0.3864, + "step": 3192 + }, + { + "epoch": 0.957241896196365, + "grad_norm": 0.2608867287635803, + "learning_rate": 0.0001566216216216216, + "loss": 0.4403, + "step": 3193 + }, + { + "epoch": 0.9575416900880644, + "grad_norm": 0.2656077444553375, + "learning_rate": 0.00015657657657657656, + "loss": 0.4203, + "step": 3194 + }, + { + "epoch": 0.9578414839797639, + "grad_norm": 0.2502601444721222, + "learning_rate": 0.00015653153153153153, + "loss": 0.4367, + "step": 3195 + }, + { + "epoch": 0.9581412778714634, + "grad_norm": 0.26323074102401733, + "learning_rate": 0.00015648648648648646, + "loss": 0.4299, + "step": 3196 + }, + { + "epoch": 0.9584410717631628, + "grad_norm": 0.251655250787735, + "learning_rate": 0.00015644144144144143, + "loss": 0.4003, + "step": 3197 + }, + { + "epoch": 0.9587408656548623, + "grad_norm": 0.25573211908340454, + "learning_rate": 0.0001563963963963964, + "loss": 0.4005, + "step": 3198 + }, + { + "epoch": 0.9590406595465617, + "grad_norm": 0.2425386756658554, + "learning_rate": 0.00015635135135135133, + "loss": 0.3839, + "step": 3199 + }, + { + "epoch": 0.9593404534382612, + "grad_norm": 0.25007614493370056, + "learning_rate": 0.0001563063063063063, + "loss": 0.4108, + "step": 3200 + }, + { + "epoch": 0.9596402473299607, + "grad_norm": 0.26201489567756653, + "learning_rate": 0.00015626126126126126, + "loss": 0.4448, + "step": 3201 + }, + { + "epoch": 0.9599400412216601, + "grad_norm": 0.25759240984916687, + "learning_rate": 0.0001562162162162162, + "loss": 0.4318, + "step": 3202 + }, + { + "epoch": 0.9602398351133595, + "grad_norm": 0.24872949719429016, + "learning_rate": 0.00015617117117117116, + "loss": 0.4018, + "step": 3203 + }, + { + "epoch": 0.9605396290050591, + "grad_norm": 0.2599150538444519, + "learning_rate": 0.00015612612612612612, + "loss": 0.4071, + "step": 3204 + }, + { + "epoch": 0.9608394228967585, + "grad_norm": 0.2560199499130249, + "learning_rate": 0.00015608108108108108, + "loss": 0.4251, + "step": 3205 + }, + { + "epoch": 0.9611392167884579, + "grad_norm": 0.24916759133338928, + "learning_rate": 0.00015603603603603602, + "loss": 0.3954, + "step": 3206 + }, + { + "epoch": 0.9614390106801574, + "grad_norm": 0.24545198678970337, + "learning_rate": 0.00015599099099099098, + "loss": 0.3978, + "step": 3207 + }, + { + "epoch": 0.9617388045718569, + "grad_norm": 0.2455730438232422, + "learning_rate": 0.00015594594594594595, + "loss": 0.3949, + "step": 3208 + }, + { + "epoch": 0.9620385984635563, + "grad_norm": 0.23964105546474457, + "learning_rate": 0.00015590090090090089, + "loss": 0.413, + "step": 3209 + }, + { + "epoch": 0.9623383923552558, + "grad_norm": 0.2658955156803131, + "learning_rate": 0.00015585585585585582, + "loss": 0.4455, + "step": 3210 + }, + { + "epoch": 0.9626381862469552, + "grad_norm": 0.2649344205856323, + "learning_rate": 0.0001558108108108108, + "loss": 0.409, + "step": 3211 + }, + { + "epoch": 0.9629379801386547, + "grad_norm": 0.24537350237369537, + "learning_rate": 0.00015576576576576575, + "loss": 0.404, + "step": 3212 + }, + { + "epoch": 0.9632377740303542, + "grad_norm": 0.2801573574542999, + "learning_rate": 0.00015572072072072069, + "loss": 0.4254, + "step": 3213 + }, + { + "epoch": 0.9635375679220536, + "grad_norm": 0.2631642818450928, + "learning_rate": 0.00015567567567567568, + "loss": 0.4201, + "step": 3214 + }, + { + "epoch": 0.963837361813753, + "grad_norm": 0.25088247656822205, + "learning_rate": 0.00015563063063063061, + "loss": 0.3975, + "step": 3215 + }, + { + "epoch": 0.9641371557054526, + "grad_norm": 0.271305650472641, + "learning_rate": 0.00015558558558558555, + "loss": 0.4317, + "step": 3216 + }, + { + "epoch": 0.964436949597152, + "grad_norm": 0.2699548304080963, + "learning_rate": 0.00015554054054054054, + "loss": 0.4175, + "step": 3217 + }, + { + "epoch": 0.9647367434888514, + "grad_norm": 0.25501522421836853, + "learning_rate": 0.00015549549549549548, + "loss": 0.4228, + "step": 3218 + }, + { + "epoch": 0.9650365373805508, + "grad_norm": 0.2544115483760834, + "learning_rate": 0.00015545045045045042, + "loss": 0.3931, + "step": 3219 + }, + { + "epoch": 0.9653363312722504, + "grad_norm": 0.26797887682914734, + "learning_rate": 0.0001554054054054054, + "loss": 0.3828, + "step": 3220 + }, + { + "epoch": 0.9656361251639498, + "grad_norm": 0.2449193000793457, + "learning_rate": 0.00015536036036036034, + "loss": 0.4103, + "step": 3221 + }, + { + "epoch": 0.9659359190556492, + "grad_norm": 0.24695518612861633, + "learning_rate": 0.00015531531531531528, + "loss": 0.4088, + "step": 3222 + }, + { + "epoch": 0.9662357129473487, + "grad_norm": 0.24960634112358093, + "learning_rate": 0.00015527027027027027, + "loss": 0.4193, + "step": 3223 + }, + { + "epoch": 0.9665355068390482, + "grad_norm": 0.255014568567276, + "learning_rate": 0.0001552252252252252, + "loss": 0.4041, + "step": 3224 + }, + { + "epoch": 0.9668353007307476, + "grad_norm": 0.2603205442428589, + "learning_rate": 0.00015518018018018017, + "loss": 0.4438, + "step": 3225 + }, + { + "epoch": 0.9671350946224471, + "grad_norm": 0.2639630436897278, + "learning_rate": 0.00015513513513513514, + "loss": 0.4313, + "step": 3226 + }, + { + "epoch": 0.9674348885141465, + "grad_norm": 0.25526097416877747, + "learning_rate": 0.00015509009009009007, + "loss": 0.4165, + "step": 3227 + }, + { + "epoch": 0.967734682405846, + "grad_norm": 0.26529547572135925, + "learning_rate": 0.00015504504504504504, + "loss": 0.4046, + "step": 3228 + }, + { + "epoch": 0.9680344762975455, + "grad_norm": 0.25639456510543823, + "learning_rate": 0.000155, + "loss": 0.4092, + "step": 3229 + }, + { + "epoch": 0.9683342701892449, + "grad_norm": 0.24461041390895844, + "learning_rate": 0.00015495495495495494, + "loss": 0.4155, + "step": 3230 + }, + { + "epoch": 0.9686340640809443, + "grad_norm": 0.25683319568634033, + "learning_rate": 0.0001549099099099099, + "loss": 0.4152, + "step": 3231 + }, + { + "epoch": 0.9689338579726438, + "grad_norm": 0.2518060505390167, + "learning_rate": 0.00015486486486486484, + "loss": 0.4299, + "step": 3232 + }, + { + "epoch": 0.9692336518643433, + "grad_norm": 0.24693354964256287, + "learning_rate": 0.0001548198198198198, + "loss": 0.4178, + "step": 3233 + }, + { + "epoch": 0.9695334457560427, + "grad_norm": 0.2540181577205658, + "learning_rate": 0.00015477477477477477, + "loss": 0.4158, + "step": 3234 + }, + { + "epoch": 0.9698332396477422, + "grad_norm": 0.27640795707702637, + "learning_rate": 0.0001547297297297297, + "loss": 0.4387, + "step": 3235 + }, + { + "epoch": 0.9701330335394416, + "grad_norm": 0.23969754576683044, + "learning_rate": 0.00015468468468468467, + "loss": 0.4322, + "step": 3236 + }, + { + "epoch": 0.9704328274311411, + "grad_norm": 0.27284395694732666, + "learning_rate": 0.00015463963963963963, + "loss": 0.4359, + "step": 3237 + }, + { + "epoch": 0.9707326213228405, + "grad_norm": 0.2576393187046051, + "learning_rate": 0.00015459459459459457, + "loss": 0.4282, + "step": 3238 + }, + { + "epoch": 0.97103241521454, + "grad_norm": 0.2445439100265503, + "learning_rate": 0.00015454954954954956, + "loss": 0.3871, + "step": 3239 + }, + { + "epoch": 0.9713322091062394, + "grad_norm": 0.25176161527633667, + "learning_rate": 0.0001545045045045045, + "loss": 0.4512, + "step": 3240 + }, + { + "epoch": 0.971632002997939, + "grad_norm": 0.25426721572875977, + "learning_rate": 0.00015445945945945943, + "loss": 0.4265, + "step": 3241 + }, + { + "epoch": 0.9719317968896384, + "grad_norm": 0.25714048743247986, + "learning_rate": 0.00015441441441441442, + "loss": 0.4169, + "step": 3242 + }, + { + "epoch": 0.9722315907813378, + "grad_norm": 0.2531634569168091, + "learning_rate": 0.00015436936936936936, + "loss": 0.4404, + "step": 3243 + }, + { + "epoch": 0.9725313846730372, + "grad_norm": 0.2426653355360031, + "learning_rate": 0.0001543243243243243, + "loss": 0.3981, + "step": 3244 + }, + { + "epoch": 0.9728311785647368, + "grad_norm": 0.24552041292190552, + "learning_rate": 0.00015427927927927929, + "loss": 0.4124, + "step": 3245 + }, + { + "epoch": 0.9731309724564362, + "grad_norm": 0.2520458698272705, + "learning_rate": 0.00015423423423423422, + "loss": 0.4132, + "step": 3246 + }, + { + "epoch": 0.9734307663481356, + "grad_norm": 0.2456391304731369, + "learning_rate": 0.00015418918918918916, + "loss": 0.3962, + "step": 3247 + }, + { + "epoch": 0.9737305602398351, + "grad_norm": 0.2388329654932022, + "learning_rate": 0.00015414414414414415, + "loss": 0.4302, + "step": 3248 + }, + { + "epoch": 0.9740303541315346, + "grad_norm": 0.24677686393260956, + "learning_rate": 0.0001540990990990991, + "loss": 0.3775, + "step": 3249 + }, + { + "epoch": 0.974330148023234, + "grad_norm": 0.260043203830719, + "learning_rate": 0.00015405405405405402, + "loss": 0.4242, + "step": 3250 + }, + { + "epoch": 0.9746299419149335, + "grad_norm": 0.25858423113822937, + "learning_rate": 0.00015400900900900902, + "loss": 0.4033, + "step": 3251 + }, + { + "epoch": 0.9749297358066329, + "grad_norm": 0.245976984500885, + "learning_rate": 0.00015396396396396395, + "loss": 0.4111, + "step": 3252 + }, + { + "epoch": 0.9752295296983324, + "grad_norm": 0.2492712438106537, + "learning_rate": 0.0001539189189189189, + "loss": 0.4013, + "step": 3253 + }, + { + "epoch": 0.9755293235900319, + "grad_norm": 0.2476668655872345, + "learning_rate": 0.00015387387387387388, + "loss": 0.3959, + "step": 3254 + }, + { + "epoch": 0.9758291174817313, + "grad_norm": 0.2594514787197113, + "learning_rate": 0.00015382882882882882, + "loss": 0.4646, + "step": 3255 + }, + { + "epoch": 0.9761289113734307, + "grad_norm": 0.27506035566329956, + "learning_rate": 0.00015378378378378375, + "loss": 0.4511, + "step": 3256 + }, + { + "epoch": 0.9764287052651303, + "grad_norm": 0.27209872007369995, + "learning_rate": 0.00015373873873873872, + "loss": 0.4084, + "step": 3257 + }, + { + "epoch": 0.9767284991568297, + "grad_norm": 0.24144190549850464, + "learning_rate": 0.00015369369369369368, + "loss": 0.3934, + "step": 3258 + }, + { + "epoch": 0.9770282930485291, + "grad_norm": 0.25071820616722107, + "learning_rate": 0.00015364864864864862, + "loss": 0.4203, + "step": 3259 + }, + { + "epoch": 0.9773280869402285, + "grad_norm": 0.24017009139060974, + "learning_rate": 0.00015360360360360358, + "loss": 0.4086, + "step": 3260 + }, + { + "epoch": 0.9776278808319281, + "grad_norm": 0.2667907774448395, + "learning_rate": 0.00015355855855855855, + "loss": 0.4008, + "step": 3261 + }, + { + "epoch": 0.9779276747236275, + "grad_norm": 0.26080140471458435, + "learning_rate": 0.0001535135135135135, + "loss": 0.4328, + "step": 3262 + }, + { + "epoch": 0.978227468615327, + "grad_norm": 0.25462666153907776, + "learning_rate": 0.00015346846846846845, + "loss": 0.446, + "step": 3263 + }, + { + "epoch": 0.9785272625070264, + "grad_norm": 0.23574258387088776, + "learning_rate": 0.0001534234234234234, + "loss": 0.3997, + "step": 3264 + }, + { + "epoch": 0.9788270563987259, + "grad_norm": 0.2527666389942169, + "learning_rate": 0.00015337837837837837, + "loss": 0.4495, + "step": 3265 + }, + { + "epoch": 0.9791268502904253, + "grad_norm": 0.24528340995311737, + "learning_rate": 0.0001533333333333333, + "loss": 0.4119, + "step": 3266 + }, + { + "epoch": 0.9794266441821248, + "grad_norm": 0.2558531165122986, + "learning_rate": 0.00015328828828828827, + "loss": 0.4028, + "step": 3267 + }, + { + "epoch": 0.9797264380738242, + "grad_norm": 0.2466735988855362, + "learning_rate": 0.00015324324324324324, + "loss": 0.4302, + "step": 3268 + }, + { + "epoch": 0.9800262319655237, + "grad_norm": 0.2911660075187683, + "learning_rate": 0.00015319819819819818, + "loss": 0.4551, + "step": 3269 + }, + { + "epoch": 0.9803260258572232, + "grad_norm": 0.2526625990867615, + "learning_rate": 0.00015315315315315314, + "loss": 0.41, + "step": 3270 + }, + { + "epoch": 0.9806258197489226, + "grad_norm": 0.24766401946544647, + "learning_rate": 0.0001531081081081081, + "loss": 0.3924, + "step": 3271 + }, + { + "epoch": 0.980925613640622, + "grad_norm": 0.2853473424911499, + "learning_rate": 0.00015306306306306304, + "loss": 0.4287, + "step": 3272 + }, + { + "epoch": 0.9812254075323216, + "grad_norm": 0.25968533754348755, + "learning_rate": 0.00015301801801801803, + "loss": 0.4395, + "step": 3273 + }, + { + "epoch": 0.981525201424021, + "grad_norm": 0.2610696852207184, + "learning_rate": 0.00015297297297297297, + "loss": 0.4075, + "step": 3274 + }, + { + "epoch": 0.9818249953157204, + "grad_norm": 0.23721027374267578, + "learning_rate": 0.0001529279279279279, + "loss": 0.4071, + "step": 3275 + }, + { + "epoch": 0.9821247892074199, + "grad_norm": 0.26066192984580994, + "learning_rate": 0.0001528828828828829, + "loss": 0.4205, + "step": 3276 + }, + { + "epoch": 0.9824245830991194, + "grad_norm": 0.285672128200531, + "learning_rate": 0.00015283783783783783, + "loss": 0.4046, + "step": 3277 + }, + { + "epoch": 0.9827243769908188, + "grad_norm": 0.22871406376361847, + "learning_rate": 0.00015279279279279277, + "loss": 0.3944, + "step": 3278 + }, + { + "epoch": 0.9830241708825183, + "grad_norm": 0.26142746210098267, + "learning_rate": 0.00015274774774774776, + "loss": 0.4084, + "step": 3279 + }, + { + "epoch": 0.9833239647742177, + "grad_norm": 0.2570386528968811, + "learning_rate": 0.0001527027027027027, + "loss": 0.4225, + "step": 3280 + }, + { + "epoch": 0.9836237586659172, + "grad_norm": 0.25191813707351685, + "learning_rate": 0.00015265765765765763, + "loss": 0.4146, + "step": 3281 + }, + { + "epoch": 0.9839235525576167, + "grad_norm": 0.2769797444343567, + "learning_rate": 0.00015261261261261257, + "loss": 0.4379, + "step": 3282 + }, + { + "epoch": 0.9842233464493161, + "grad_norm": 0.2423688769340515, + "learning_rate": 0.00015256756756756756, + "loss": 0.4221, + "step": 3283 + }, + { + "epoch": 0.9845231403410155, + "grad_norm": 0.2752641439437866, + "learning_rate": 0.0001525225225225225, + "loss": 0.4269, + "step": 3284 + }, + { + "epoch": 0.984822934232715, + "grad_norm": 0.27877339720726013, + "learning_rate": 0.00015247747747747746, + "loss": 0.4391, + "step": 3285 + }, + { + "epoch": 0.9851227281244145, + "grad_norm": 0.24187946319580078, + "learning_rate": 0.00015243243243243243, + "loss": 0.4196, + "step": 3286 + }, + { + "epoch": 0.9854225220161139, + "grad_norm": 0.24265043437480927, + "learning_rate": 0.00015238738738738736, + "loss": 0.4081, + "step": 3287 + }, + { + "epoch": 0.9857223159078133, + "grad_norm": 0.24394306540489197, + "learning_rate": 0.00015234234234234233, + "loss": 0.3875, + "step": 3288 + }, + { + "epoch": 0.9860221097995129, + "grad_norm": 0.23722290992736816, + "learning_rate": 0.0001522972972972973, + "loss": 0.3906, + "step": 3289 + }, + { + "epoch": 0.9863219036912123, + "grad_norm": 0.2603495121002197, + "learning_rate": 0.00015225225225225223, + "loss": 0.4435, + "step": 3290 + }, + { + "epoch": 0.9866216975829117, + "grad_norm": 0.23921896517276764, + "learning_rate": 0.0001522072072072072, + "loss": 0.4234, + "step": 3291 + }, + { + "epoch": 0.9869214914746112, + "grad_norm": 0.24632927775382996, + "learning_rate": 0.00015216216216216215, + "loss": 0.4411, + "step": 3292 + }, + { + "epoch": 0.9872212853663107, + "grad_norm": 0.2542307674884796, + "learning_rate": 0.0001521171171171171, + "loss": 0.4192, + "step": 3293 + }, + { + "epoch": 0.9875210792580101, + "grad_norm": 0.26318350434303284, + "learning_rate": 0.00015207207207207206, + "loss": 0.4286, + "step": 3294 + }, + { + "epoch": 0.9878208731497096, + "grad_norm": 0.2523062527179718, + "learning_rate": 0.00015202702702702702, + "loss": 0.42, + "step": 3295 + }, + { + "epoch": 0.988120667041409, + "grad_norm": 0.2566467225551605, + "learning_rate": 0.00015198198198198198, + "loss": 0.3847, + "step": 3296 + }, + { + "epoch": 0.9884204609331085, + "grad_norm": 0.2654307782649994, + "learning_rate": 0.00015193693693693692, + "loss": 0.4448, + "step": 3297 + }, + { + "epoch": 0.988720254824808, + "grad_norm": 0.25399646162986755, + "learning_rate": 0.00015189189189189188, + "loss": 0.4374, + "step": 3298 + }, + { + "epoch": 0.9890200487165074, + "grad_norm": 0.24990344047546387, + "learning_rate": 0.00015184684684684685, + "loss": 0.4365, + "step": 3299 + }, + { + "epoch": 0.9893198426082068, + "grad_norm": 0.24241049587726593, + "learning_rate": 0.00015180180180180178, + "loss": 0.399, + "step": 3300 + }, + { + "epoch": 0.9896196364999064, + "grad_norm": 0.24599871039390564, + "learning_rate": 0.00015175675675675675, + "loss": 0.4081, + "step": 3301 + }, + { + "epoch": 0.9899194303916058, + "grad_norm": 0.26060429215431213, + "learning_rate": 0.0001517117117117117, + "loss": 0.4247, + "step": 3302 + }, + { + "epoch": 0.9902192242833052, + "grad_norm": 0.2448931187391281, + "learning_rate": 0.00015166666666666665, + "loss": 0.4289, + "step": 3303 + }, + { + "epoch": 0.9905190181750047, + "grad_norm": 0.2747792899608612, + "learning_rate": 0.00015162162162162159, + "loss": 0.4398, + "step": 3304 + }, + { + "epoch": 0.9908188120667042, + "grad_norm": 0.29383328557014465, + "learning_rate": 0.00015157657657657658, + "loss": 0.4737, + "step": 3305 + }, + { + "epoch": 0.9911186059584036, + "grad_norm": 0.24790361523628235, + "learning_rate": 0.0001515315315315315, + "loss": 0.3991, + "step": 3306 + }, + { + "epoch": 0.991418399850103, + "grad_norm": 0.24991101026535034, + "learning_rate": 0.00015148648648648645, + "loss": 0.4146, + "step": 3307 + }, + { + "epoch": 0.9917181937418025, + "grad_norm": 0.2633141279220581, + "learning_rate": 0.00015144144144144144, + "loss": 0.4349, + "step": 3308 + }, + { + "epoch": 0.992017987633502, + "grad_norm": 0.25757932662963867, + "learning_rate": 0.00015139639639639638, + "loss": 0.417, + "step": 3309 + }, + { + "epoch": 0.9923177815252014, + "grad_norm": 0.2572961747646332, + "learning_rate": 0.00015135135135135131, + "loss": 0.413, + "step": 3310 + }, + { + "epoch": 0.9926175754169009, + "grad_norm": 0.2692258358001709, + "learning_rate": 0.0001513063063063063, + "loss": 0.4364, + "step": 3311 + }, + { + "epoch": 0.9929173693086003, + "grad_norm": 0.2538521885871887, + "learning_rate": 0.00015126126126126124, + "loss": 0.4182, + "step": 3312 + }, + { + "epoch": 0.9932171632002998, + "grad_norm": 0.2588602304458618, + "learning_rate": 0.00015121621621621618, + "loss": 0.4508, + "step": 3313 + }, + { + "epoch": 0.9935169570919993, + "grad_norm": 0.26588940620422363, + "learning_rate": 0.00015117117117117117, + "loss": 0.4345, + "step": 3314 + }, + { + "epoch": 0.9938167509836987, + "grad_norm": 0.24780629575252533, + "learning_rate": 0.0001511261261261261, + "loss": 0.4148, + "step": 3315 + }, + { + "epoch": 0.9941165448753981, + "grad_norm": 0.2527424693107605, + "learning_rate": 0.00015108108108108104, + "loss": 0.4369, + "step": 3316 + }, + { + "epoch": 0.9944163387670977, + "grad_norm": 0.24510394036769867, + "learning_rate": 0.00015103603603603603, + "loss": 0.3947, + "step": 3317 + }, + { + "epoch": 0.9947161326587971, + "grad_norm": 0.25519701838493347, + "learning_rate": 0.00015099099099099097, + "loss": 0.4395, + "step": 3318 + }, + { + "epoch": 0.9950159265504965, + "grad_norm": 0.2711610794067383, + "learning_rate": 0.00015094594594594594, + "loss": 0.4171, + "step": 3319 + }, + { + "epoch": 0.995315720442196, + "grad_norm": 0.25308021903038025, + "learning_rate": 0.0001509009009009009, + "loss": 0.4334, + "step": 3320 + }, + { + "epoch": 0.9956155143338955, + "grad_norm": 0.24517041444778442, + "learning_rate": 0.00015085585585585584, + "loss": 0.4122, + "step": 3321 + }, + { + "epoch": 0.9959153082255949, + "grad_norm": 0.26738440990448, + "learning_rate": 0.0001508108108108108, + "loss": 0.4221, + "step": 3322 + }, + { + "epoch": 0.9962151021172944, + "grad_norm": 0.2602112293243408, + "learning_rate": 0.00015076576576576576, + "loss": 0.4566, + "step": 3323 + }, + { + "epoch": 0.9965148960089938, + "grad_norm": 0.2651210129261017, + "learning_rate": 0.0001507207207207207, + "loss": 0.4169, + "step": 3324 + }, + { + "epoch": 0.9968146899006932, + "grad_norm": 0.23928852379322052, + "learning_rate": 0.00015067567567567566, + "loss": 0.4146, + "step": 3325 + }, + { + "epoch": 0.9971144837923928, + "grad_norm": 0.24811214208602905, + "learning_rate": 0.00015063063063063063, + "loss": 0.4192, + "step": 3326 + }, + { + "epoch": 0.9974142776840922, + "grad_norm": 0.24762240052223206, + "learning_rate": 0.00015058558558558556, + "loss": 0.4094, + "step": 3327 + }, + { + "epoch": 0.9977140715757916, + "grad_norm": 0.27879321575164795, + "learning_rate": 0.00015054054054054053, + "loss": 0.4221, + "step": 3328 + }, + { + "epoch": 0.998013865467491, + "grad_norm": 0.23711088299751282, + "learning_rate": 0.00015049549549549547, + "loss": 0.4065, + "step": 3329 + }, + { + "epoch": 0.9983136593591906, + "grad_norm": 0.2728164792060852, + "learning_rate": 0.00015045045045045046, + "loss": 0.4159, + "step": 3330 + }, + { + "epoch": 0.99861345325089, + "grad_norm": 0.24097463488578796, + "learning_rate": 0.0001504054054054054, + "loss": 0.4325, + "step": 3331 + }, + { + "epoch": 0.9989132471425894, + "grad_norm": 0.2429143637418747, + "learning_rate": 0.00015036036036036033, + "loss": 0.4285, + "step": 3332 + }, + { + "epoch": 0.9992130410342889, + "grad_norm": 0.24508728086948395, + "learning_rate": 0.00015031531531531532, + "loss": 0.4049, + "step": 3333 + }, + { + "epoch": 0.9995128349259884, + "grad_norm": 0.2516220510005951, + "learning_rate": 0.00015027027027027026, + "loss": 0.442, + "step": 3334 + }, + { + "epoch": 0.9998126288176878, + "grad_norm": 0.24752123653888702, + "learning_rate": 0.0001502252252252252, + "loss": 0.3956, + "step": 3335 + }, + { + "epoch": 1.0, + "grad_norm": 0.3306880593299866, + "learning_rate": 0.00015018018018018019, + "loss": 0.4501, + "step": 3336 + }, + { + "epoch": 1.0002997938916995, + "grad_norm": 0.24475271999835968, + "learning_rate": 0.00015013513513513512, + "loss": 0.3665, + "step": 3337 + }, + { + "epoch": 1.0005995877833989, + "grad_norm": 0.263034850358963, + "learning_rate": 0.00015009009009009006, + "loss": 0.3607, + "step": 3338 + }, + { + "epoch": 1.0008993816750984, + "grad_norm": 0.22813622653484344, + "learning_rate": 0.00015004504504504505, + "loss": 0.3377, + "step": 3339 + }, + { + "epoch": 1.0011991755667977, + "grad_norm": 0.2284983992576599, + "learning_rate": 0.00015, + "loss": 0.3635, + "step": 3340 + }, + { + "epoch": 1.0014989694584973, + "grad_norm": 0.2420538365840912, + "learning_rate": 0.00014995495495495495, + "loss": 0.3567, + "step": 3341 + }, + { + "epoch": 1.0017987633501968, + "grad_norm": 0.24065744876861572, + "learning_rate": 0.0001499099099099099, + "loss": 0.3633, + "step": 3342 + }, + { + "epoch": 1.0020985572418961, + "grad_norm": 0.22252792119979858, + "learning_rate": 0.00014986486486486485, + "loss": 0.3598, + "step": 3343 + }, + { + "epoch": 1.0023983511335957, + "grad_norm": 0.2273607850074768, + "learning_rate": 0.00014981981981981982, + "loss": 0.3306, + "step": 3344 + }, + { + "epoch": 1.0026981450252952, + "grad_norm": 0.25264090299606323, + "learning_rate": 0.00014977477477477475, + "loss": 0.3911, + "step": 3345 + }, + { + "epoch": 1.0029979389169945, + "grad_norm": 0.2376517355442047, + "learning_rate": 0.00014972972972972972, + "loss": 0.355, + "step": 3346 + }, + { + "epoch": 1.003297732808694, + "grad_norm": 0.23404935002326965, + "learning_rate": 0.00014968468468468465, + "loss": 0.3946, + "step": 3347 + }, + { + "epoch": 1.0035975267003934, + "grad_norm": 0.23313717544078827, + "learning_rate": 0.00014963963963963962, + "loss": 0.3639, + "step": 3348 + }, + { + "epoch": 1.003897320592093, + "grad_norm": 0.25040316581726074, + "learning_rate": 0.00014959459459459458, + "loss": 0.3761, + "step": 3349 + }, + { + "epoch": 1.0041971144837925, + "grad_norm": 0.22730188071727753, + "learning_rate": 0.00014954954954954952, + "loss": 0.3625, + "step": 3350 + }, + { + "epoch": 1.0044969083754918, + "grad_norm": 0.22673781216144562, + "learning_rate": 0.00014950450450450448, + "loss": 0.3451, + "step": 3351 + }, + { + "epoch": 1.0047967022671913, + "grad_norm": 0.2615843713283539, + "learning_rate": 0.00014945945945945944, + "loss": 0.3629, + "step": 3352 + }, + { + "epoch": 1.0050964961588909, + "grad_norm": 0.2444346845149994, + "learning_rate": 0.0001494144144144144, + "loss": 0.3594, + "step": 3353 + }, + { + "epoch": 1.0053962900505902, + "grad_norm": 0.24913236498832703, + "learning_rate": 0.00014936936936936935, + "loss": 0.3615, + "step": 3354 + }, + { + "epoch": 1.0056960839422897, + "grad_norm": 0.2454083263874054, + "learning_rate": 0.0001493243243243243, + "loss": 0.3855, + "step": 3355 + }, + { + "epoch": 1.005995877833989, + "grad_norm": 0.2320137619972229, + "learning_rate": 0.00014927927927927927, + "loss": 0.3697, + "step": 3356 + }, + { + "epoch": 1.0062956717256886, + "grad_norm": 0.2395658791065216, + "learning_rate": 0.0001492342342342342, + "loss": 0.3744, + "step": 3357 + }, + { + "epoch": 1.006595465617388, + "grad_norm": 0.24167458713054657, + "learning_rate": 0.00014918918918918917, + "loss": 0.3782, + "step": 3358 + }, + { + "epoch": 1.0068952595090874, + "grad_norm": 0.2351546287536621, + "learning_rate": 0.00014914414414414414, + "loss": 0.3539, + "step": 3359 + }, + { + "epoch": 1.007195053400787, + "grad_norm": 0.249009907245636, + "learning_rate": 0.0001490990990990991, + "loss": 0.3881, + "step": 3360 + }, + { + "epoch": 1.0074948472924865, + "grad_norm": 0.2699730396270752, + "learning_rate": 0.00014905405405405404, + "loss": 0.3797, + "step": 3361 + }, + { + "epoch": 1.0077946411841858, + "grad_norm": 0.23312336206436157, + "learning_rate": 0.000149009009009009, + "loss": 0.3502, + "step": 3362 + }, + { + "epoch": 1.0080944350758854, + "grad_norm": 0.27450332045555115, + "learning_rate": 0.00014896396396396397, + "loss": 0.3715, + "step": 3363 + }, + { + "epoch": 1.0083942289675847, + "grad_norm": 0.2660040259361267, + "learning_rate": 0.0001489189189189189, + "loss": 0.4085, + "step": 3364 + }, + { + "epoch": 1.0086940228592842, + "grad_norm": 0.2341001033782959, + "learning_rate": 0.00014887387387387387, + "loss": 0.3729, + "step": 3365 + }, + { + "epoch": 1.0089938167509838, + "grad_norm": 0.27408385276794434, + "learning_rate": 0.00014882882882882883, + "loss": 0.4096, + "step": 3366 + }, + { + "epoch": 1.009293610642683, + "grad_norm": 0.2749234437942505, + "learning_rate": 0.00014878378378378377, + "loss": 0.3569, + "step": 3367 + }, + { + "epoch": 1.0095934045343826, + "grad_norm": 0.24223123490810394, + "learning_rate": 0.00014873873873873873, + "loss": 0.3605, + "step": 3368 + }, + { + "epoch": 1.0098931984260822, + "grad_norm": 0.2597927749156952, + "learning_rate": 0.0001486936936936937, + "loss": 0.3578, + "step": 3369 + }, + { + "epoch": 1.0101929923177815, + "grad_norm": 0.2509942054748535, + "learning_rate": 0.00014864864864864863, + "loss": 0.3782, + "step": 3370 + }, + { + "epoch": 1.010492786209481, + "grad_norm": 0.24225856363773346, + "learning_rate": 0.0001486036036036036, + "loss": 0.3745, + "step": 3371 + }, + { + "epoch": 1.0107925801011803, + "grad_norm": 0.23656389117240906, + "learning_rate": 0.00014855855855855853, + "loss": 0.3808, + "step": 3372 + }, + { + "epoch": 1.0110923739928799, + "grad_norm": 0.24159061908721924, + "learning_rate": 0.0001485135135135135, + "loss": 0.3841, + "step": 3373 + }, + { + "epoch": 1.0113921678845794, + "grad_norm": 0.23979109525680542, + "learning_rate": 0.00014846846846846846, + "loss": 0.3828, + "step": 3374 + }, + { + "epoch": 1.0116919617762787, + "grad_norm": 0.25807490944862366, + "learning_rate": 0.0001484234234234234, + "loss": 0.3494, + "step": 3375 + }, + { + "epoch": 1.0119917556679783, + "grad_norm": 0.23768498003482819, + "learning_rate": 0.00014837837837837836, + "loss": 0.3718, + "step": 3376 + }, + { + "epoch": 1.0122915495596778, + "grad_norm": 0.23397590219974518, + "learning_rate": 0.00014833333333333332, + "loss": 0.3737, + "step": 3377 + }, + { + "epoch": 1.0125913434513771, + "grad_norm": 0.26081061363220215, + "learning_rate": 0.00014828828828828826, + "loss": 0.3721, + "step": 3378 + }, + { + "epoch": 1.0128911373430767, + "grad_norm": 0.24995768070220947, + "learning_rate": 0.00014824324324324323, + "loss": 0.3839, + "step": 3379 + }, + { + "epoch": 1.013190931234776, + "grad_norm": 0.22104698419570923, + "learning_rate": 0.0001481981981981982, + "loss": 0.3325, + "step": 3380 + }, + { + "epoch": 1.0134907251264755, + "grad_norm": 0.2202545702457428, + "learning_rate": 0.00014815315315315313, + "loss": 0.3552, + "step": 3381 + }, + { + "epoch": 1.013790519018175, + "grad_norm": 0.2345467209815979, + "learning_rate": 0.0001481081081081081, + "loss": 0.3616, + "step": 3382 + }, + { + "epoch": 1.0140903129098744, + "grad_norm": 0.24063590168952942, + "learning_rate": 0.00014806306306306305, + "loss": 0.3837, + "step": 3383 + }, + { + "epoch": 1.014390106801574, + "grad_norm": 0.25035032629966736, + "learning_rate": 0.000148018018018018, + "loss": 0.3869, + "step": 3384 + }, + { + "epoch": 1.0146899006932735, + "grad_norm": 0.235214963555336, + "learning_rate": 0.00014797297297297295, + "loss": 0.3799, + "step": 3385 + }, + { + "epoch": 1.0149896945849728, + "grad_norm": 0.2631278336048126, + "learning_rate": 0.00014792792792792792, + "loss": 0.3736, + "step": 3386 + }, + { + "epoch": 1.0152894884766723, + "grad_norm": 0.23787197470664978, + "learning_rate": 0.00014788288288288288, + "loss": 0.3606, + "step": 3387 + }, + { + "epoch": 1.0155892823683716, + "grad_norm": 0.21479521691799164, + "learning_rate": 0.00014783783783783782, + "loss": 0.3588, + "step": 3388 + }, + { + "epoch": 1.0158890762600712, + "grad_norm": 0.25930336117744446, + "learning_rate": 0.00014779279279279278, + "loss": 0.3515, + "step": 3389 + }, + { + "epoch": 1.0161888701517707, + "grad_norm": 0.25570812821388245, + "learning_rate": 0.00014774774774774775, + "loss": 0.3626, + "step": 3390 + }, + { + "epoch": 1.01648866404347, + "grad_norm": 0.2151179164648056, + "learning_rate": 0.00014770270270270268, + "loss": 0.3366, + "step": 3391 + }, + { + "epoch": 1.0167884579351696, + "grad_norm": 0.24506697058677673, + "learning_rate": 0.00014765765765765765, + "loss": 0.3446, + "step": 3392 + }, + { + "epoch": 1.0170882518268691, + "grad_norm": 0.22405314445495605, + "learning_rate": 0.0001476126126126126, + "loss": 0.3493, + "step": 3393 + }, + { + "epoch": 1.0173880457185684, + "grad_norm": 0.24243111908435822, + "learning_rate": 0.00014756756756756758, + "loss": 0.3628, + "step": 3394 + }, + { + "epoch": 1.017687839610268, + "grad_norm": 0.2257484495639801, + "learning_rate": 0.0001475225225225225, + "loss": 0.3364, + "step": 3395 + }, + { + "epoch": 1.0179876335019673, + "grad_norm": 0.24955156445503235, + "learning_rate": 0.00014747747747747748, + "loss": 0.3629, + "step": 3396 + }, + { + "epoch": 1.0182874273936668, + "grad_norm": 0.2347087413072586, + "learning_rate": 0.0001474324324324324, + "loss": 0.3602, + "step": 3397 + }, + { + "epoch": 1.0185872212853664, + "grad_norm": 0.23353172838687897, + "learning_rate": 0.00014738738738738738, + "loss": 0.3715, + "step": 3398 + }, + { + "epoch": 1.0188870151770657, + "grad_norm": 0.2449275553226471, + "learning_rate": 0.00014734234234234234, + "loss": 0.38, + "step": 3399 + }, + { + "epoch": 1.0191868090687652, + "grad_norm": 0.2428295761346817, + "learning_rate": 0.00014729729729729728, + "loss": 0.3774, + "step": 3400 + }, + { + "epoch": 1.0194866029604648, + "grad_norm": 0.22880619764328003, + "learning_rate": 0.00014725225225225224, + "loss": 0.3718, + "step": 3401 + }, + { + "epoch": 1.019786396852164, + "grad_norm": 0.24155092239379883, + "learning_rate": 0.0001472072072072072, + "loss": 0.3724, + "step": 3402 + }, + { + "epoch": 1.0200861907438636, + "grad_norm": 0.2445833534002304, + "learning_rate": 0.00014716216216216214, + "loss": 0.3568, + "step": 3403 + }, + { + "epoch": 1.020385984635563, + "grad_norm": 0.2260163277387619, + "learning_rate": 0.0001471171171171171, + "loss": 0.3424, + "step": 3404 + }, + { + "epoch": 1.0206857785272625, + "grad_norm": 0.2485581338405609, + "learning_rate": 0.00014707207207207207, + "loss": 0.3806, + "step": 3405 + }, + { + "epoch": 1.020985572418962, + "grad_norm": 0.2234206348657608, + "learning_rate": 0.000147027027027027, + "loss": 0.3482, + "step": 3406 + }, + { + "epoch": 1.0212853663106614, + "grad_norm": 0.2420557290315628, + "learning_rate": 0.00014698198198198197, + "loss": 0.3747, + "step": 3407 + }, + { + "epoch": 1.021585160202361, + "grad_norm": 0.2375592589378357, + "learning_rate": 0.0001469369369369369, + "loss": 0.3455, + "step": 3408 + }, + { + "epoch": 1.0218849540940604, + "grad_norm": 0.21995361149311066, + "learning_rate": 0.00014689189189189187, + "loss": 0.3837, + "step": 3409 + }, + { + "epoch": 1.0221847479857598, + "grad_norm": 0.24082349240779877, + "learning_rate": 0.00014684684684684683, + "loss": 0.3594, + "step": 3410 + }, + { + "epoch": 1.0224845418774593, + "grad_norm": 0.24804848432540894, + "learning_rate": 0.00014680180180180177, + "loss": 0.3747, + "step": 3411 + }, + { + "epoch": 1.0227843357691586, + "grad_norm": 0.2306135892868042, + "learning_rate": 0.00014675675675675674, + "loss": 0.3665, + "step": 3412 + }, + { + "epoch": 1.0230841296608582, + "grad_norm": 0.23374010622501373, + "learning_rate": 0.0001467117117117117, + "loss": 0.3755, + "step": 3413 + }, + { + "epoch": 1.0233839235525577, + "grad_norm": 0.24325768649578094, + "learning_rate": 0.00014666666666666664, + "loss": 0.3761, + "step": 3414 + }, + { + "epoch": 1.023683717444257, + "grad_norm": 0.21893738210201263, + "learning_rate": 0.0001466216216216216, + "loss": 0.3527, + "step": 3415 + }, + { + "epoch": 1.0239835113359566, + "grad_norm": 0.22672438621520996, + "learning_rate": 0.00014657657657657656, + "loss": 0.3718, + "step": 3416 + }, + { + "epoch": 1.024283305227656, + "grad_norm": 0.24228642880916595, + "learning_rate": 0.00014653153153153153, + "loss": 0.3856, + "step": 3417 + }, + { + "epoch": 1.0245830991193554, + "grad_norm": 0.24045732617378235, + "learning_rate": 0.00014648648648648646, + "loss": 0.4047, + "step": 3418 + }, + { + "epoch": 1.024882893011055, + "grad_norm": 0.23358234763145447, + "learning_rate": 0.00014644144144144143, + "loss": 0.3719, + "step": 3419 + }, + { + "epoch": 1.0251826869027543, + "grad_norm": 0.22614139318466187, + "learning_rate": 0.0001463963963963964, + "loss": 0.3677, + "step": 3420 + }, + { + "epoch": 1.0254824807944538, + "grad_norm": 0.23576593399047852, + "learning_rate": 0.00014635135135135136, + "loss": 0.366, + "step": 3421 + }, + { + "epoch": 1.0257822746861534, + "grad_norm": 0.23267975449562073, + "learning_rate": 0.0001463063063063063, + "loss": 0.3725, + "step": 3422 + }, + { + "epoch": 1.0260820685778527, + "grad_norm": 0.23192216455936432, + "learning_rate": 0.00014626126126126126, + "loss": 0.3689, + "step": 3423 + }, + { + "epoch": 1.0263818624695522, + "grad_norm": 0.22963547706604004, + "learning_rate": 0.00014621621621621622, + "loss": 0.3804, + "step": 3424 + }, + { + "epoch": 1.0266816563612517, + "grad_norm": 0.22560270130634308, + "learning_rate": 0.00014617117117117116, + "loss": 0.3264, + "step": 3425 + }, + { + "epoch": 1.026981450252951, + "grad_norm": 0.24126774072647095, + "learning_rate": 0.00014612612612612612, + "loss": 0.3646, + "step": 3426 + }, + { + "epoch": 1.0272812441446506, + "grad_norm": 0.2244587242603302, + "learning_rate": 0.00014608108108108108, + "loss": 0.3728, + "step": 3427 + }, + { + "epoch": 1.02758103803635, + "grad_norm": 0.23782998323440552, + "learning_rate": 0.00014603603603603602, + "loss": 0.3628, + "step": 3428 + }, + { + "epoch": 1.0278808319280495, + "grad_norm": 0.22830158472061157, + "learning_rate": 0.00014599099099099099, + "loss": 0.3852, + "step": 3429 + }, + { + "epoch": 1.028180625819749, + "grad_norm": 0.23033784329891205, + "learning_rate": 0.00014594594594594595, + "loss": 0.367, + "step": 3430 + }, + { + "epoch": 1.0284804197114483, + "grad_norm": 0.22531138360500336, + "learning_rate": 0.00014590090090090089, + "loss": 0.3553, + "step": 3431 + }, + { + "epoch": 1.0287802136031479, + "grad_norm": 0.21490846574306488, + "learning_rate": 0.00014585585585585585, + "loss": 0.3541, + "step": 3432 + }, + { + "epoch": 1.0290800074948474, + "grad_norm": 0.23411285877227783, + "learning_rate": 0.0001458108108108108, + "loss": 0.3669, + "step": 3433 + }, + { + "epoch": 1.0293798013865467, + "grad_norm": 0.2277381718158722, + "learning_rate": 0.00014576576576576575, + "loss": 0.3566, + "step": 3434 + }, + { + "epoch": 1.0296795952782463, + "grad_norm": 0.24335390329360962, + "learning_rate": 0.00014572072072072071, + "loss": 0.3533, + "step": 3435 + }, + { + "epoch": 1.0299793891699456, + "grad_norm": 0.2254333347082138, + "learning_rate": 0.00014567567567567565, + "loss": 0.3484, + "step": 3436 + }, + { + "epoch": 1.0302791830616451, + "grad_norm": 0.22899997234344482, + "learning_rate": 0.00014563063063063062, + "loss": 0.3839, + "step": 3437 + }, + { + "epoch": 1.0305789769533447, + "grad_norm": 0.23717020452022552, + "learning_rate": 0.00014558558558558558, + "loss": 0.3697, + "step": 3438 + }, + { + "epoch": 1.030878770845044, + "grad_norm": 0.23469127714633942, + "learning_rate": 0.00014554054054054052, + "loss": 0.3912, + "step": 3439 + }, + { + "epoch": 1.0311785647367435, + "grad_norm": 0.21906235814094543, + "learning_rate": 0.00014549549549549548, + "loss": 0.3555, + "step": 3440 + }, + { + "epoch": 1.0314783586284428, + "grad_norm": 0.2294732928276062, + "learning_rate": 0.00014545045045045044, + "loss": 0.3521, + "step": 3441 + }, + { + "epoch": 1.0317781525201424, + "grad_norm": 0.22533249855041504, + "learning_rate": 0.00014540540540540538, + "loss": 0.3574, + "step": 3442 + }, + { + "epoch": 1.032077946411842, + "grad_norm": 0.21970278024673462, + "learning_rate": 0.00014536036036036034, + "loss": 0.3414, + "step": 3443 + }, + { + "epoch": 1.0323777403035412, + "grad_norm": 0.2248661071062088, + "learning_rate": 0.0001453153153153153, + "loss": 0.3603, + "step": 3444 + }, + { + "epoch": 1.0326775341952408, + "grad_norm": 0.237972229719162, + "learning_rate": 0.00014527027027027024, + "loss": 0.3716, + "step": 3445 + }, + { + "epoch": 1.0329773280869403, + "grad_norm": 0.22870191931724548, + "learning_rate": 0.0001452252252252252, + "loss": 0.3642, + "step": 3446 + }, + { + "epoch": 1.0332771219786396, + "grad_norm": 0.20847219228744507, + "learning_rate": 0.00014518018018018017, + "loss": 0.3238, + "step": 3447 + }, + { + "epoch": 1.0335769158703392, + "grad_norm": 0.22527335584163666, + "learning_rate": 0.0001451351351351351, + "loss": 0.3771, + "step": 3448 + }, + { + "epoch": 1.0338767097620385, + "grad_norm": 0.21900662779808044, + "learning_rate": 0.00014509009009009007, + "loss": 0.3759, + "step": 3449 + }, + { + "epoch": 1.034176503653738, + "grad_norm": 0.23750483989715576, + "learning_rate": 0.00014504504504504504, + "loss": 0.357, + "step": 3450 + }, + { + "epoch": 1.0344762975454376, + "grad_norm": 0.22595860064029694, + "learning_rate": 0.000145, + "loss": 0.3491, + "step": 3451 + }, + { + "epoch": 1.034776091437137, + "grad_norm": 0.22662843763828278, + "learning_rate": 0.00014495495495495494, + "loss": 0.3629, + "step": 3452 + }, + { + "epoch": 1.0350758853288364, + "grad_norm": 0.22160004079341888, + "learning_rate": 0.0001449099099099099, + "loss": 0.3508, + "step": 3453 + }, + { + "epoch": 1.035375679220536, + "grad_norm": 0.22288091480731964, + "learning_rate": 0.00014486486486486487, + "loss": 0.3681, + "step": 3454 + }, + { + "epoch": 1.0356754731122353, + "grad_norm": 0.2303716242313385, + "learning_rate": 0.0001448198198198198, + "loss": 0.3635, + "step": 3455 + }, + { + "epoch": 1.0359752670039348, + "grad_norm": 0.22402909398078918, + "learning_rate": 0.00014477477477477477, + "loss": 0.3712, + "step": 3456 + }, + { + "epoch": 1.0362750608956341, + "grad_norm": 0.2331233024597168, + "learning_rate": 0.00014472972972972973, + "loss": 0.369, + "step": 3457 + }, + { + "epoch": 1.0365748547873337, + "grad_norm": 0.2336449921131134, + "learning_rate": 0.00014468468468468467, + "loss": 0.3672, + "step": 3458 + }, + { + "epoch": 1.0368746486790332, + "grad_norm": 0.22813144326210022, + "learning_rate": 0.00014463963963963963, + "loss": 0.3692, + "step": 3459 + }, + { + "epoch": 1.0371744425707325, + "grad_norm": 0.23772816359996796, + "learning_rate": 0.0001445945945945946, + "loss": 0.3726, + "step": 3460 + }, + { + "epoch": 1.037474236462432, + "grad_norm": 0.23593769967556, + "learning_rate": 0.00014454954954954953, + "loss": 0.3857, + "step": 3461 + }, + { + "epoch": 1.0377740303541316, + "grad_norm": 0.2305610328912735, + "learning_rate": 0.0001445045045045045, + "loss": 0.3595, + "step": 3462 + }, + { + "epoch": 1.038073824245831, + "grad_norm": 0.22642318904399872, + "learning_rate": 0.00014445945945945946, + "loss": 0.3483, + "step": 3463 + }, + { + "epoch": 1.0383736181375305, + "grad_norm": 0.23105306923389435, + "learning_rate": 0.0001444144144144144, + "loss": 0.3714, + "step": 3464 + }, + { + "epoch": 1.0386734120292298, + "grad_norm": 0.2172197848558426, + "learning_rate": 0.00014436936936936936, + "loss": 0.3439, + "step": 3465 + }, + { + "epoch": 1.0389732059209293, + "grad_norm": 0.2232641875743866, + "learning_rate": 0.00014432432432432432, + "loss": 0.3552, + "step": 3466 + }, + { + "epoch": 1.0392729998126289, + "grad_norm": 0.21522095799446106, + "learning_rate": 0.00014427927927927926, + "loss": 0.3323, + "step": 3467 + }, + { + "epoch": 1.0395727937043282, + "grad_norm": 0.24464233219623566, + "learning_rate": 0.00014423423423423422, + "loss": 0.3632, + "step": 3468 + }, + { + "epoch": 1.0398725875960277, + "grad_norm": 0.21524198353290558, + "learning_rate": 0.00014418918918918916, + "loss": 0.3408, + "step": 3469 + }, + { + "epoch": 1.0401723814877273, + "grad_norm": 0.21136979758739471, + "learning_rate": 0.00014414414414414412, + "loss": 0.345, + "step": 3470 + }, + { + "epoch": 1.0404721753794266, + "grad_norm": 0.23737655580043793, + "learning_rate": 0.0001440990990990991, + "loss": 0.3643, + "step": 3471 + }, + { + "epoch": 1.0407719692711261, + "grad_norm": 0.22656260430812836, + "learning_rate": 0.00014405405405405403, + "loss": 0.356, + "step": 3472 + }, + { + "epoch": 1.0410717631628255, + "grad_norm": 0.24382594227790833, + "learning_rate": 0.000144009009009009, + "loss": 0.367, + "step": 3473 + }, + { + "epoch": 1.041371557054525, + "grad_norm": 0.2477714568376541, + "learning_rate": 0.00014396396396396395, + "loss": 0.3744, + "step": 3474 + }, + { + "epoch": 1.0416713509462245, + "grad_norm": 0.22636958956718445, + "learning_rate": 0.0001439189189189189, + "loss": 0.3599, + "step": 3475 + }, + { + "epoch": 1.0419711448379239, + "grad_norm": 0.24886015057563782, + "learning_rate": 0.00014387387387387385, + "loss": 0.3772, + "step": 3476 + }, + { + "epoch": 1.0422709387296234, + "grad_norm": 0.21825815737247467, + "learning_rate": 0.00014382882882882882, + "loss": 0.3517, + "step": 3477 + }, + { + "epoch": 1.042570732621323, + "grad_norm": 0.23218809068202972, + "learning_rate": 0.00014378378378378378, + "loss": 0.364, + "step": 3478 + }, + { + "epoch": 1.0428705265130223, + "grad_norm": 0.23707517981529236, + "learning_rate": 0.00014373873873873872, + "loss": 0.3711, + "step": 3479 + }, + { + "epoch": 1.0431703204047218, + "grad_norm": 0.2252931147813797, + "learning_rate": 0.00014369369369369368, + "loss": 0.3774, + "step": 3480 + }, + { + "epoch": 1.0434701142964211, + "grad_norm": 0.2177061289548874, + "learning_rate": 0.00014364864864864865, + "loss": 0.3448, + "step": 3481 + }, + { + "epoch": 1.0437699081881207, + "grad_norm": 0.22449122369289398, + "learning_rate": 0.00014360360360360358, + "loss": 0.3588, + "step": 3482 + }, + { + "epoch": 1.0440697020798202, + "grad_norm": 0.2354591339826584, + "learning_rate": 0.00014355855855855855, + "loss": 0.3743, + "step": 3483 + }, + { + "epoch": 1.0443694959715195, + "grad_norm": 0.2261870801448822, + "learning_rate": 0.0001435135135135135, + "loss": 0.3554, + "step": 3484 + }, + { + "epoch": 1.044669289863219, + "grad_norm": 0.23348073661327362, + "learning_rate": 0.00014346846846846847, + "loss": 0.387, + "step": 3485 + }, + { + "epoch": 1.0449690837549186, + "grad_norm": 0.23522013425827026, + "learning_rate": 0.0001434234234234234, + "loss": 0.3567, + "step": 3486 + }, + { + "epoch": 1.045268877646618, + "grad_norm": 0.2213398516178131, + "learning_rate": 0.00014337837837837837, + "loss": 0.3543, + "step": 3487 + }, + { + "epoch": 1.0455686715383175, + "grad_norm": 0.24790069460868835, + "learning_rate": 0.00014333333333333334, + "loss": 0.3769, + "step": 3488 + }, + { + "epoch": 1.0458684654300168, + "grad_norm": 0.23697051405906677, + "learning_rate": 0.00014328828828828828, + "loss": 0.3662, + "step": 3489 + }, + { + "epoch": 1.0461682593217163, + "grad_norm": 0.2302704155445099, + "learning_rate": 0.00014324324324324324, + "loss": 0.3761, + "step": 3490 + }, + { + "epoch": 1.0464680532134158, + "grad_norm": 0.22645635902881622, + "learning_rate": 0.00014319819819819818, + "loss": 0.3678, + "step": 3491 + }, + { + "epoch": 1.0467678471051152, + "grad_norm": 0.23438039422035217, + "learning_rate": 0.00014315315315315314, + "loss": 0.3633, + "step": 3492 + }, + { + "epoch": 1.0470676409968147, + "grad_norm": 0.23394618928432465, + "learning_rate": 0.0001431081081081081, + "loss": 0.3636, + "step": 3493 + }, + { + "epoch": 1.0473674348885142, + "grad_norm": 0.24473746120929718, + "learning_rate": 0.00014306306306306304, + "loss": 0.3769, + "step": 3494 + }, + { + "epoch": 1.0476672287802136, + "grad_norm": 0.22074007987976074, + "learning_rate": 0.000143018018018018, + "loss": 0.3611, + "step": 3495 + }, + { + "epoch": 1.047967022671913, + "grad_norm": 0.2372681349515915, + "learning_rate": 0.00014297297297297297, + "loss": 0.3639, + "step": 3496 + }, + { + "epoch": 1.0482668165636124, + "grad_norm": 0.2267588973045349, + "learning_rate": 0.0001429279279279279, + "loss": 0.3734, + "step": 3497 + }, + { + "epoch": 1.048566610455312, + "grad_norm": 0.22533351182937622, + "learning_rate": 0.00014288288288288287, + "loss": 0.3763, + "step": 3498 + }, + { + "epoch": 1.0488664043470115, + "grad_norm": 0.22511237859725952, + "learning_rate": 0.00014283783783783783, + "loss": 0.3501, + "step": 3499 + }, + { + "epoch": 1.0491661982387108, + "grad_norm": 0.2493421882390976, + "learning_rate": 0.00014279279279279277, + "loss": 0.3953, + "step": 3500 + }, + { + "epoch": 1.0491661982387108, + "eval_loss": 0.42186981439590454, + "eval_runtime": 565.6283, + "eval_samples_per_second": 3.817, + "eval_steps_per_second": 0.477, + "step": 3500 + }, + { + "epoch": 1.0494659921304104, + "grad_norm": 0.22452357411384583, + "learning_rate": 0.00014274774774774773, + "loss": 0.3531, + "step": 3501 + }, + { + "epoch": 1.04976578602211, + "grad_norm": 0.2269439995288849, + "learning_rate": 0.0001427027027027027, + "loss": 0.3756, + "step": 3502 + }, + { + "epoch": 1.0500655799138092, + "grad_norm": 0.2126823365688324, + "learning_rate": 0.00014265765765765763, + "loss": 0.3563, + "step": 3503 + }, + { + "epoch": 1.0503653738055088, + "grad_norm": 0.2116537094116211, + "learning_rate": 0.0001426126126126126, + "loss": 0.3619, + "step": 3504 + }, + { + "epoch": 1.050665167697208, + "grad_norm": 0.2395385056734085, + "learning_rate": 0.00014256756756756753, + "loss": 0.3746, + "step": 3505 + }, + { + "epoch": 1.0509649615889076, + "grad_norm": 0.22017928957939148, + "learning_rate": 0.0001425225225225225, + "loss": 0.3582, + "step": 3506 + }, + { + "epoch": 1.0512647554806072, + "grad_norm": 0.22230744361877441, + "learning_rate": 0.00014247747747747746, + "loss": 0.3367, + "step": 3507 + }, + { + "epoch": 1.0515645493723065, + "grad_norm": 0.22529073059558868, + "learning_rate": 0.00014243243243243243, + "loss": 0.3423, + "step": 3508 + }, + { + "epoch": 1.051864343264006, + "grad_norm": 0.24858680367469788, + "learning_rate": 0.00014238738738738736, + "loss": 0.4004, + "step": 3509 + }, + { + "epoch": 1.0521641371557056, + "grad_norm": 0.22725269198417664, + "learning_rate": 0.00014234234234234233, + "loss": 0.3673, + "step": 3510 + }, + { + "epoch": 1.0524639310474049, + "grad_norm": 0.2208700180053711, + "learning_rate": 0.0001422972972972973, + "loss": 0.3492, + "step": 3511 + }, + { + "epoch": 1.0527637249391044, + "grad_norm": 0.253497451543808, + "learning_rate": 0.00014225225225225223, + "loss": 0.3884, + "step": 3512 + }, + { + "epoch": 1.0530635188308037, + "grad_norm": 0.2408270537853241, + "learning_rate": 0.0001422072072072072, + "loss": 0.3813, + "step": 3513 + }, + { + "epoch": 1.0533633127225033, + "grad_norm": 0.21874786913394928, + "learning_rate": 0.00014216216216216216, + "loss": 0.3291, + "step": 3514 + }, + { + "epoch": 1.0536631066142028, + "grad_norm": 0.2416338324546814, + "learning_rate": 0.00014211711711711712, + "loss": 0.3801, + "step": 3515 + }, + { + "epoch": 1.0539629005059021, + "grad_norm": 0.2278902381658554, + "learning_rate": 0.00014207207207207206, + "loss": 0.3547, + "step": 3516 + }, + { + "epoch": 1.0542626943976017, + "grad_norm": 0.2442399263381958, + "learning_rate": 0.00014202702702702702, + "loss": 0.3818, + "step": 3517 + }, + { + "epoch": 1.0545624882893012, + "grad_norm": 0.2202155590057373, + "learning_rate": 0.00014198198198198198, + "loss": 0.3486, + "step": 3518 + }, + { + "epoch": 1.0548622821810005, + "grad_norm": 0.22215566039085388, + "learning_rate": 0.00014193693693693692, + "loss": 0.3729, + "step": 3519 + }, + { + "epoch": 1.0551620760727, + "grad_norm": 0.23282161355018616, + "learning_rate": 0.00014189189189189188, + "loss": 0.3695, + "step": 3520 + }, + { + "epoch": 1.0554618699643994, + "grad_norm": 0.21966330707073212, + "learning_rate": 0.00014184684684684685, + "loss": 0.3463, + "step": 3521 + }, + { + "epoch": 1.055761663856099, + "grad_norm": 0.2348901331424713, + "learning_rate": 0.00014180180180180179, + "loss": 0.3836, + "step": 3522 + }, + { + "epoch": 1.0560614577477985, + "grad_norm": 0.23036788403987885, + "learning_rate": 0.00014175675675675675, + "loss": 0.3875, + "step": 3523 + }, + { + "epoch": 1.0563612516394978, + "grad_norm": 0.2271084189414978, + "learning_rate": 0.0001417117117117117, + "loss": 0.3631, + "step": 3524 + }, + { + "epoch": 1.0566610455311973, + "grad_norm": 0.23065337538719177, + "learning_rate": 0.00014166666666666665, + "loss": 0.384, + "step": 3525 + }, + { + "epoch": 1.0569608394228966, + "grad_norm": 0.23186425864696503, + "learning_rate": 0.00014162162162162161, + "loss": 0.3733, + "step": 3526 + }, + { + "epoch": 1.0572606333145962, + "grad_norm": 0.23528079688549042, + "learning_rate": 0.00014157657657657658, + "loss": 0.377, + "step": 3527 + }, + { + "epoch": 1.0575604272062957, + "grad_norm": 0.24329833686351776, + "learning_rate": 0.00014153153153153151, + "loss": 0.3605, + "step": 3528 + }, + { + "epoch": 1.057860221097995, + "grad_norm": 0.24599115550518036, + "learning_rate": 0.00014148648648648648, + "loss": 0.3859, + "step": 3529 + }, + { + "epoch": 1.0581600149896946, + "grad_norm": 0.22416117787361145, + "learning_rate": 0.00014144144144144141, + "loss": 0.3788, + "step": 3530 + }, + { + "epoch": 1.0584598088813941, + "grad_norm": 0.23425617814064026, + "learning_rate": 0.00014139639639639638, + "loss": 0.3745, + "step": 3531 + }, + { + "epoch": 1.0587596027730934, + "grad_norm": 0.2239811271429062, + "learning_rate": 0.00014135135135135134, + "loss": 0.3617, + "step": 3532 + }, + { + "epoch": 1.059059396664793, + "grad_norm": 0.2348106950521469, + "learning_rate": 0.00014130630630630628, + "loss": 0.3637, + "step": 3533 + }, + { + "epoch": 1.0593591905564925, + "grad_norm": 0.2254401445388794, + "learning_rate": 0.00014126126126126124, + "loss": 0.3395, + "step": 3534 + }, + { + "epoch": 1.0596589844481918, + "grad_norm": 0.22252045571804047, + "learning_rate": 0.0001412162162162162, + "loss": 0.3716, + "step": 3535 + }, + { + "epoch": 1.0599587783398914, + "grad_norm": 0.2396722286939621, + "learning_rate": 0.00014117117117117114, + "loss": 0.3808, + "step": 3536 + }, + { + "epoch": 1.0602585722315907, + "grad_norm": 0.24962125718593597, + "learning_rate": 0.0001411261261261261, + "loss": 0.3934, + "step": 3537 + }, + { + "epoch": 1.0605583661232902, + "grad_norm": 0.2402314692735672, + "learning_rate": 0.00014108108108108107, + "loss": 0.3801, + "step": 3538 + }, + { + "epoch": 1.0608581600149898, + "grad_norm": 0.22211486101150513, + "learning_rate": 0.000141036036036036, + "loss": 0.3721, + "step": 3539 + }, + { + "epoch": 1.061157953906689, + "grad_norm": 0.22576668858528137, + "learning_rate": 0.00014099099099099097, + "loss": 0.3477, + "step": 3540 + }, + { + "epoch": 1.0614577477983886, + "grad_norm": 0.2367440015077591, + "learning_rate": 0.00014094594594594594, + "loss": 0.3793, + "step": 3541 + }, + { + "epoch": 1.061757541690088, + "grad_norm": 0.21483252942562103, + "learning_rate": 0.0001409009009009009, + "loss": 0.3429, + "step": 3542 + }, + { + "epoch": 1.0620573355817875, + "grad_norm": 0.2304966300725937, + "learning_rate": 0.00014085585585585584, + "loss": 0.3603, + "step": 3543 + }, + { + "epoch": 1.062357129473487, + "grad_norm": 0.24434131383895874, + "learning_rate": 0.0001408108108108108, + "loss": 0.3763, + "step": 3544 + }, + { + "epoch": 1.0626569233651864, + "grad_norm": 0.2312142550945282, + "learning_rate": 0.00014076576576576576, + "loss": 0.3856, + "step": 3545 + }, + { + "epoch": 1.062956717256886, + "grad_norm": 0.22286604344844818, + "learning_rate": 0.0001407207207207207, + "loss": 0.3367, + "step": 3546 + }, + { + "epoch": 1.0632565111485854, + "grad_norm": 0.2351447492837906, + "learning_rate": 0.00014067567567567567, + "loss": 0.3725, + "step": 3547 + }, + { + "epoch": 1.0635563050402848, + "grad_norm": 0.22424407303333282, + "learning_rate": 0.00014063063063063063, + "loss": 0.3638, + "step": 3548 + }, + { + "epoch": 1.0638560989319843, + "grad_norm": 0.22963954508304596, + "learning_rate": 0.0001405855855855856, + "loss": 0.3613, + "step": 3549 + }, + { + "epoch": 1.0641558928236836, + "grad_norm": 0.23109790682792664, + "learning_rate": 0.00014054054054054053, + "loss": 0.369, + "step": 3550 + }, + { + "epoch": 1.0644556867153832, + "grad_norm": 0.2307424396276474, + "learning_rate": 0.0001404954954954955, + "loss": 0.3688, + "step": 3551 + }, + { + "epoch": 1.0647554806070827, + "grad_norm": 0.21975605189800262, + "learning_rate": 0.00014045045045045043, + "loss": 0.3301, + "step": 3552 + }, + { + "epoch": 1.065055274498782, + "grad_norm": 0.2344069480895996, + "learning_rate": 0.0001404054054054054, + "loss": 0.4069, + "step": 3553 + }, + { + "epoch": 1.0653550683904816, + "grad_norm": 0.22718745470046997, + "learning_rate": 0.00014036036036036036, + "loss": 0.3613, + "step": 3554 + }, + { + "epoch": 1.065654862282181, + "grad_norm": 0.22939084470272064, + "learning_rate": 0.0001403153153153153, + "loss": 0.3678, + "step": 3555 + }, + { + "epoch": 1.0659546561738804, + "grad_norm": 0.2331932783126831, + "learning_rate": 0.00014027027027027026, + "loss": 0.3744, + "step": 3556 + }, + { + "epoch": 1.06625445006558, + "grad_norm": 0.24038521945476532, + "learning_rate": 0.00014022522522522522, + "loss": 0.3608, + "step": 3557 + }, + { + "epoch": 1.0665542439572793, + "grad_norm": 0.24839916825294495, + "learning_rate": 0.00014018018018018016, + "loss": 0.369, + "step": 3558 + }, + { + "epoch": 1.0668540378489788, + "grad_norm": 0.22673757374286652, + "learning_rate": 0.00014013513513513512, + "loss": 0.3573, + "step": 3559 + }, + { + "epoch": 1.0671538317406783, + "grad_norm": 0.23155052959918976, + "learning_rate": 0.0001400900900900901, + "loss": 0.3507, + "step": 3560 + }, + { + "epoch": 1.0674536256323777, + "grad_norm": 0.23616757988929749, + "learning_rate": 0.00014004504504504502, + "loss": 0.3675, + "step": 3561 + }, + { + "epoch": 1.0677534195240772, + "grad_norm": 0.22857490181922913, + "learning_rate": 0.00014, + "loss": 0.3508, + "step": 3562 + }, + { + "epoch": 1.0680532134157767, + "grad_norm": 0.23677614331245422, + "learning_rate": 0.00013995495495495495, + "loss": 0.3545, + "step": 3563 + }, + { + "epoch": 1.068353007307476, + "grad_norm": 0.23888587951660156, + "learning_rate": 0.0001399099099099099, + "loss": 0.3679, + "step": 3564 + }, + { + "epoch": 1.0686528011991756, + "grad_norm": 0.24625924229621887, + "learning_rate": 0.00013986486486486485, + "loss": 0.349, + "step": 3565 + }, + { + "epoch": 1.068952595090875, + "grad_norm": 0.24238745868206024, + "learning_rate": 0.0001398198198198198, + "loss": 0.3867, + "step": 3566 + }, + { + "epoch": 1.0692523889825745, + "grad_norm": 0.2739328444004059, + "learning_rate": 0.00013977477477477475, + "loss": 0.395, + "step": 3567 + }, + { + "epoch": 1.069552182874274, + "grad_norm": 0.24995070695877075, + "learning_rate": 0.00013972972972972972, + "loss": 0.3247, + "step": 3568 + }, + { + "epoch": 1.0698519767659733, + "grad_norm": 0.2289031744003296, + "learning_rate": 0.00013968468468468465, + "loss": 0.3629, + "step": 3569 + }, + { + "epoch": 1.0701517706576729, + "grad_norm": 0.23290695250034332, + "learning_rate": 0.00013963963963963962, + "loss": 0.3619, + "step": 3570 + }, + { + "epoch": 1.0704515645493724, + "grad_norm": 0.250203013420105, + "learning_rate": 0.00013959459459459458, + "loss": 0.3592, + "step": 3571 + }, + { + "epoch": 1.0707513584410717, + "grad_norm": 0.24303412437438965, + "learning_rate": 0.00013954954954954955, + "loss": 0.3439, + "step": 3572 + }, + { + "epoch": 1.0710511523327713, + "grad_norm": 0.23078800737857819, + "learning_rate": 0.00013950450450450448, + "loss": 0.3933, + "step": 3573 + }, + { + "epoch": 1.0713509462244706, + "grad_norm": 0.23219646513462067, + "learning_rate": 0.00013945945945945945, + "loss": 0.3565, + "step": 3574 + }, + { + "epoch": 1.0716507401161701, + "grad_norm": 0.25298643112182617, + "learning_rate": 0.0001394144144144144, + "loss": 0.3848, + "step": 3575 + }, + { + "epoch": 1.0719505340078697, + "grad_norm": 0.23267370462417603, + "learning_rate": 0.00013936936936936937, + "loss": 0.3515, + "step": 3576 + }, + { + "epoch": 1.072250327899569, + "grad_norm": 0.23744705319404602, + "learning_rate": 0.0001393243243243243, + "loss": 0.3963, + "step": 3577 + }, + { + "epoch": 1.0725501217912685, + "grad_norm": 0.23229120671749115, + "learning_rate": 0.00013927927927927927, + "loss": 0.3395, + "step": 3578 + }, + { + "epoch": 1.072849915682968, + "grad_norm": 0.25329065322875977, + "learning_rate": 0.00013923423423423424, + "loss": 0.352, + "step": 3579 + }, + { + "epoch": 1.0731497095746674, + "grad_norm": 0.22956810891628265, + "learning_rate": 0.00013918918918918917, + "loss": 0.3792, + "step": 3580 + }, + { + "epoch": 1.073449503466367, + "grad_norm": 0.23131117224693298, + "learning_rate": 0.00013914414414414414, + "loss": 0.3503, + "step": 3581 + }, + { + "epoch": 1.0737492973580662, + "grad_norm": 0.22814302146434784, + "learning_rate": 0.0001390990990990991, + "loss": 0.3675, + "step": 3582 + }, + { + "epoch": 1.0740490912497658, + "grad_norm": 0.23262296617031097, + "learning_rate": 0.00013905405405405404, + "loss": 0.3607, + "step": 3583 + }, + { + "epoch": 1.0743488851414653, + "grad_norm": 0.23462675511837006, + "learning_rate": 0.000139009009009009, + "loss": 0.3831, + "step": 3584 + }, + { + "epoch": 1.0746486790331646, + "grad_norm": 0.24443736672401428, + "learning_rate": 0.00013896396396396397, + "loss": 0.3735, + "step": 3585 + }, + { + "epoch": 1.0749484729248642, + "grad_norm": 0.22071777284145355, + "learning_rate": 0.0001389189189189189, + "loss": 0.3607, + "step": 3586 + }, + { + "epoch": 1.0752482668165637, + "grad_norm": 0.25556886196136475, + "learning_rate": 0.00013887387387387387, + "loss": 0.383, + "step": 3587 + }, + { + "epoch": 1.075548060708263, + "grad_norm": 0.23284411430358887, + "learning_rate": 0.0001388288288288288, + "loss": 0.376, + "step": 3588 + }, + { + "epoch": 1.0758478545999626, + "grad_norm": 0.22534599900245667, + "learning_rate": 0.00013878378378378377, + "loss": 0.3647, + "step": 3589 + }, + { + "epoch": 1.076147648491662, + "grad_norm": 0.22576001286506653, + "learning_rate": 0.00013873873873873873, + "loss": 0.356, + "step": 3590 + }, + { + "epoch": 1.0764474423833614, + "grad_norm": 0.231540247797966, + "learning_rate": 0.00013869369369369367, + "loss": 0.3917, + "step": 3591 + }, + { + "epoch": 1.076747236275061, + "grad_norm": 0.22077928483486176, + "learning_rate": 0.00013864864864864863, + "loss": 0.3639, + "step": 3592 + }, + { + "epoch": 1.0770470301667603, + "grad_norm": 0.22737953066825867, + "learning_rate": 0.0001386036036036036, + "loss": 0.3544, + "step": 3593 + }, + { + "epoch": 1.0773468240584598, + "grad_norm": 0.250492125749588, + "learning_rate": 0.00013855855855855853, + "loss": 0.3821, + "step": 3594 + }, + { + "epoch": 1.0776466179501594, + "grad_norm": 0.23965615034103394, + "learning_rate": 0.0001385135135135135, + "loss": 0.3623, + "step": 3595 + }, + { + "epoch": 1.0779464118418587, + "grad_norm": 0.24302807450294495, + "learning_rate": 0.00013846846846846846, + "loss": 0.38, + "step": 3596 + }, + { + "epoch": 1.0782462057335582, + "grad_norm": 0.2204785794019699, + "learning_rate": 0.0001384234234234234, + "loss": 0.3435, + "step": 3597 + }, + { + "epoch": 1.0785459996252575, + "grad_norm": 0.2180601954460144, + "learning_rate": 0.00013837837837837836, + "loss": 0.3477, + "step": 3598 + }, + { + "epoch": 1.078845793516957, + "grad_norm": 0.22917163372039795, + "learning_rate": 0.00013833333333333333, + "loss": 0.3551, + "step": 3599 + }, + { + "epoch": 1.0791455874086566, + "grad_norm": 0.24759288132190704, + "learning_rate": 0.00013828828828828826, + "loss": 0.3666, + "step": 3600 + }, + { + "epoch": 1.079445381300356, + "grad_norm": 0.22639593482017517, + "learning_rate": 0.00013824324324324323, + "loss": 0.3419, + "step": 3601 + }, + { + "epoch": 1.0797451751920555, + "grad_norm": 0.24272778630256653, + "learning_rate": 0.0001381981981981982, + "loss": 0.3755, + "step": 3602 + }, + { + "epoch": 1.080044969083755, + "grad_norm": 0.23822729289531708, + "learning_rate": 0.00013815315315315313, + "loss": 0.3642, + "step": 3603 + }, + { + "epoch": 1.0803447629754543, + "grad_norm": 0.25243374705314636, + "learning_rate": 0.0001381081081081081, + "loss": 0.3754, + "step": 3604 + }, + { + "epoch": 1.0806445568671539, + "grad_norm": 0.24384267628192902, + "learning_rate": 0.00013806306306306305, + "loss": 0.3727, + "step": 3605 + }, + { + "epoch": 1.0809443507588532, + "grad_norm": 0.2210264801979065, + "learning_rate": 0.00013801801801801802, + "loss": 0.3527, + "step": 3606 + }, + { + "epoch": 1.0812441446505527, + "grad_norm": 0.23943836987018585, + "learning_rate": 0.00013797297297297296, + "loss": 0.3624, + "step": 3607 + }, + { + "epoch": 1.0815439385422523, + "grad_norm": 0.2331855595111847, + "learning_rate": 0.00013792792792792792, + "loss": 0.3677, + "step": 3608 + }, + { + "epoch": 1.0818437324339516, + "grad_norm": 0.23624837398529053, + "learning_rate": 0.00013788288288288288, + "loss": 0.3539, + "step": 3609 + }, + { + "epoch": 1.0821435263256511, + "grad_norm": 0.24074693024158478, + "learning_rate": 0.00013783783783783782, + "loss": 0.3615, + "step": 3610 + }, + { + "epoch": 1.0824433202173505, + "grad_norm": 0.22781312465667725, + "learning_rate": 0.00013779279279279278, + "loss": 0.3633, + "step": 3611 + }, + { + "epoch": 1.08274311410905, + "grad_norm": 0.2411901205778122, + "learning_rate": 0.00013774774774774775, + "loss": 0.352, + "step": 3612 + }, + { + "epoch": 1.0830429080007495, + "grad_norm": 0.24481438100337982, + "learning_rate": 0.00013770270270270268, + "loss": 0.353, + "step": 3613 + }, + { + "epoch": 1.0833427018924489, + "grad_norm": 0.2313372939825058, + "learning_rate": 0.00013765765765765765, + "loss": 0.3857, + "step": 3614 + }, + { + "epoch": 1.0836424957841484, + "grad_norm": 0.23338955640792847, + "learning_rate": 0.0001376126126126126, + "loss": 0.3852, + "step": 3615 + }, + { + "epoch": 1.083942289675848, + "grad_norm": 0.24845603108406067, + "learning_rate": 0.00013756756756756755, + "loss": 0.3418, + "step": 3616 + }, + { + "epoch": 1.0842420835675473, + "grad_norm": 0.22615694999694824, + "learning_rate": 0.0001375225225225225, + "loss": 0.3652, + "step": 3617 + }, + { + "epoch": 1.0845418774592468, + "grad_norm": 0.21894417703151703, + "learning_rate": 0.00013747747747747748, + "loss": 0.3343, + "step": 3618 + }, + { + "epoch": 1.0848416713509463, + "grad_norm": 0.23376759886741638, + "learning_rate": 0.0001374324324324324, + "loss": 0.3806, + "step": 3619 + }, + { + "epoch": 1.0851414652426457, + "grad_norm": 0.24642081558704376, + "learning_rate": 0.00013738738738738738, + "loss": 0.3869, + "step": 3620 + }, + { + "epoch": 1.0854412591343452, + "grad_norm": 0.2276991456747055, + "learning_rate": 0.00013734234234234234, + "loss": 0.3676, + "step": 3621 + }, + { + "epoch": 1.0857410530260445, + "grad_norm": 0.22866936028003693, + "learning_rate": 0.00013729729729729728, + "loss": 0.3566, + "step": 3622 + }, + { + "epoch": 1.086040846917744, + "grad_norm": 0.23887404799461365, + "learning_rate": 0.00013725225225225224, + "loss": 0.3738, + "step": 3623 + }, + { + "epoch": 1.0863406408094436, + "grad_norm": 0.23940208554267883, + "learning_rate": 0.00013720720720720718, + "loss": 0.3732, + "step": 3624 + }, + { + "epoch": 1.086640434701143, + "grad_norm": 0.2483586072921753, + "learning_rate": 0.00013716216216216214, + "loss": 0.3778, + "step": 3625 + }, + { + "epoch": 1.0869402285928425, + "grad_norm": 0.23491543531417847, + "learning_rate": 0.0001371171171171171, + "loss": 0.3532, + "step": 3626 + }, + { + "epoch": 1.0872400224845418, + "grad_norm": 0.21283003687858582, + "learning_rate": 0.00013707207207207204, + "loss": 0.3328, + "step": 3627 + }, + { + "epoch": 1.0875398163762413, + "grad_norm": 0.22129040956497192, + "learning_rate": 0.000137027027027027, + "loss": 0.356, + "step": 3628 + }, + { + "epoch": 1.0878396102679408, + "grad_norm": 0.2425287812948227, + "learning_rate": 0.00013698198198198197, + "loss": 0.3915, + "step": 3629 + }, + { + "epoch": 1.0881394041596402, + "grad_norm": 0.22848957777023315, + "learning_rate": 0.0001369369369369369, + "loss": 0.382, + "step": 3630 + }, + { + "epoch": 1.0884391980513397, + "grad_norm": 0.21875622868537903, + "learning_rate": 0.00013689189189189187, + "loss": 0.3399, + "step": 3631 + }, + { + "epoch": 1.0887389919430392, + "grad_norm": 0.2495780885219574, + "learning_rate": 0.00013684684684684684, + "loss": 0.3583, + "step": 3632 + }, + { + "epoch": 1.0890387858347386, + "grad_norm": 0.22478289902210236, + "learning_rate": 0.0001368018018018018, + "loss": 0.3594, + "step": 3633 + }, + { + "epoch": 1.089338579726438, + "grad_norm": 0.24992315471172333, + "learning_rate": 0.00013675675675675674, + "loss": 0.3892, + "step": 3634 + }, + { + "epoch": 1.0896383736181376, + "grad_norm": 0.21412153542041779, + "learning_rate": 0.0001367117117117117, + "loss": 0.3504, + "step": 3635 + }, + { + "epoch": 1.089938167509837, + "grad_norm": 0.246387779712677, + "learning_rate": 0.00013666666666666666, + "loss": 0.3786, + "step": 3636 + }, + { + "epoch": 1.0902379614015365, + "grad_norm": 0.21780602633953094, + "learning_rate": 0.0001366216216216216, + "loss": 0.3603, + "step": 3637 + }, + { + "epoch": 1.0905377552932358, + "grad_norm": 0.23555536568164825, + "learning_rate": 0.00013657657657657656, + "loss": 0.3879, + "step": 3638 + }, + { + "epoch": 1.0908375491849354, + "grad_norm": 0.22709296643733978, + "learning_rate": 0.00013653153153153153, + "loss": 0.3619, + "step": 3639 + }, + { + "epoch": 1.091137343076635, + "grad_norm": 0.2261640876531601, + "learning_rate": 0.0001364864864864865, + "loss": 0.362, + "step": 3640 + }, + { + "epoch": 1.0914371369683342, + "grad_norm": 0.21862421929836273, + "learning_rate": 0.00013644144144144143, + "loss": 0.3627, + "step": 3641 + }, + { + "epoch": 1.0917369308600338, + "grad_norm": 0.2231677919626236, + "learning_rate": 0.0001363963963963964, + "loss": 0.3522, + "step": 3642 + }, + { + "epoch": 1.092036724751733, + "grad_norm": 0.23300901055335999, + "learning_rate": 0.00013635135135135136, + "loss": 0.3718, + "step": 3643 + }, + { + "epoch": 1.0923365186434326, + "grad_norm": 0.2250256985425949, + "learning_rate": 0.0001363063063063063, + "loss": 0.3666, + "step": 3644 + }, + { + "epoch": 1.0926363125351322, + "grad_norm": 0.22397859394550323, + "learning_rate": 0.00013626126126126126, + "loss": 0.3627, + "step": 3645 + }, + { + "epoch": 1.0929361064268315, + "grad_norm": 0.22734856605529785, + "learning_rate": 0.00013621621621621622, + "loss": 0.3621, + "step": 3646 + }, + { + "epoch": 1.093235900318531, + "grad_norm": 0.2459496110677719, + "learning_rate": 0.00013617117117117116, + "loss": 0.3275, + "step": 3647 + }, + { + "epoch": 1.0935356942102306, + "grad_norm": 0.23530100286006927, + "learning_rate": 0.00013612612612612612, + "loss": 0.3507, + "step": 3648 + }, + { + "epoch": 1.0938354881019299, + "grad_norm": 0.2462019920349121, + "learning_rate": 0.00013608108108108106, + "loss": 0.3558, + "step": 3649 + }, + { + "epoch": 1.0941352819936294, + "grad_norm": 0.2784384489059448, + "learning_rate": 0.00013603603603603602, + "loss": 0.3739, + "step": 3650 + }, + { + "epoch": 1.0944350758853287, + "grad_norm": 0.2294548749923706, + "learning_rate": 0.00013599099099099099, + "loss": 0.3749, + "step": 3651 + }, + { + "epoch": 1.0947348697770283, + "grad_norm": 0.26810503005981445, + "learning_rate": 0.00013594594594594592, + "loss": 0.3896, + "step": 3652 + }, + { + "epoch": 1.0950346636687278, + "grad_norm": 0.2433824986219406, + "learning_rate": 0.0001359009009009009, + "loss": 0.3547, + "step": 3653 + }, + { + "epoch": 1.0953344575604271, + "grad_norm": 0.22635456919670105, + "learning_rate": 0.00013585585585585585, + "loss": 0.3529, + "step": 3654 + }, + { + "epoch": 1.0956342514521267, + "grad_norm": 0.2657754421234131, + "learning_rate": 0.0001358108108108108, + "loss": 0.3692, + "step": 3655 + }, + { + "epoch": 1.0959340453438262, + "grad_norm": 0.23322702944278717, + "learning_rate": 0.00013576576576576575, + "loss": 0.3598, + "step": 3656 + }, + { + "epoch": 1.0962338392355255, + "grad_norm": 0.23769034445285797, + "learning_rate": 0.00013572072072072072, + "loss": 0.3685, + "step": 3657 + }, + { + "epoch": 1.096533633127225, + "grad_norm": 0.2438647300004959, + "learning_rate": 0.00013567567567567565, + "loss": 0.3744, + "step": 3658 + }, + { + "epoch": 1.0968334270189244, + "grad_norm": 0.2461502104997635, + "learning_rate": 0.00013563063063063062, + "loss": 0.3686, + "step": 3659 + }, + { + "epoch": 1.097133220910624, + "grad_norm": 0.24611039459705353, + "learning_rate": 0.00013558558558558555, + "loss": 0.3745, + "step": 3660 + }, + { + "epoch": 1.0974330148023235, + "grad_norm": 0.25352588295936584, + "learning_rate": 0.00013554054054054052, + "loss": 0.3853, + "step": 3661 + }, + { + "epoch": 1.0977328086940228, + "grad_norm": 0.22174997627735138, + "learning_rate": 0.00013549549549549548, + "loss": 0.3601, + "step": 3662 + }, + { + "epoch": 1.0980326025857223, + "grad_norm": 0.22847199440002441, + "learning_rate": 0.00013545045045045044, + "loss": 0.3519, + "step": 3663 + }, + { + "epoch": 1.0983323964774219, + "grad_norm": 0.2461545169353485, + "learning_rate": 0.00013540540540540538, + "loss": 0.3754, + "step": 3664 + }, + { + "epoch": 1.0986321903691212, + "grad_norm": 0.21870985627174377, + "learning_rate": 0.00013536036036036034, + "loss": 0.3632, + "step": 3665 + }, + { + "epoch": 1.0989319842608207, + "grad_norm": 0.23304589092731476, + "learning_rate": 0.0001353153153153153, + "loss": 0.3328, + "step": 3666 + }, + { + "epoch": 1.09923177815252, + "grad_norm": 0.2161308377981186, + "learning_rate": 0.00013527027027027025, + "loss": 0.3376, + "step": 3667 + }, + { + "epoch": 1.0995315720442196, + "grad_norm": 0.22993774712085724, + "learning_rate": 0.0001352252252252252, + "loss": 0.3711, + "step": 3668 + }, + { + "epoch": 1.0998313659359191, + "grad_norm": 0.22846129536628723, + "learning_rate": 0.00013518018018018017, + "loss": 0.3326, + "step": 3669 + }, + { + "epoch": 1.1001311598276184, + "grad_norm": 0.22285179793834686, + "learning_rate": 0.00013513513513513514, + "loss": 0.3624, + "step": 3670 + }, + { + "epoch": 1.100430953719318, + "grad_norm": 0.21985436975955963, + "learning_rate": 0.00013509009009009007, + "loss": 0.3645, + "step": 3671 + }, + { + "epoch": 1.1007307476110175, + "grad_norm": 0.24420645833015442, + "learning_rate": 0.00013504504504504504, + "loss": 0.3842, + "step": 3672 + }, + { + "epoch": 1.1010305415027168, + "grad_norm": 0.22566132247447968, + "learning_rate": 0.000135, + "loss": 0.3743, + "step": 3673 + }, + { + "epoch": 1.1013303353944164, + "grad_norm": 0.23107251524925232, + "learning_rate": 0.00013495495495495494, + "loss": 0.3735, + "step": 3674 + }, + { + "epoch": 1.1016301292861157, + "grad_norm": 0.22111618518829346, + "learning_rate": 0.0001349099099099099, + "loss": 0.3556, + "step": 3675 + }, + { + "epoch": 1.1019299231778152, + "grad_norm": 0.22829733788967133, + "learning_rate": 0.00013486486486486487, + "loss": 0.3611, + "step": 3676 + }, + { + "epoch": 1.1022297170695148, + "grad_norm": 0.22518396377563477, + "learning_rate": 0.0001348198198198198, + "loss": 0.3681, + "step": 3677 + }, + { + "epoch": 1.102529510961214, + "grad_norm": 0.21075007319450378, + "learning_rate": 0.00013477477477477477, + "loss": 0.3343, + "step": 3678 + }, + { + "epoch": 1.1028293048529136, + "grad_norm": 0.2217082977294922, + "learning_rate": 0.00013472972972972973, + "loss": 0.3608, + "step": 3679 + }, + { + "epoch": 1.1031290987446132, + "grad_norm": 0.22440844774246216, + "learning_rate": 0.00013468468468468467, + "loss": 0.3548, + "step": 3680 + }, + { + "epoch": 1.1034288926363125, + "grad_norm": 0.22126342356204987, + "learning_rate": 0.00013463963963963963, + "loss": 0.3495, + "step": 3681 + }, + { + "epoch": 1.103728686528012, + "grad_norm": 0.23776847124099731, + "learning_rate": 0.0001345945945945946, + "loss": 0.393, + "step": 3682 + }, + { + "epoch": 1.1040284804197114, + "grad_norm": 0.21629567444324493, + "learning_rate": 0.00013454954954954953, + "loss": 0.3678, + "step": 3683 + }, + { + "epoch": 1.104328274311411, + "grad_norm": 0.2131320685148239, + "learning_rate": 0.0001345045045045045, + "loss": 0.3419, + "step": 3684 + }, + { + "epoch": 1.1046280682031104, + "grad_norm": 0.22412894666194916, + "learning_rate": 0.00013445945945945943, + "loss": 0.3656, + "step": 3685 + }, + { + "epoch": 1.1049278620948098, + "grad_norm": 0.2328217476606369, + "learning_rate": 0.0001344144144144144, + "loss": 0.3534, + "step": 3686 + }, + { + "epoch": 1.1052276559865093, + "grad_norm": 0.24423038959503174, + "learning_rate": 0.00013436936936936936, + "loss": 0.3858, + "step": 3687 + }, + { + "epoch": 1.1055274498782088, + "grad_norm": 0.23228362202644348, + "learning_rate": 0.0001343243243243243, + "loss": 0.3808, + "step": 3688 + }, + { + "epoch": 1.1058272437699082, + "grad_norm": 0.22738321125507355, + "learning_rate": 0.00013427927927927926, + "loss": 0.3892, + "step": 3689 + }, + { + "epoch": 1.1061270376616077, + "grad_norm": 0.2281411737203598, + "learning_rate": 0.00013423423423423422, + "loss": 0.3624, + "step": 3690 + }, + { + "epoch": 1.106426831553307, + "grad_norm": 0.2366478443145752, + "learning_rate": 0.00013418918918918916, + "loss": 0.3901, + "step": 3691 + }, + { + "epoch": 1.1067266254450066, + "grad_norm": 0.21975024044513702, + "learning_rate": 0.00013414414414414413, + "loss": 0.3815, + "step": 3692 + }, + { + "epoch": 1.107026419336706, + "grad_norm": 0.24421407282352448, + "learning_rate": 0.0001340990990990991, + "loss": 0.3763, + "step": 3693 + }, + { + "epoch": 1.1073262132284054, + "grad_norm": 0.23885339498519897, + "learning_rate": 0.00013405405405405403, + "loss": 0.3576, + "step": 3694 + }, + { + "epoch": 1.107626007120105, + "grad_norm": 0.2415476143360138, + "learning_rate": 0.000134009009009009, + "loss": 0.3766, + "step": 3695 + }, + { + "epoch": 1.1079258010118043, + "grad_norm": 0.23010794818401337, + "learning_rate": 0.00013396396396396395, + "loss": 0.3846, + "step": 3696 + }, + { + "epoch": 1.1082255949035038, + "grad_norm": 0.24465374648571014, + "learning_rate": 0.00013391891891891892, + "loss": 0.3803, + "step": 3697 + }, + { + "epoch": 1.1085253887952033, + "grad_norm": 0.23296433687210083, + "learning_rate": 0.00013387387387387385, + "loss": 0.3702, + "step": 3698 + }, + { + "epoch": 1.1088251826869027, + "grad_norm": 0.23366378247737885, + "learning_rate": 0.00013382882882882882, + "loss": 0.3618, + "step": 3699 + }, + { + "epoch": 1.1091249765786022, + "grad_norm": 0.21989014744758606, + "learning_rate": 0.00013378378378378378, + "loss": 0.3547, + "step": 3700 + }, + { + "epoch": 1.1094247704703017, + "grad_norm": 0.23640766739845276, + "learning_rate": 0.00013373873873873872, + "loss": 0.3863, + "step": 3701 + }, + { + "epoch": 1.109724564362001, + "grad_norm": 0.22326450049877167, + "learning_rate": 0.00013369369369369368, + "loss": 0.3579, + "step": 3702 + }, + { + "epoch": 1.1100243582537006, + "grad_norm": 0.23940657079219818, + "learning_rate": 0.00013364864864864865, + "loss": 0.3679, + "step": 3703 + }, + { + "epoch": 1.1103241521454001, + "grad_norm": 0.24894234538078308, + "learning_rate": 0.0001336036036036036, + "loss": 0.3873, + "step": 3704 + }, + { + "epoch": 1.1106239460370995, + "grad_norm": 0.22873175144195557, + "learning_rate": 0.00013355855855855855, + "loss": 0.3797, + "step": 3705 + }, + { + "epoch": 1.110923739928799, + "grad_norm": 0.23857954144477844, + "learning_rate": 0.0001335135135135135, + "loss": 0.368, + "step": 3706 + }, + { + "epoch": 1.1112235338204983, + "grad_norm": 0.22301368415355682, + "learning_rate": 0.00013346846846846848, + "loss": 0.3535, + "step": 3707 + }, + { + "epoch": 1.1115233277121979, + "grad_norm": 0.23414717614650726, + "learning_rate": 0.0001334234234234234, + "loss": 0.3637, + "step": 3708 + }, + { + "epoch": 1.1118231216038974, + "grad_norm": 0.2625446021556854, + "learning_rate": 0.00013337837837837838, + "loss": 0.3959, + "step": 3709 + }, + { + "epoch": 1.1121229154955967, + "grad_norm": 0.21231703460216522, + "learning_rate": 0.0001333333333333333, + "loss": 0.3427, + "step": 3710 + }, + { + "epoch": 1.1124227093872963, + "grad_norm": 0.23999685049057007, + "learning_rate": 0.00013328828828828828, + "loss": 0.402, + "step": 3711 + }, + { + "epoch": 1.1127225032789956, + "grad_norm": 0.23020519316196442, + "learning_rate": 0.00013324324324324324, + "loss": 0.361, + "step": 3712 + }, + { + "epoch": 1.1130222971706951, + "grad_norm": 0.2160668969154358, + "learning_rate": 0.00013319819819819818, + "loss": 0.3559, + "step": 3713 + }, + { + "epoch": 1.1133220910623947, + "grad_norm": 0.23094019293785095, + "learning_rate": 0.00013315315315315314, + "loss": 0.3799, + "step": 3714 + }, + { + "epoch": 1.113621884954094, + "grad_norm": 0.23295539617538452, + "learning_rate": 0.0001331081081081081, + "loss": 0.3782, + "step": 3715 + }, + { + "epoch": 1.1139216788457935, + "grad_norm": 0.24856853485107422, + "learning_rate": 0.00013306306306306304, + "loss": 0.3878, + "step": 3716 + }, + { + "epoch": 1.114221472737493, + "grad_norm": 0.22519513964653015, + "learning_rate": 0.000133018018018018, + "loss": 0.3637, + "step": 3717 + }, + { + "epoch": 1.1145212666291924, + "grad_norm": 0.235112726688385, + "learning_rate": 0.00013297297297297297, + "loss": 0.3623, + "step": 3718 + }, + { + "epoch": 1.114821060520892, + "grad_norm": 0.2207203060388565, + "learning_rate": 0.0001329279279279279, + "loss": 0.3711, + "step": 3719 + }, + { + "epoch": 1.1151208544125915, + "grad_norm": 0.2256298065185547, + "learning_rate": 0.00013288288288288287, + "loss": 0.3407, + "step": 3720 + }, + { + "epoch": 1.1154206483042908, + "grad_norm": 0.21627315878868103, + "learning_rate": 0.0001328378378378378, + "loss": 0.3514, + "step": 3721 + }, + { + "epoch": 1.1157204421959903, + "grad_norm": 0.22296972572803497, + "learning_rate": 0.00013279279279279277, + "loss": 0.3598, + "step": 3722 + }, + { + "epoch": 1.1160202360876896, + "grad_norm": 0.23051004111766815, + "learning_rate": 0.00013274774774774773, + "loss": 0.3703, + "step": 3723 + }, + { + "epoch": 1.1163200299793892, + "grad_norm": 0.22421251237392426, + "learning_rate": 0.00013270270270270267, + "loss": 0.3671, + "step": 3724 + }, + { + "epoch": 1.1166198238710887, + "grad_norm": 0.22339527308940887, + "learning_rate": 0.00013265765765765764, + "loss": 0.3396, + "step": 3725 + }, + { + "epoch": 1.116919617762788, + "grad_norm": 0.22906826436519623, + "learning_rate": 0.0001326126126126126, + "loss": 0.3492, + "step": 3726 + }, + { + "epoch": 1.1172194116544876, + "grad_norm": 0.22747504711151123, + "learning_rate": 0.00013256756756756756, + "loss": 0.3592, + "step": 3727 + }, + { + "epoch": 1.1175192055461869, + "grad_norm": 0.2335633784532547, + "learning_rate": 0.0001325225225225225, + "loss": 0.3636, + "step": 3728 + }, + { + "epoch": 1.1178189994378864, + "grad_norm": 0.22873105108737946, + "learning_rate": 0.00013247747747747746, + "loss": 0.3716, + "step": 3729 + }, + { + "epoch": 1.118118793329586, + "grad_norm": 0.23329488933086395, + "learning_rate": 0.00013243243243243243, + "loss": 0.3573, + "step": 3730 + }, + { + "epoch": 1.1184185872212853, + "grad_norm": 0.21844343841075897, + "learning_rate": 0.0001323873873873874, + "loss": 0.3355, + "step": 3731 + }, + { + "epoch": 1.1187183811129848, + "grad_norm": 0.23338133096694946, + "learning_rate": 0.00013234234234234233, + "loss": 0.3792, + "step": 3732 + }, + { + "epoch": 1.1190181750046844, + "grad_norm": 0.22400586307048798, + "learning_rate": 0.0001322972972972973, + "loss": 0.3753, + "step": 3733 + }, + { + "epoch": 1.1193179688963837, + "grad_norm": 0.2163958102464676, + "learning_rate": 0.00013225225225225226, + "loss": 0.353, + "step": 3734 + }, + { + "epoch": 1.1196177627880832, + "grad_norm": 0.24042215943336487, + "learning_rate": 0.0001322072072072072, + "loss": 0.3607, + "step": 3735 + }, + { + "epoch": 1.1199175566797828, + "grad_norm": 0.22561825811862946, + "learning_rate": 0.00013216216216216216, + "loss": 0.3701, + "step": 3736 + }, + { + "epoch": 1.120217350571482, + "grad_norm": 0.22830486297607422, + "learning_rate": 0.00013211711711711712, + "loss": 0.3649, + "step": 3737 + }, + { + "epoch": 1.1205171444631816, + "grad_norm": 0.21653513610363007, + "learning_rate": 0.00013207207207207206, + "loss": 0.3464, + "step": 3738 + }, + { + "epoch": 1.120816938354881, + "grad_norm": 0.23094573616981506, + "learning_rate": 0.00013202702702702702, + "loss": 0.3829, + "step": 3739 + }, + { + "epoch": 1.1211167322465805, + "grad_norm": 0.22236552834510803, + "learning_rate": 0.00013198198198198198, + "loss": 0.341, + "step": 3740 + }, + { + "epoch": 1.12141652613828, + "grad_norm": 0.22838713228702545, + "learning_rate": 0.00013193693693693692, + "loss": 0.3724, + "step": 3741 + }, + { + "epoch": 1.1217163200299793, + "grad_norm": 0.2387278527021408, + "learning_rate": 0.00013189189189189189, + "loss": 0.3535, + "step": 3742 + }, + { + "epoch": 1.1220161139216789, + "grad_norm": 0.2243206948041916, + "learning_rate": 0.00013184684684684685, + "loss": 0.3484, + "step": 3743 + }, + { + "epoch": 1.1223159078133782, + "grad_norm": 0.21640804409980774, + "learning_rate": 0.00013180180180180179, + "loss": 0.3558, + "step": 3744 + }, + { + "epoch": 1.1226157017050777, + "grad_norm": 0.23123346269130707, + "learning_rate": 0.00013175675675675675, + "loss": 0.3549, + "step": 3745 + }, + { + "epoch": 1.1229154955967773, + "grad_norm": 0.22995956242084503, + "learning_rate": 0.0001317117117117117, + "loss": 0.3502, + "step": 3746 + }, + { + "epoch": 1.1232152894884766, + "grad_norm": 0.2435770183801651, + "learning_rate": 0.00013166666666666665, + "loss": 0.4184, + "step": 3747 + }, + { + "epoch": 1.1235150833801761, + "grad_norm": 0.21576935052871704, + "learning_rate": 0.00013162162162162161, + "loss": 0.3598, + "step": 3748 + }, + { + "epoch": 1.1238148772718757, + "grad_norm": 0.22539666295051575, + "learning_rate": 0.00013157657657657655, + "loss": 0.3527, + "step": 3749 + }, + { + "epoch": 1.124114671163575, + "grad_norm": 0.24922697246074677, + "learning_rate": 0.00013153153153153152, + "loss": 0.3499, + "step": 3750 + }, + { + "epoch": 1.1244144650552745, + "grad_norm": 0.2371659129858017, + "learning_rate": 0.00013148648648648648, + "loss": 0.3837, + "step": 3751 + }, + { + "epoch": 1.1247142589469739, + "grad_norm": 0.22361376881599426, + "learning_rate": 0.00013144144144144142, + "loss": 0.3675, + "step": 3752 + }, + { + "epoch": 1.1250140528386734, + "grad_norm": 0.24542655050754547, + "learning_rate": 0.00013139639639639638, + "loss": 0.3922, + "step": 3753 + }, + { + "epoch": 1.125313846730373, + "grad_norm": 0.22801847755908966, + "learning_rate": 0.00013135135135135134, + "loss": 0.357, + "step": 3754 + }, + { + "epoch": 1.1256136406220723, + "grad_norm": 0.23783008754253387, + "learning_rate": 0.00013130630630630628, + "loss": 0.3834, + "step": 3755 + }, + { + "epoch": 1.1259134345137718, + "grad_norm": 0.2301386147737503, + "learning_rate": 0.00013126126126126124, + "loss": 0.3671, + "step": 3756 + }, + { + "epoch": 1.1262132284054713, + "grad_norm": 0.2202630639076233, + "learning_rate": 0.0001312162162162162, + "loss": 0.3564, + "step": 3757 + }, + { + "epoch": 1.1265130222971707, + "grad_norm": 0.22803302109241486, + "learning_rate": 0.00013117117117117114, + "loss": 0.3716, + "step": 3758 + }, + { + "epoch": 1.1268128161888702, + "grad_norm": 0.23013994097709656, + "learning_rate": 0.0001311261261261261, + "loss": 0.3668, + "step": 3759 + }, + { + "epoch": 1.1271126100805695, + "grad_norm": 0.2381478101015091, + "learning_rate": 0.00013108108108108107, + "loss": 0.364, + "step": 3760 + }, + { + "epoch": 1.127412403972269, + "grad_norm": 0.22907504439353943, + "learning_rate": 0.00013103603603603604, + "loss": 0.3582, + "step": 3761 + }, + { + "epoch": 1.1277121978639686, + "grad_norm": 0.23410449922084808, + "learning_rate": 0.00013099099099099097, + "loss": 0.3604, + "step": 3762 + }, + { + "epoch": 1.128011991755668, + "grad_norm": 0.2244759202003479, + "learning_rate": 0.00013094594594594594, + "loss": 0.3534, + "step": 3763 + }, + { + "epoch": 1.1283117856473674, + "grad_norm": 0.22788846492767334, + "learning_rate": 0.0001309009009009009, + "loss": 0.375, + "step": 3764 + }, + { + "epoch": 1.1286115795390668, + "grad_norm": 0.23673410713672638, + "learning_rate": 0.00013085585585585584, + "loss": 0.3754, + "step": 3765 + }, + { + "epoch": 1.1289113734307663, + "grad_norm": 0.21583424508571625, + "learning_rate": 0.0001308108108108108, + "loss": 0.3523, + "step": 3766 + }, + { + "epoch": 1.1292111673224658, + "grad_norm": 0.22422835230827332, + "learning_rate": 0.00013076576576576577, + "loss": 0.3659, + "step": 3767 + }, + { + "epoch": 1.1295109612141652, + "grad_norm": 0.23020607233047485, + "learning_rate": 0.00013072072072072073, + "loss": 0.3736, + "step": 3768 + }, + { + "epoch": 1.1298107551058647, + "grad_norm": 0.22912679612636566, + "learning_rate": 0.00013067567567567567, + "loss": 0.3822, + "step": 3769 + }, + { + "epoch": 1.1301105489975642, + "grad_norm": 0.221907839179039, + "learning_rate": 0.00013063063063063063, + "loss": 0.3718, + "step": 3770 + }, + { + "epoch": 1.1304103428892636, + "grad_norm": 0.23240089416503906, + "learning_rate": 0.00013058558558558557, + "loss": 0.3703, + "step": 3771 + }, + { + "epoch": 1.130710136780963, + "grad_norm": 0.23240825533866882, + "learning_rate": 0.00013054054054054053, + "loss": 0.3412, + "step": 3772 + }, + { + "epoch": 1.1310099306726626, + "grad_norm": 0.22136814892292023, + "learning_rate": 0.0001304954954954955, + "loss": 0.3667, + "step": 3773 + }, + { + "epoch": 1.131309724564362, + "grad_norm": 0.23060092329978943, + "learning_rate": 0.00013045045045045043, + "loss": 0.3689, + "step": 3774 + }, + { + "epoch": 1.1316095184560615, + "grad_norm": 0.23062026500701904, + "learning_rate": 0.0001304054054054054, + "loss": 0.3546, + "step": 3775 + }, + { + "epoch": 1.1319093123477608, + "grad_norm": 0.22831834852695465, + "learning_rate": 0.00013036036036036036, + "loss": 0.3699, + "step": 3776 + }, + { + "epoch": 1.1322091062394604, + "grad_norm": 0.22447456419467926, + "learning_rate": 0.0001303153153153153, + "loss": 0.368, + "step": 3777 + }, + { + "epoch": 1.13250890013116, + "grad_norm": 0.21648916602134705, + "learning_rate": 0.00013027027027027026, + "loss": 0.3498, + "step": 3778 + }, + { + "epoch": 1.1328086940228592, + "grad_norm": 0.2411709874868393, + "learning_rate": 0.00013022522522522522, + "loss": 0.3729, + "step": 3779 + }, + { + "epoch": 1.1331084879145588, + "grad_norm": 0.2325679361820221, + "learning_rate": 0.00013018018018018016, + "loss": 0.3459, + "step": 3780 + }, + { + "epoch": 1.133408281806258, + "grad_norm": 0.22246721386909485, + "learning_rate": 0.00013013513513513512, + "loss": 0.3518, + "step": 3781 + }, + { + "epoch": 1.1337080756979576, + "grad_norm": 0.23394489288330078, + "learning_rate": 0.00013009009009009006, + "loss": 0.3825, + "step": 3782 + }, + { + "epoch": 1.1340078695896572, + "grad_norm": 0.21840216219425201, + "learning_rate": 0.00013004504504504502, + "loss": 0.3584, + "step": 3783 + }, + { + "epoch": 1.1343076634813565, + "grad_norm": 0.2106565684080124, + "learning_rate": 0.00013, + "loss": 0.3458, + "step": 3784 + }, + { + "epoch": 1.134607457373056, + "grad_norm": 0.24276579916477203, + "learning_rate": 0.00012995495495495493, + "loss": 0.3997, + "step": 3785 + }, + { + "epoch": 1.1349072512647556, + "grad_norm": 0.226302832365036, + "learning_rate": 0.0001299099099099099, + "loss": 0.3594, + "step": 3786 + }, + { + "epoch": 1.1352070451564549, + "grad_norm": 0.23036521673202515, + "learning_rate": 0.00012986486486486485, + "loss": 0.3705, + "step": 3787 + }, + { + "epoch": 1.1355068390481544, + "grad_norm": 0.21445749700069427, + "learning_rate": 0.00012981981981981982, + "loss": 0.3596, + "step": 3788 + }, + { + "epoch": 1.135806632939854, + "grad_norm": 0.23045356571674347, + "learning_rate": 0.00012977477477477475, + "loss": 0.3622, + "step": 3789 + }, + { + "epoch": 1.1361064268315533, + "grad_norm": 0.21341310441493988, + "learning_rate": 0.00012972972972972972, + "loss": 0.3351, + "step": 3790 + }, + { + "epoch": 1.1364062207232528, + "grad_norm": 0.2475743591785431, + "learning_rate": 0.00012968468468468468, + "loss": 0.384, + "step": 3791 + }, + { + "epoch": 1.1367060146149521, + "grad_norm": 0.23405541479587555, + "learning_rate": 0.00012963963963963962, + "loss": 0.381, + "step": 3792 + }, + { + "epoch": 1.1370058085066517, + "grad_norm": 0.22105379402637482, + "learning_rate": 0.00012959459459459458, + "loss": 0.3682, + "step": 3793 + }, + { + "epoch": 1.1373056023983512, + "grad_norm": 0.2180909365415573, + "learning_rate": 0.00012954954954954955, + "loss": 0.3549, + "step": 3794 + }, + { + "epoch": 1.1376053962900505, + "grad_norm": 0.23662352561950684, + "learning_rate": 0.0001295045045045045, + "loss": 0.404, + "step": 3795 + }, + { + "epoch": 1.13790519018175, + "grad_norm": 0.23034508526325226, + "learning_rate": 0.00012945945945945945, + "loss": 0.3422, + "step": 3796 + }, + { + "epoch": 1.1382049840734494, + "grad_norm": 0.2209431529045105, + "learning_rate": 0.0001294144144144144, + "loss": 0.3522, + "step": 3797 + }, + { + "epoch": 1.138504777965149, + "grad_norm": 0.2166588455438614, + "learning_rate": 0.00012936936936936937, + "loss": 0.3507, + "step": 3798 + }, + { + "epoch": 1.1388045718568485, + "grad_norm": 0.22025123238563538, + "learning_rate": 0.0001293243243243243, + "loss": 0.3624, + "step": 3799 + }, + { + "epoch": 1.1391043657485478, + "grad_norm": 0.22763989865779877, + "learning_rate": 0.00012927927927927928, + "loss": 0.3695, + "step": 3800 + }, + { + "epoch": 1.1394041596402473, + "grad_norm": 0.2226601541042328, + "learning_rate": 0.00012923423423423424, + "loss": 0.3573, + "step": 3801 + }, + { + "epoch": 1.1397039535319469, + "grad_norm": 0.2142391949892044, + "learning_rate": 0.00012918918918918918, + "loss": 0.3257, + "step": 3802 + }, + { + "epoch": 1.1400037474236462, + "grad_norm": 0.23330844938755035, + "learning_rate": 0.00012914414414414414, + "loss": 0.3619, + "step": 3803 + }, + { + "epoch": 1.1403035413153457, + "grad_norm": 0.22902002930641174, + "learning_rate": 0.0001290990990990991, + "loss": 0.3563, + "step": 3804 + }, + { + "epoch": 1.1406033352070453, + "grad_norm": 0.2570081353187561, + "learning_rate": 0.00012905405405405404, + "loss": 0.3784, + "step": 3805 + }, + { + "epoch": 1.1409031290987446, + "grad_norm": 0.2169354110956192, + "learning_rate": 0.000129009009009009, + "loss": 0.3751, + "step": 3806 + }, + { + "epoch": 1.1412029229904441, + "grad_norm": 0.2247963547706604, + "learning_rate": 0.00012896396396396394, + "loss": 0.3797, + "step": 3807 + }, + { + "epoch": 1.1415027168821434, + "grad_norm": 0.23023879528045654, + "learning_rate": 0.0001289189189189189, + "loss": 0.3571, + "step": 3808 + }, + { + "epoch": 1.141802510773843, + "grad_norm": 0.21960099041461945, + "learning_rate": 0.00012887387387387387, + "loss": 0.3531, + "step": 3809 + }, + { + "epoch": 1.1421023046655425, + "grad_norm": 0.22244469821453094, + "learning_rate": 0.0001288288288288288, + "loss": 0.3544, + "step": 3810 + }, + { + "epoch": 1.1424020985572418, + "grad_norm": 0.23384922742843628, + "learning_rate": 0.00012878378378378377, + "loss": 0.382, + "step": 3811 + }, + { + "epoch": 1.1427018924489414, + "grad_norm": 0.2118406891822815, + "learning_rate": 0.00012873873873873873, + "loss": 0.3372, + "step": 3812 + }, + { + "epoch": 1.1430016863406407, + "grad_norm": 0.22800342738628387, + "learning_rate": 0.00012869369369369367, + "loss": 0.3664, + "step": 3813 + }, + { + "epoch": 1.1433014802323402, + "grad_norm": 0.24370819330215454, + "learning_rate": 0.00012864864864864863, + "loss": 0.364, + "step": 3814 + }, + { + "epoch": 1.1436012741240398, + "grad_norm": 0.22153517603874207, + "learning_rate": 0.0001286036036036036, + "loss": 0.3664, + "step": 3815 + }, + { + "epoch": 1.143901068015739, + "grad_norm": 0.22754263877868652, + "learning_rate": 0.00012855855855855853, + "loss": 0.367, + "step": 3816 + }, + { + "epoch": 1.1442008619074386, + "grad_norm": 0.22332321107387543, + "learning_rate": 0.0001285135135135135, + "loss": 0.3625, + "step": 3817 + }, + { + "epoch": 1.1445006557991382, + "grad_norm": 0.22275400161743164, + "learning_rate": 0.00012846846846846846, + "loss": 0.3575, + "step": 3818 + }, + { + "epoch": 1.1448004496908375, + "grad_norm": 0.21963873505592346, + "learning_rate": 0.0001284234234234234, + "loss": 0.341, + "step": 3819 + }, + { + "epoch": 1.145100243582537, + "grad_norm": 0.2317275106906891, + "learning_rate": 0.00012837837837837836, + "loss": 0.3631, + "step": 3820 + }, + { + "epoch": 1.1454000374742366, + "grad_norm": 0.21779996156692505, + "learning_rate": 0.00012833333333333333, + "loss": 0.3643, + "step": 3821 + }, + { + "epoch": 1.145699831365936, + "grad_norm": 0.22643230855464935, + "learning_rate": 0.00012828828828828826, + "loss": 0.362, + "step": 3822 + }, + { + "epoch": 1.1459996252576354, + "grad_norm": 0.23639261722564697, + "learning_rate": 0.00012824324324324323, + "loss": 0.36, + "step": 3823 + }, + { + "epoch": 1.1462994191493348, + "grad_norm": 0.241719588637352, + "learning_rate": 0.0001281981981981982, + "loss": 0.4015, + "step": 3824 + }, + { + "epoch": 1.1465992130410343, + "grad_norm": 0.22257046401500702, + "learning_rate": 0.00012815315315315315, + "loss": 0.3648, + "step": 3825 + }, + { + "epoch": 1.1468990069327338, + "grad_norm": 0.2236374020576477, + "learning_rate": 0.0001281081081081081, + "loss": 0.3684, + "step": 3826 + }, + { + "epoch": 1.1471988008244332, + "grad_norm": 0.2196348011493683, + "learning_rate": 0.00012806306306306306, + "loss": 0.3627, + "step": 3827 + }, + { + "epoch": 1.1474985947161327, + "grad_norm": 0.24044060707092285, + "learning_rate": 0.00012801801801801802, + "loss": 0.3706, + "step": 3828 + }, + { + "epoch": 1.147798388607832, + "grad_norm": 0.2156895101070404, + "learning_rate": 0.00012797297297297296, + "loss": 0.3697, + "step": 3829 + }, + { + "epoch": 1.1480981824995315, + "grad_norm": 0.2214498221874237, + "learning_rate": 0.00012792792792792792, + "loss": 0.3719, + "step": 3830 + }, + { + "epoch": 1.148397976391231, + "grad_norm": 0.2220916897058487, + "learning_rate": 0.00012788288288288288, + "loss": 0.367, + "step": 3831 + }, + { + "epoch": 1.1486977702829304, + "grad_norm": 0.23982498049736023, + "learning_rate": 0.00012783783783783782, + "loss": 0.3856, + "step": 3832 + }, + { + "epoch": 1.14899756417463, + "grad_norm": 0.22089318931102753, + "learning_rate": 0.00012779279279279278, + "loss": 0.3337, + "step": 3833 + }, + { + "epoch": 1.1492973580663295, + "grad_norm": 0.20539483428001404, + "learning_rate": 0.00012774774774774775, + "loss": 0.3382, + "step": 3834 + }, + { + "epoch": 1.1495971519580288, + "grad_norm": 0.2173803299665451, + "learning_rate": 0.00012770270270270269, + "loss": 0.3595, + "step": 3835 + }, + { + "epoch": 1.1498969458497283, + "grad_norm": 0.2402333915233612, + "learning_rate": 0.00012765765765765765, + "loss": 0.3714, + "step": 3836 + }, + { + "epoch": 1.1501967397414279, + "grad_norm": 0.2177773267030716, + "learning_rate": 0.0001276126126126126, + "loss": 0.348, + "step": 3837 + }, + { + "epoch": 1.1504965336331272, + "grad_norm": 0.22336289286613464, + "learning_rate": 0.00012756756756756755, + "loss": 0.3722, + "step": 3838 + }, + { + "epoch": 1.1507963275248267, + "grad_norm": 0.20858174562454224, + "learning_rate": 0.00012752252252252251, + "loss": 0.3337, + "step": 3839 + }, + { + "epoch": 1.151096121416526, + "grad_norm": 0.22657938301563263, + "learning_rate": 0.00012747747747747748, + "loss": 0.3641, + "step": 3840 + }, + { + "epoch": 1.1513959153082256, + "grad_norm": 0.23537993431091309, + "learning_rate": 0.00012743243243243241, + "loss": 0.3679, + "step": 3841 + }, + { + "epoch": 1.1516957091999251, + "grad_norm": 0.23289118707180023, + "learning_rate": 0.00012738738738738738, + "loss": 0.3783, + "step": 3842 + }, + { + "epoch": 1.1519955030916245, + "grad_norm": 0.2241375595331192, + "learning_rate": 0.00012734234234234231, + "loss": 0.351, + "step": 3843 + }, + { + "epoch": 1.152295296983324, + "grad_norm": 0.22272761166095734, + "learning_rate": 0.00012729729729729728, + "loss": 0.3466, + "step": 3844 + }, + { + "epoch": 1.1525950908750233, + "grad_norm": 0.22620238363742828, + "learning_rate": 0.00012725225225225224, + "loss": 0.3567, + "step": 3845 + }, + { + "epoch": 1.1528948847667229, + "grad_norm": 0.23571644723415375, + "learning_rate": 0.00012720720720720718, + "loss": 0.3679, + "step": 3846 + }, + { + "epoch": 1.1531946786584224, + "grad_norm": 0.2184414118528366, + "learning_rate": 0.00012716216216216214, + "loss": 0.3329, + "step": 3847 + }, + { + "epoch": 1.1534944725501217, + "grad_norm": 0.23073722422122955, + "learning_rate": 0.0001271171171171171, + "loss": 0.3613, + "step": 3848 + }, + { + "epoch": 1.1537942664418213, + "grad_norm": 0.21744020283222198, + "learning_rate": 0.00012707207207207204, + "loss": 0.378, + "step": 3849 + }, + { + "epoch": 1.1540940603335208, + "grad_norm": 0.23165687918663025, + "learning_rate": 0.000127027027027027, + "loss": 0.3623, + "step": 3850 + }, + { + "epoch": 1.1543938542252201, + "grad_norm": 0.2230186015367508, + "learning_rate": 0.00012698198198198197, + "loss": 0.3849, + "step": 3851 + }, + { + "epoch": 1.1546936481169197, + "grad_norm": 0.2341298907995224, + "learning_rate": 0.00012693693693693694, + "loss": 0.3744, + "step": 3852 + }, + { + "epoch": 1.1549934420086192, + "grad_norm": 0.2185104489326477, + "learning_rate": 0.00012689189189189187, + "loss": 0.3675, + "step": 3853 + }, + { + "epoch": 1.1552932359003185, + "grad_norm": 0.21571005880832672, + "learning_rate": 0.00012684684684684684, + "loss": 0.3428, + "step": 3854 + }, + { + "epoch": 1.155593029792018, + "grad_norm": 0.221488818526268, + "learning_rate": 0.0001268018018018018, + "loss": 0.3449, + "step": 3855 + }, + { + "epoch": 1.1558928236837174, + "grad_norm": 0.2154919058084488, + "learning_rate": 0.00012675675675675674, + "loss": 0.3392, + "step": 3856 + }, + { + "epoch": 1.156192617575417, + "grad_norm": 0.22724199295043945, + "learning_rate": 0.0001267117117117117, + "loss": 0.3618, + "step": 3857 + }, + { + "epoch": 1.1564924114671165, + "grad_norm": 0.22333818674087524, + "learning_rate": 0.00012666666666666666, + "loss": 0.3547, + "step": 3858 + }, + { + "epoch": 1.1567922053588158, + "grad_norm": 0.22069819271564484, + "learning_rate": 0.00012662162162162163, + "loss": 0.3636, + "step": 3859 + }, + { + "epoch": 1.1570919992505153, + "grad_norm": 0.21591633558273315, + "learning_rate": 0.00012657657657657657, + "loss": 0.3574, + "step": 3860 + }, + { + "epoch": 1.1573917931422146, + "grad_norm": 0.24098694324493408, + "learning_rate": 0.00012653153153153153, + "loss": 0.3836, + "step": 3861 + }, + { + "epoch": 1.1576915870339142, + "grad_norm": 0.2127639651298523, + "learning_rate": 0.0001264864864864865, + "loss": 0.3508, + "step": 3862 + }, + { + "epoch": 1.1579913809256137, + "grad_norm": 0.2245662361383438, + "learning_rate": 0.00012644144144144143, + "loss": 0.3617, + "step": 3863 + }, + { + "epoch": 1.158291174817313, + "grad_norm": 0.2253742665052414, + "learning_rate": 0.0001263963963963964, + "loss": 0.3689, + "step": 3864 + }, + { + "epoch": 1.1585909687090126, + "grad_norm": 0.22618187963962555, + "learning_rate": 0.00012635135135135133, + "loss": 0.3588, + "step": 3865 + }, + { + "epoch": 1.1588907626007119, + "grad_norm": 0.21666477620601654, + "learning_rate": 0.0001263063063063063, + "loss": 0.3447, + "step": 3866 + }, + { + "epoch": 1.1591905564924114, + "grad_norm": 0.20630092918872833, + "learning_rate": 0.00012626126126126126, + "loss": 0.3466, + "step": 3867 + }, + { + "epoch": 1.159490350384111, + "grad_norm": 0.22365500032901764, + "learning_rate": 0.0001262162162162162, + "loss": 0.363, + "step": 3868 + }, + { + "epoch": 1.1597901442758103, + "grad_norm": 0.22614437341690063, + "learning_rate": 0.00012617117117117116, + "loss": 0.3689, + "step": 3869 + }, + { + "epoch": 1.1600899381675098, + "grad_norm": 0.21964935958385468, + "learning_rate": 0.00012612612612612612, + "loss": 0.3512, + "step": 3870 + }, + { + "epoch": 1.1603897320592094, + "grad_norm": 0.22623321413993835, + "learning_rate": 0.00012608108108108106, + "loss": 0.3808, + "step": 3871 + }, + { + "epoch": 1.1606895259509087, + "grad_norm": 0.24244821071624756, + "learning_rate": 0.00012603603603603602, + "loss": 0.391, + "step": 3872 + }, + { + "epoch": 1.1609893198426082, + "grad_norm": 0.21943898499011993, + "learning_rate": 0.000125990990990991, + "loss": 0.3544, + "step": 3873 + }, + { + "epoch": 1.1612891137343078, + "grad_norm": 0.2190704196691513, + "learning_rate": 0.00012594594594594592, + "loss": 0.3668, + "step": 3874 + }, + { + "epoch": 1.161588907626007, + "grad_norm": 0.23364375531673431, + "learning_rate": 0.0001259009009009009, + "loss": 0.3464, + "step": 3875 + }, + { + "epoch": 1.1618887015177066, + "grad_norm": 0.22535942494869232, + "learning_rate": 0.00012585585585585585, + "loss": 0.3547, + "step": 3876 + }, + { + "epoch": 1.162188495409406, + "grad_norm": 0.2408933788537979, + "learning_rate": 0.0001258108108108108, + "loss": 0.381, + "step": 3877 + }, + { + "epoch": 1.1624882893011055, + "grad_norm": 0.22482524812221527, + "learning_rate": 0.00012576576576576575, + "loss": 0.3638, + "step": 3878 + }, + { + "epoch": 1.162788083192805, + "grad_norm": 0.22486832737922668, + "learning_rate": 0.0001257207207207207, + "loss": 0.3801, + "step": 3879 + }, + { + "epoch": 1.1630878770845043, + "grad_norm": 0.22509555518627167, + "learning_rate": 0.00012567567567567565, + "loss": 0.3776, + "step": 3880 + }, + { + "epoch": 1.1633876709762039, + "grad_norm": 0.21738441288471222, + "learning_rate": 0.00012563063063063062, + "loss": 0.3394, + "step": 3881 + }, + { + "epoch": 1.1636874648679032, + "grad_norm": 0.24172024428844452, + "learning_rate": 0.00012558558558558558, + "loss": 0.3598, + "step": 3882 + }, + { + "epoch": 1.1639872587596027, + "grad_norm": 0.21539001166820526, + "learning_rate": 0.00012554054054054052, + "loss": 0.3463, + "step": 3883 + }, + { + "epoch": 1.1642870526513023, + "grad_norm": 0.22196736931800842, + "learning_rate": 0.00012549549549549548, + "loss": 0.3507, + "step": 3884 + }, + { + "epoch": 1.1645868465430016, + "grad_norm": 0.23154208064079285, + "learning_rate": 0.00012545045045045045, + "loss": 0.3763, + "step": 3885 + }, + { + "epoch": 1.1648866404347011, + "grad_norm": 0.23062720894813538, + "learning_rate": 0.0001254054054054054, + "loss": 0.3465, + "step": 3886 + }, + { + "epoch": 1.1651864343264007, + "grad_norm": 0.21816441416740417, + "learning_rate": 0.00012536036036036035, + "loss": 0.3488, + "step": 3887 + }, + { + "epoch": 1.1654862282181, + "grad_norm": 0.2304592877626419, + "learning_rate": 0.0001253153153153153, + "loss": 0.386, + "step": 3888 + }, + { + "epoch": 1.1657860221097995, + "grad_norm": 0.2256111055612564, + "learning_rate": 0.00012527027027027027, + "loss": 0.3537, + "step": 3889 + }, + { + "epoch": 1.166085816001499, + "grad_norm": 0.22202616930007935, + "learning_rate": 0.0001252252252252252, + "loss": 0.364, + "step": 3890 + }, + { + "epoch": 1.1663856098931984, + "grad_norm": 0.28027182817459106, + "learning_rate": 0.00012518018018018017, + "loss": 0.3567, + "step": 3891 + }, + { + "epoch": 1.166685403784898, + "grad_norm": 0.2304534763097763, + "learning_rate": 0.00012513513513513514, + "loss": 0.3668, + "step": 3892 + }, + { + "epoch": 1.1669851976765973, + "grad_norm": 0.22690708935260773, + "learning_rate": 0.00012509009009009007, + "loss": 0.3801, + "step": 3893 + }, + { + "epoch": 1.1672849915682968, + "grad_norm": 0.2346915751695633, + "learning_rate": 0.00012504504504504504, + "loss": 0.3595, + "step": 3894 + }, + { + "epoch": 1.1675847854599963, + "grad_norm": 0.22019316256046295, + "learning_rate": 0.000125, + "loss": 0.36, + "step": 3895 + }, + { + "epoch": 1.1678845793516957, + "grad_norm": 0.2175881713628769, + "learning_rate": 0.00012495495495495494, + "loss": 0.3526, + "step": 3896 + }, + { + "epoch": 1.1681843732433952, + "grad_norm": 0.22282525897026062, + "learning_rate": 0.0001249099099099099, + "loss": 0.3552, + "step": 3897 + }, + { + "epoch": 1.1684841671350945, + "grad_norm": 0.21789103746414185, + "learning_rate": 0.00012486486486486487, + "loss": 0.3591, + "step": 3898 + }, + { + "epoch": 1.168783961026794, + "grad_norm": 0.21931247413158417, + "learning_rate": 0.0001248198198198198, + "loss": 0.3583, + "step": 3899 + }, + { + "epoch": 1.1690837549184936, + "grad_norm": 0.22268635034561157, + "learning_rate": 0.00012477477477477477, + "loss": 0.3497, + "step": 3900 + }, + { + "epoch": 1.169383548810193, + "grad_norm": 0.23430244624614716, + "learning_rate": 0.00012472972972972973, + "loss": 0.3827, + "step": 3901 + }, + { + "epoch": 1.1696833427018924, + "grad_norm": 0.21991029381752014, + "learning_rate": 0.00012468468468468467, + "loss": 0.356, + "step": 3902 + }, + { + "epoch": 1.169983136593592, + "grad_norm": 0.20806784927845, + "learning_rate": 0.00012463963963963963, + "loss": 0.3456, + "step": 3903 + }, + { + "epoch": 1.1702829304852913, + "grad_norm": 0.22096467018127441, + "learning_rate": 0.00012459459459459457, + "loss": 0.3751, + "step": 3904 + }, + { + "epoch": 1.1705827243769908, + "grad_norm": 0.22532299160957336, + "learning_rate": 0.00012454954954954953, + "loss": 0.3598, + "step": 3905 + }, + { + "epoch": 1.1708825182686904, + "grad_norm": 0.23222416639328003, + "learning_rate": 0.0001245045045045045, + "loss": 0.368, + "step": 3906 + }, + { + "epoch": 1.1711823121603897, + "grad_norm": 0.21905198693275452, + "learning_rate": 0.00012445945945945943, + "loss": 0.3648, + "step": 3907 + }, + { + "epoch": 1.1714821060520892, + "grad_norm": 0.2258433699607849, + "learning_rate": 0.0001244144144144144, + "loss": 0.3768, + "step": 3908 + }, + { + "epoch": 1.1717818999437886, + "grad_norm": 0.21878860890865326, + "learning_rate": 0.00012436936936936936, + "loss": 0.3729, + "step": 3909 + }, + { + "epoch": 1.172081693835488, + "grad_norm": 0.22248351573944092, + "learning_rate": 0.0001243243243243243, + "loss": 0.3472, + "step": 3910 + }, + { + "epoch": 1.1723814877271876, + "grad_norm": 0.23260651528835297, + "learning_rate": 0.00012427927927927926, + "loss": 0.3664, + "step": 3911 + }, + { + "epoch": 1.172681281618887, + "grad_norm": 0.22885175049304962, + "learning_rate": 0.00012423423423423423, + "loss": 0.4064, + "step": 3912 + }, + { + "epoch": 1.1729810755105865, + "grad_norm": 0.22039201855659485, + "learning_rate": 0.00012418918918918916, + "loss": 0.3432, + "step": 3913 + }, + { + "epoch": 1.1732808694022858, + "grad_norm": 0.2224431037902832, + "learning_rate": 0.00012414414414414413, + "loss": 0.3673, + "step": 3914 + }, + { + "epoch": 1.1735806632939854, + "grad_norm": 0.21871282160282135, + "learning_rate": 0.0001240990990990991, + "loss": 0.3665, + "step": 3915 + }, + { + "epoch": 1.173880457185685, + "grad_norm": 0.22808001935482025, + "learning_rate": 0.00012405405405405405, + "loss": 0.3595, + "step": 3916 + }, + { + "epoch": 1.1741802510773842, + "grad_norm": 0.21829096972942352, + "learning_rate": 0.000124009009009009, + "loss": 0.3708, + "step": 3917 + }, + { + "epoch": 1.1744800449690838, + "grad_norm": 0.23878386616706848, + "learning_rate": 0.00012396396396396395, + "loss": 0.3469, + "step": 3918 + }, + { + "epoch": 1.1747798388607833, + "grad_norm": 0.22917965054512024, + "learning_rate": 0.00012391891891891892, + "loss": 0.3734, + "step": 3919 + }, + { + "epoch": 1.1750796327524826, + "grad_norm": 0.2284645140171051, + "learning_rate": 0.00012387387387387386, + "loss": 0.3721, + "step": 3920 + }, + { + "epoch": 1.1753794266441822, + "grad_norm": 0.21777629852294922, + "learning_rate": 0.00012382882882882882, + "loss": 0.3631, + "step": 3921 + }, + { + "epoch": 1.1756792205358817, + "grad_norm": 0.22827209532260895, + "learning_rate": 0.00012378378378378378, + "loss": 0.3962, + "step": 3922 + }, + { + "epoch": 1.175979014427581, + "grad_norm": 0.22360359132289886, + "learning_rate": 0.00012373873873873875, + "loss": 0.3624, + "step": 3923 + }, + { + "epoch": 1.1762788083192806, + "grad_norm": 0.20297862589359283, + "learning_rate": 0.00012369369369369368, + "loss": 0.3348, + "step": 3924 + }, + { + "epoch": 1.1765786022109799, + "grad_norm": 0.22274337708950043, + "learning_rate": 0.00012364864864864865, + "loss": 0.3659, + "step": 3925 + }, + { + "epoch": 1.1768783961026794, + "grad_norm": 0.21908465027809143, + "learning_rate": 0.00012360360360360358, + "loss": 0.367, + "step": 3926 + }, + { + "epoch": 1.177178189994379, + "grad_norm": 0.2173020988702774, + "learning_rate": 0.00012355855855855855, + "loss": 0.3718, + "step": 3927 + }, + { + "epoch": 1.1774779838860783, + "grad_norm": 0.21801786124706268, + "learning_rate": 0.0001235135135135135, + "loss": 0.3358, + "step": 3928 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.22594833374023438, + "learning_rate": 0.00012346846846846845, + "loss": 0.3646, + "step": 3929 + }, + { + "epoch": 1.1780775716694771, + "grad_norm": 0.2159031629562378, + "learning_rate": 0.0001234234234234234, + "loss": 0.3459, + "step": 3930 + }, + { + "epoch": 1.1783773655611767, + "grad_norm": 0.23376664519309998, + "learning_rate": 0.00012337837837837838, + "loss": 0.3899, + "step": 3931 + }, + { + "epoch": 1.1786771594528762, + "grad_norm": 0.2555432915687561, + "learning_rate": 0.0001233333333333333, + "loss": 0.3729, + "step": 3932 + }, + { + "epoch": 1.1789769533445755, + "grad_norm": 0.2331700474023819, + "learning_rate": 0.00012328828828828828, + "loss": 0.3783, + "step": 3933 + }, + { + "epoch": 1.179276747236275, + "grad_norm": 0.2171568125486374, + "learning_rate": 0.00012324324324324324, + "loss": 0.3588, + "step": 3934 + }, + { + "epoch": 1.1795765411279746, + "grad_norm": 0.2255590707063675, + "learning_rate": 0.00012319819819819818, + "loss": 0.3486, + "step": 3935 + }, + { + "epoch": 1.179876335019674, + "grad_norm": 0.23395510017871857, + "learning_rate": 0.00012315315315315314, + "loss": 0.3399, + "step": 3936 + }, + { + "epoch": 1.1801761289113735, + "grad_norm": 0.22677238285541534, + "learning_rate": 0.0001231081081081081, + "loss": 0.3917, + "step": 3937 + }, + { + "epoch": 1.180475922803073, + "grad_norm": 0.22630028426647186, + "learning_rate": 0.00012306306306306304, + "loss": 0.3717, + "step": 3938 + }, + { + "epoch": 1.1807757166947723, + "grad_norm": 0.22929327189922333, + "learning_rate": 0.000123018018018018, + "loss": 0.3797, + "step": 3939 + }, + { + "epoch": 1.1810755105864719, + "grad_norm": 0.24179096519947052, + "learning_rate": 0.00012297297297297294, + "loss": 0.4066, + "step": 3940 + }, + { + "epoch": 1.1813753044781712, + "grad_norm": 0.2346000075340271, + "learning_rate": 0.0001229279279279279, + "loss": 0.3587, + "step": 3941 + }, + { + "epoch": 1.1816750983698707, + "grad_norm": 0.21742184460163116, + "learning_rate": 0.00012288288288288287, + "loss": 0.3498, + "step": 3942 + }, + { + "epoch": 1.1819748922615703, + "grad_norm": 0.22498059272766113, + "learning_rate": 0.00012283783783783783, + "loss": 0.3469, + "step": 3943 + }, + { + "epoch": 1.1822746861532696, + "grad_norm": 0.22407019138336182, + "learning_rate": 0.00012279279279279277, + "loss": 0.3871, + "step": 3944 + }, + { + "epoch": 1.1825744800449691, + "grad_norm": 0.2243795543909073, + "learning_rate": 0.00012274774774774774, + "loss": 0.3562, + "step": 3945 + }, + { + "epoch": 1.1828742739366684, + "grad_norm": 0.22106344997882843, + "learning_rate": 0.0001227027027027027, + "loss": 0.3585, + "step": 3946 + }, + { + "epoch": 1.183174067828368, + "grad_norm": 0.21711674332618713, + "learning_rate": 0.00012265765765765764, + "loss": 0.3635, + "step": 3947 + }, + { + "epoch": 1.1834738617200675, + "grad_norm": 0.22720924019813538, + "learning_rate": 0.0001226126126126126, + "loss": 0.3617, + "step": 3948 + }, + { + "epoch": 1.1837736556117668, + "grad_norm": 0.21701271831989288, + "learning_rate": 0.00012256756756756756, + "loss": 0.3605, + "step": 3949 + }, + { + "epoch": 1.1840734495034664, + "grad_norm": 0.23393398523330688, + "learning_rate": 0.00012252252252252253, + "loss": 0.3743, + "step": 3950 + }, + { + "epoch": 1.184373243395166, + "grad_norm": 0.21987706422805786, + "learning_rate": 0.00012247747747747746, + "loss": 0.3536, + "step": 3951 + }, + { + "epoch": 1.1846730372868652, + "grad_norm": 0.22225283086299896, + "learning_rate": 0.00012243243243243243, + "loss": 0.3538, + "step": 3952 + }, + { + "epoch": 1.1849728311785648, + "grad_norm": 0.24060380458831787, + "learning_rate": 0.0001223873873873874, + "loss": 0.3733, + "step": 3953 + }, + { + "epoch": 1.185272625070264, + "grad_norm": 0.2302178144454956, + "learning_rate": 0.00012234234234234233, + "loss": 0.3707, + "step": 3954 + }, + { + "epoch": 1.1855724189619636, + "grad_norm": 0.22814485430717468, + "learning_rate": 0.0001222972972972973, + "loss": 0.3678, + "step": 3955 + }, + { + "epoch": 1.1858722128536632, + "grad_norm": 0.24073362350463867, + "learning_rate": 0.00012225225225225226, + "loss": 0.3746, + "step": 3956 + }, + { + "epoch": 1.1861720067453625, + "grad_norm": 0.21149156987667084, + "learning_rate": 0.0001222072072072072, + "loss": 0.3266, + "step": 3957 + }, + { + "epoch": 1.186471800637062, + "grad_norm": 0.21905194222927094, + "learning_rate": 0.00012216216216216216, + "loss": 0.3426, + "step": 3958 + }, + { + "epoch": 1.1867715945287616, + "grad_norm": 0.22855554521083832, + "learning_rate": 0.00012211711711711712, + "loss": 0.3624, + "step": 3959 + }, + { + "epoch": 1.187071388420461, + "grad_norm": 0.23453767597675323, + "learning_rate": 0.00012207207207207206, + "loss": 0.3742, + "step": 3960 + }, + { + "epoch": 1.1873711823121604, + "grad_norm": 0.23686885833740234, + "learning_rate": 0.00012202702702702702, + "loss": 0.3956, + "step": 3961 + }, + { + "epoch": 1.1876709762038598, + "grad_norm": 0.2404448688030243, + "learning_rate": 0.00012198198198198196, + "loss": 0.3717, + "step": 3962 + }, + { + "epoch": 1.1879707700955593, + "grad_norm": 0.2192368060350418, + "learning_rate": 0.00012193693693693692, + "loss": 0.3692, + "step": 3963 + }, + { + "epoch": 1.1882705639872588, + "grad_norm": 0.2268245369195938, + "learning_rate": 0.00012189189189189189, + "loss": 0.3722, + "step": 3964 + }, + { + "epoch": 1.1885703578789582, + "grad_norm": 0.23420900106430054, + "learning_rate": 0.00012184684684684682, + "loss": 0.3696, + "step": 3965 + }, + { + "epoch": 1.1888701517706577, + "grad_norm": 0.23050832748413086, + "learning_rate": 0.00012180180180180179, + "loss": 0.3836, + "step": 3966 + }, + { + "epoch": 1.189169945662357, + "grad_norm": 0.25578629970550537, + "learning_rate": 0.00012175675675675675, + "loss": 0.3358, + "step": 3967 + }, + { + "epoch": 1.1894697395540565, + "grad_norm": 0.21223682165145874, + "learning_rate": 0.0001217117117117117, + "loss": 0.354, + "step": 3968 + }, + { + "epoch": 1.189769533445756, + "grad_norm": 0.2102338820695877, + "learning_rate": 0.00012166666666666665, + "loss": 0.3485, + "step": 3969 + }, + { + "epoch": 1.1900693273374554, + "grad_norm": 0.22144131362438202, + "learning_rate": 0.00012162162162162162, + "loss": 0.3503, + "step": 3970 + }, + { + "epoch": 1.190369121229155, + "grad_norm": 0.22547723352909088, + "learning_rate": 0.00012157657657657657, + "loss": 0.3569, + "step": 3971 + }, + { + "epoch": 1.1906689151208545, + "grad_norm": 0.20872855186462402, + "learning_rate": 0.00012153153153153153, + "loss": 0.343, + "step": 3972 + }, + { + "epoch": 1.1909687090125538, + "grad_norm": 0.2280842363834381, + "learning_rate": 0.00012148648648648648, + "loss": 0.3911, + "step": 3973 + }, + { + "epoch": 1.1912685029042533, + "grad_norm": 0.23819836974143982, + "learning_rate": 0.00012144144144144143, + "loss": 0.3952, + "step": 3974 + }, + { + "epoch": 1.1915682967959529, + "grad_norm": 0.22785331308841705, + "learning_rate": 0.0001213963963963964, + "loss": 0.3559, + "step": 3975 + }, + { + "epoch": 1.1918680906876522, + "grad_norm": 0.21852603554725647, + "learning_rate": 0.00012135135135135133, + "loss": 0.3413, + "step": 3976 + }, + { + "epoch": 1.1921678845793517, + "grad_norm": 0.2668752670288086, + "learning_rate": 0.0001213063063063063, + "loss": 0.4048, + "step": 3977 + }, + { + "epoch": 1.192467678471051, + "grad_norm": 0.223458930850029, + "learning_rate": 0.00012126126126126126, + "loss": 0.3763, + "step": 3978 + }, + { + "epoch": 1.1927674723627506, + "grad_norm": 0.2441006898880005, + "learning_rate": 0.0001212162162162162, + "loss": 0.3752, + "step": 3979 + }, + { + "epoch": 1.1930672662544501, + "grad_norm": 0.241143599152565, + "learning_rate": 0.00012117117117117116, + "loss": 0.39, + "step": 3980 + }, + { + "epoch": 1.1933670601461495, + "grad_norm": 0.22745396196842194, + "learning_rate": 0.00012112612612612612, + "loss": 0.3755, + "step": 3981 + }, + { + "epoch": 1.193666854037849, + "grad_norm": 0.21644215285778046, + "learning_rate": 0.00012108108108108106, + "loss": 0.3448, + "step": 3982 + }, + { + "epoch": 1.1939666479295483, + "grad_norm": 0.23860111832618713, + "learning_rate": 0.00012103603603603602, + "loss": 0.3878, + "step": 3983 + }, + { + "epoch": 1.1942664418212479, + "grad_norm": 0.21523654460906982, + "learning_rate": 0.00012099099099099099, + "loss": 0.3428, + "step": 3984 + }, + { + "epoch": 1.1945662357129474, + "grad_norm": 0.24360989034175873, + "learning_rate": 0.00012094594594594594, + "loss": 0.3778, + "step": 3985 + }, + { + "epoch": 1.1948660296046467, + "grad_norm": 0.23723161220550537, + "learning_rate": 0.00012090090090090089, + "loss": 0.3658, + "step": 3986 + }, + { + "epoch": 1.1951658234963463, + "grad_norm": 0.22299665212631226, + "learning_rate": 0.00012085585585585584, + "loss": 0.3742, + "step": 3987 + }, + { + "epoch": 1.1954656173880458, + "grad_norm": 0.2324703186750412, + "learning_rate": 0.0001208108108108108, + "loss": 0.3709, + "step": 3988 + }, + { + "epoch": 1.1957654112797451, + "grad_norm": 0.22136402130126953, + "learning_rate": 0.00012076576576576577, + "loss": 0.3662, + "step": 3989 + }, + { + "epoch": 1.1960652051714447, + "grad_norm": 0.23628540337085724, + "learning_rate": 0.0001207207207207207, + "loss": 0.3409, + "step": 3990 + }, + { + "epoch": 1.1963649990631442, + "grad_norm": 0.21257908642292023, + "learning_rate": 0.00012067567567567567, + "loss": 0.3382, + "step": 3991 + }, + { + "epoch": 1.1966647929548435, + "grad_norm": 0.22111082077026367, + "learning_rate": 0.00012063063063063063, + "loss": 0.3529, + "step": 3992 + }, + { + "epoch": 1.196964586846543, + "grad_norm": 0.23919793963432312, + "learning_rate": 0.00012058558558558557, + "loss": 0.3831, + "step": 3993 + }, + { + "epoch": 1.1972643807382424, + "grad_norm": 0.20639801025390625, + "learning_rate": 0.00012054054054054053, + "loss": 0.333, + "step": 3994 + }, + { + "epoch": 1.197564174629942, + "grad_norm": 0.2506180703639984, + "learning_rate": 0.0001204954954954955, + "loss": 0.3755, + "step": 3995 + }, + { + "epoch": 1.1978639685216415, + "grad_norm": 0.2168271392583847, + "learning_rate": 0.00012045045045045043, + "loss": 0.3483, + "step": 3996 + }, + { + "epoch": 1.1981637624133408, + "grad_norm": 0.21680712699890137, + "learning_rate": 0.0001204054054054054, + "loss": 0.3411, + "step": 3997 + }, + { + "epoch": 1.1984635563050403, + "grad_norm": 0.2226821780204773, + "learning_rate": 0.00012036036036036035, + "loss": 0.3511, + "step": 3998 + }, + { + "epoch": 1.1987633501967396, + "grad_norm": 0.23245584964752197, + "learning_rate": 0.0001203153153153153, + "loss": 0.3554, + "step": 3999 + }, + { + "epoch": 1.1990631440884392, + "grad_norm": 0.2246054708957672, + "learning_rate": 0.00012027027027027026, + "loss": 0.3684, + "step": 4000 + }, + { + "epoch": 1.1990631440884392, + "eval_loss": 0.41834643483161926, + "eval_runtime": 565.2826, + "eval_samples_per_second": 3.819, + "eval_steps_per_second": 0.478, + "step": 4000 + }, + { + "epoch": 1.1993629379801387, + "grad_norm": 0.22603508830070496, + "learning_rate": 0.00012022522522522521, + "loss": 0.3897, + "step": 4001 + }, + { + "epoch": 1.199662731871838, + "grad_norm": 0.2275640070438385, + "learning_rate": 0.00012018018018018017, + "loss": 0.3567, + "step": 4002 + }, + { + "epoch": 1.1999625257635376, + "grad_norm": 0.23352980613708496, + "learning_rate": 0.00012013513513513512, + "loss": 0.3807, + "step": 4003 + }, + { + "epoch": 1.200262319655237, + "grad_norm": 0.2211960405111313, + "learning_rate": 0.00012009009009009008, + "loss": 0.3546, + "step": 4004 + }, + { + "epoch": 1.2005621135469364, + "grad_norm": 0.2441369891166687, + "learning_rate": 0.00012004504504504504, + "loss": 0.3776, + "step": 4005 + }, + { + "epoch": 1.200861907438636, + "grad_norm": 0.22006067633628845, + "learning_rate": 0.00011999999999999999, + "loss": 0.3705, + "step": 4006 + }, + { + "epoch": 1.2011617013303355, + "grad_norm": 0.23932883143424988, + "learning_rate": 0.00011995495495495494, + "loss": 0.3665, + "step": 4007 + }, + { + "epoch": 1.2014614952220348, + "grad_norm": 0.2247323989868164, + "learning_rate": 0.0001199099099099099, + "loss": 0.3859, + "step": 4008 + }, + { + "epoch": 1.2017612891137344, + "grad_norm": 0.22732797265052795, + "learning_rate": 0.00011986486486486487, + "loss": 0.3782, + "step": 4009 + }, + { + "epoch": 1.2020610830054337, + "grad_norm": 0.22873279452323914, + "learning_rate": 0.0001198198198198198, + "loss": 0.3625, + "step": 4010 + }, + { + "epoch": 1.2023608768971332, + "grad_norm": 0.22076311707496643, + "learning_rate": 0.00011977477477477477, + "loss": 0.3578, + "step": 4011 + }, + { + "epoch": 1.2026606707888328, + "grad_norm": 0.2291504144668579, + "learning_rate": 0.00011972972972972972, + "loss": 0.3457, + "step": 4012 + }, + { + "epoch": 1.202960464680532, + "grad_norm": 0.22902348637580872, + "learning_rate": 0.00011968468468468467, + "loss": 0.3823, + "step": 4013 + }, + { + "epoch": 1.2032602585722316, + "grad_norm": 0.22112970054149628, + "learning_rate": 0.00011963963963963963, + "loss": 0.3687, + "step": 4014 + }, + { + "epoch": 1.203560052463931, + "grad_norm": 0.2165578454732895, + "learning_rate": 0.00011959459459459458, + "loss": 0.3469, + "step": 4015 + }, + { + "epoch": 1.2038598463556305, + "grad_norm": 0.2269822061061859, + "learning_rate": 0.00011954954954954953, + "loss": 0.3643, + "step": 4016 + }, + { + "epoch": 1.20415964024733, + "grad_norm": 0.23120154440402985, + "learning_rate": 0.0001195045045045045, + "loss": 0.3833, + "step": 4017 + }, + { + "epoch": 1.2044594341390293, + "grad_norm": 0.22641071677207947, + "learning_rate": 0.00011945945945945945, + "loss": 0.3718, + "step": 4018 + }, + { + "epoch": 1.2047592280307289, + "grad_norm": 0.23450811207294464, + "learning_rate": 0.00011941441441441441, + "loss": 0.3817, + "step": 4019 + }, + { + "epoch": 1.2050590219224284, + "grad_norm": 0.2254931777715683, + "learning_rate": 0.00011936936936936936, + "loss": 0.3693, + "step": 4020 + }, + { + "epoch": 1.2053588158141277, + "grad_norm": 0.21437515318393707, + "learning_rate": 0.00011932432432432431, + "loss": 0.3499, + "step": 4021 + }, + { + "epoch": 1.2056586097058273, + "grad_norm": 0.21812491118907928, + "learning_rate": 0.00011927927927927928, + "loss": 0.3615, + "step": 4022 + }, + { + "epoch": 1.2059584035975268, + "grad_norm": 0.23963527381420135, + "learning_rate": 0.00011923423423423421, + "loss": 0.3799, + "step": 4023 + }, + { + "epoch": 1.2062581974892261, + "grad_norm": 0.2252485454082489, + "learning_rate": 0.00011918918918918918, + "loss": 0.3422, + "step": 4024 + }, + { + "epoch": 1.2065579913809257, + "grad_norm": 0.23507623374462128, + "learning_rate": 0.00011914414414414414, + "loss": 0.3572, + "step": 4025 + }, + { + "epoch": 1.206857785272625, + "grad_norm": 0.2158452719449997, + "learning_rate": 0.00011909909909909908, + "loss": 0.3771, + "step": 4026 + }, + { + "epoch": 1.2071575791643245, + "grad_norm": 0.24523043632507324, + "learning_rate": 0.00011905405405405404, + "loss": 0.3615, + "step": 4027 + }, + { + "epoch": 1.207457373056024, + "grad_norm": 0.2354147732257843, + "learning_rate": 0.000119009009009009, + "loss": 0.3614, + "step": 4028 + }, + { + "epoch": 1.2077571669477234, + "grad_norm": 0.22273007035255432, + "learning_rate": 0.00011896396396396396, + "loss": 0.3398, + "step": 4029 + }, + { + "epoch": 1.208056960839423, + "grad_norm": 0.22855806350708008, + "learning_rate": 0.0001189189189189189, + "loss": 0.3706, + "step": 4030 + }, + { + "epoch": 1.2083567547311223, + "grad_norm": 0.23568007349967957, + "learning_rate": 0.00011887387387387387, + "loss": 0.3952, + "step": 4031 + }, + { + "epoch": 1.2086565486228218, + "grad_norm": 0.215668722987175, + "learning_rate": 0.00011882882882882882, + "loss": 0.3492, + "step": 4032 + }, + { + "epoch": 1.2089563425145213, + "grad_norm": 0.2247469574213028, + "learning_rate": 0.00011878378378378377, + "loss": 0.3547, + "step": 4033 + }, + { + "epoch": 1.2092561364062206, + "grad_norm": 0.2222067266702652, + "learning_rate": 0.00011873873873873872, + "loss": 0.35, + "step": 4034 + }, + { + "epoch": 1.2095559302979202, + "grad_norm": 0.22392643988132477, + "learning_rate": 0.00011869369369369368, + "loss": 0.3445, + "step": 4035 + }, + { + "epoch": 1.2098557241896197, + "grad_norm": 0.22978229820728302, + "learning_rate": 0.00011864864864864865, + "loss": 0.3806, + "step": 4036 + }, + { + "epoch": 1.210155518081319, + "grad_norm": 0.23751720786094666, + "learning_rate": 0.00011860360360360358, + "loss": 0.3567, + "step": 4037 + }, + { + "epoch": 1.2104553119730186, + "grad_norm": 0.2222888469696045, + "learning_rate": 0.00011855855855855855, + "loss": 0.3583, + "step": 4038 + }, + { + "epoch": 1.2107551058647181, + "grad_norm": 0.24681641161441803, + "learning_rate": 0.00011851351351351351, + "loss": 0.3746, + "step": 4039 + }, + { + "epoch": 1.2110548997564174, + "grad_norm": 0.2182309329509735, + "learning_rate": 0.00011846846846846845, + "loss": 0.3348, + "step": 4040 + }, + { + "epoch": 1.211354693648117, + "grad_norm": 0.21371771395206451, + "learning_rate": 0.00011842342342342341, + "loss": 0.3338, + "step": 4041 + }, + { + "epoch": 1.2116544875398163, + "grad_norm": 0.23629151284694672, + "learning_rate": 0.00011837837837837838, + "loss": 0.3408, + "step": 4042 + }, + { + "epoch": 1.2119542814315158, + "grad_norm": 0.23609225451946259, + "learning_rate": 0.00011833333333333331, + "loss": 0.38, + "step": 4043 + }, + { + "epoch": 1.2122540753232154, + "grad_norm": 0.21824340522289276, + "learning_rate": 0.00011828828828828828, + "loss": 0.3355, + "step": 4044 + }, + { + "epoch": 1.2125538692149147, + "grad_norm": 0.2105160504579544, + "learning_rate": 0.00011824324324324324, + "loss": 0.353, + "step": 4045 + }, + { + "epoch": 1.2128536631066142, + "grad_norm": 0.2521781027317047, + "learning_rate": 0.00011819819819819818, + "loss": 0.3899, + "step": 4046 + }, + { + "epoch": 1.2131534569983136, + "grad_norm": 0.21568675339221954, + "learning_rate": 0.00011815315315315314, + "loss": 0.3575, + "step": 4047 + }, + { + "epoch": 1.213453250890013, + "grad_norm": 0.21583522856235504, + "learning_rate": 0.00011810810810810809, + "loss": 0.365, + "step": 4048 + }, + { + "epoch": 1.2137530447817126, + "grad_norm": 0.23436518013477325, + "learning_rate": 0.00011806306306306306, + "loss": 0.3783, + "step": 4049 + }, + { + "epoch": 1.214052838673412, + "grad_norm": 0.21352584660053253, + "learning_rate": 0.00011801801801801801, + "loss": 0.3446, + "step": 4050 + }, + { + "epoch": 1.2143526325651115, + "grad_norm": 0.2259955108165741, + "learning_rate": 0.00011797297297297296, + "loss": 0.3866, + "step": 4051 + }, + { + "epoch": 1.2146524264568108, + "grad_norm": 0.215003103017807, + "learning_rate": 0.00011792792792792792, + "loss": 0.3372, + "step": 4052 + }, + { + "epoch": 1.2149522203485104, + "grad_norm": 0.21452820301055908, + "learning_rate": 0.00011788288288288288, + "loss": 0.333, + "step": 4053 + }, + { + "epoch": 1.21525201424021, + "grad_norm": 0.21691389381885529, + "learning_rate": 0.00011783783783783782, + "loss": 0.3588, + "step": 4054 + }, + { + "epoch": 1.2155518081319092, + "grad_norm": 0.22161614894866943, + "learning_rate": 0.00011779279279279279, + "loss": 0.366, + "step": 4055 + }, + { + "epoch": 1.2158516020236088, + "grad_norm": 0.22550690174102783, + "learning_rate": 0.00011774774774774775, + "loss": 0.3535, + "step": 4056 + }, + { + "epoch": 1.2161513959153083, + "grad_norm": 0.24499079585075378, + "learning_rate": 0.00011770270270270269, + "loss": 0.387, + "step": 4057 + }, + { + "epoch": 1.2164511898070076, + "grad_norm": 0.21006570756435394, + "learning_rate": 0.00011765765765765765, + "loss": 0.3313, + "step": 4058 + }, + { + "epoch": 1.2167509836987072, + "grad_norm": 0.24210864305496216, + "learning_rate": 0.0001176126126126126, + "loss": 0.3861, + "step": 4059 + }, + { + "epoch": 1.2170507775904067, + "grad_norm": 0.22041113674640656, + "learning_rate": 0.00011756756756756755, + "loss": 0.357, + "step": 4060 + }, + { + "epoch": 1.217350571482106, + "grad_norm": 0.22228474915027618, + "learning_rate": 0.00011752252252252251, + "loss": 0.3738, + "step": 4061 + }, + { + "epoch": 1.2176503653738056, + "grad_norm": 0.22721122205257416, + "learning_rate": 0.00011747747747747746, + "loss": 0.3829, + "step": 4062 + }, + { + "epoch": 1.2179501592655049, + "grad_norm": 0.23477396368980408, + "learning_rate": 0.00011743243243243242, + "loss": 0.405, + "step": 4063 + }, + { + "epoch": 1.2182499531572044, + "grad_norm": 0.22051817178726196, + "learning_rate": 0.00011738738738738738, + "loss": 0.3666, + "step": 4064 + }, + { + "epoch": 1.218549747048904, + "grad_norm": 0.2251734584569931, + "learning_rate": 0.00011734234234234233, + "loss": 0.3843, + "step": 4065 + }, + { + "epoch": 1.2188495409406033, + "grad_norm": 0.223738893866539, + "learning_rate": 0.0001172972972972973, + "loss": 0.3734, + "step": 4066 + }, + { + "epoch": 1.2191493348323028, + "grad_norm": 0.2110142558813095, + "learning_rate": 0.00011725225225225224, + "loss": 0.3412, + "step": 4067 + }, + { + "epoch": 1.2194491287240021, + "grad_norm": 0.2286280393600464, + "learning_rate": 0.0001172072072072072, + "loss": 0.3668, + "step": 4068 + }, + { + "epoch": 1.2197489226157017, + "grad_norm": 0.22667136788368225, + "learning_rate": 0.00011716216216216216, + "loss": 0.3575, + "step": 4069 + }, + { + "epoch": 1.2200487165074012, + "grad_norm": 0.2305767983198166, + "learning_rate": 0.0001171171171171171, + "loss": 0.3581, + "step": 4070 + }, + { + "epoch": 1.2203485103991005, + "grad_norm": 0.2244352102279663, + "learning_rate": 0.00011707207207207206, + "loss": 0.3569, + "step": 4071 + }, + { + "epoch": 1.2206483042908, + "grad_norm": 0.2392815798521042, + "learning_rate": 0.00011702702702702702, + "loss": 0.3671, + "step": 4072 + }, + { + "epoch": 1.2209480981824996, + "grad_norm": 0.2159206122159958, + "learning_rate": 0.00011698198198198196, + "loss": 0.3403, + "step": 4073 + }, + { + "epoch": 1.221247892074199, + "grad_norm": 0.21920053660869598, + "learning_rate": 0.00011693693693693692, + "loss": 0.3549, + "step": 4074 + }, + { + "epoch": 1.2215476859658985, + "grad_norm": 0.2131916582584381, + "learning_rate": 0.00011689189189189189, + "loss": 0.337, + "step": 4075 + }, + { + "epoch": 1.221847479857598, + "grad_norm": 0.22338977456092834, + "learning_rate": 0.00011684684684684684, + "loss": 0.3591, + "step": 4076 + }, + { + "epoch": 1.2221472737492973, + "grad_norm": 0.22058121860027313, + "learning_rate": 0.00011680180180180179, + "loss": 0.3529, + "step": 4077 + }, + { + "epoch": 1.2224470676409969, + "grad_norm": 0.2266795039176941, + "learning_rate": 0.00011675675675675675, + "loss": 0.372, + "step": 4078 + }, + { + "epoch": 1.2227468615326962, + "grad_norm": 0.23163147270679474, + "learning_rate": 0.0001167117117117117, + "loss": 0.3737, + "step": 4079 + }, + { + "epoch": 1.2230466554243957, + "grad_norm": 0.22143979370594025, + "learning_rate": 0.00011666666666666665, + "loss": 0.3743, + "step": 4080 + }, + { + "epoch": 1.2233464493160953, + "grad_norm": 0.21844099462032318, + "learning_rate": 0.00011662162162162162, + "loss": 0.3538, + "step": 4081 + }, + { + "epoch": 1.2236462432077946, + "grad_norm": 0.22864669561386108, + "learning_rate": 0.00011657657657657657, + "loss": 0.3325, + "step": 4082 + }, + { + "epoch": 1.2239460370994941, + "grad_norm": 0.22677049040794373, + "learning_rate": 0.00011653153153153153, + "loss": 0.3708, + "step": 4083 + }, + { + "epoch": 1.2242458309911934, + "grad_norm": 0.2209145426750183, + "learning_rate": 0.00011648648648648647, + "loss": 0.3529, + "step": 4084 + }, + { + "epoch": 1.224545624882893, + "grad_norm": 0.22299690544605255, + "learning_rate": 0.00011644144144144143, + "loss": 0.3471, + "step": 4085 + }, + { + "epoch": 1.2248454187745925, + "grad_norm": 0.2258932739496231, + "learning_rate": 0.0001163963963963964, + "loss": 0.3729, + "step": 4086 + }, + { + "epoch": 1.2251452126662918, + "grad_norm": 0.21733281016349792, + "learning_rate": 0.00011635135135135133, + "loss": 0.3464, + "step": 4087 + }, + { + "epoch": 1.2254450065579914, + "grad_norm": 0.2246559113264084, + "learning_rate": 0.0001163063063063063, + "loss": 0.3618, + "step": 4088 + }, + { + "epoch": 1.225744800449691, + "grad_norm": 0.2373102605342865, + "learning_rate": 0.00011626126126126126, + "loss": 0.3695, + "step": 4089 + }, + { + "epoch": 1.2260445943413902, + "grad_norm": 0.21920005977153778, + "learning_rate": 0.0001162162162162162, + "loss": 0.3697, + "step": 4090 + }, + { + "epoch": 1.2263443882330898, + "grad_norm": 0.22167542576789856, + "learning_rate": 0.00011617117117117116, + "loss": 0.3613, + "step": 4091 + }, + { + "epoch": 1.2266441821247893, + "grad_norm": 0.21665841341018677, + "learning_rate": 0.00011612612612612612, + "loss": 0.36, + "step": 4092 + }, + { + "epoch": 1.2269439760164886, + "grad_norm": 0.22142474353313446, + "learning_rate": 0.00011608108108108107, + "loss": 0.3577, + "step": 4093 + }, + { + "epoch": 1.2272437699081882, + "grad_norm": 0.21807880699634552, + "learning_rate": 0.00011603603603603602, + "loss": 0.3598, + "step": 4094 + }, + { + "epoch": 1.2275435637998875, + "grad_norm": 0.21711617708206177, + "learning_rate": 0.00011599099099099097, + "loss": 0.3609, + "step": 4095 + }, + { + "epoch": 1.227843357691587, + "grad_norm": 0.2247924506664276, + "learning_rate": 0.00011594594594594594, + "loss": 0.3485, + "step": 4096 + }, + { + "epoch": 1.2281431515832866, + "grad_norm": 0.22750286757946014, + "learning_rate": 0.00011590090090090089, + "loss": 0.3669, + "step": 4097 + }, + { + "epoch": 1.228442945474986, + "grad_norm": 0.21605989336967468, + "learning_rate": 0.00011585585585585584, + "loss": 0.3658, + "step": 4098 + }, + { + "epoch": 1.2287427393666854, + "grad_norm": 0.217095747590065, + "learning_rate": 0.0001158108108108108, + "loss": 0.3469, + "step": 4099 + }, + { + "epoch": 1.2290425332583848, + "grad_norm": 0.22495174407958984, + "learning_rate": 0.00011576576576576577, + "loss": 0.3741, + "step": 4100 + }, + { + "epoch": 1.2293423271500843, + "grad_norm": 0.21024355292320251, + "learning_rate": 0.0001157207207207207, + "loss": 0.3494, + "step": 4101 + }, + { + "epoch": 1.2296421210417838, + "grad_norm": 0.22852125763893127, + "learning_rate": 0.00011567567567567567, + "loss": 0.3852, + "step": 4102 + }, + { + "epoch": 1.2299419149334831, + "grad_norm": 0.22062285244464874, + "learning_rate": 0.00011563063063063063, + "loss": 0.3512, + "step": 4103 + }, + { + "epoch": 1.2302417088251827, + "grad_norm": 0.21184411644935608, + "learning_rate": 0.00011558558558558557, + "loss": 0.359, + "step": 4104 + }, + { + "epoch": 1.2305415027168822, + "grad_norm": 0.21569395065307617, + "learning_rate": 0.00011554054054054053, + "loss": 0.3636, + "step": 4105 + }, + { + "epoch": 1.2308412966085815, + "grad_norm": 0.22727784514427185, + "learning_rate": 0.0001154954954954955, + "loss": 0.3616, + "step": 4106 + }, + { + "epoch": 1.231141090500281, + "grad_norm": 0.2448568046092987, + "learning_rate": 0.00011545045045045043, + "loss": 0.3546, + "step": 4107 + }, + { + "epoch": 1.2314408843919806, + "grad_norm": 0.2317483127117157, + "learning_rate": 0.0001154054054054054, + "loss": 0.3619, + "step": 4108 + }, + { + "epoch": 1.23174067828368, + "grad_norm": 0.22012370824813843, + "learning_rate": 0.00011536036036036035, + "loss": 0.3571, + "step": 4109 + }, + { + "epoch": 1.2320404721753795, + "grad_norm": 0.2250751107931137, + "learning_rate": 0.00011531531531531531, + "loss": 0.3697, + "step": 4110 + }, + { + "epoch": 1.2323402660670788, + "grad_norm": 0.2103440761566162, + "learning_rate": 0.00011527027027027026, + "loss": 0.3539, + "step": 4111 + }, + { + "epoch": 1.2326400599587783, + "grad_norm": 0.22503060102462769, + "learning_rate": 0.00011522522522522521, + "loss": 0.3778, + "step": 4112 + }, + { + "epoch": 1.2329398538504779, + "grad_norm": 0.22393330931663513, + "learning_rate": 0.00011518018018018018, + "loss": 0.3791, + "step": 4113 + }, + { + "epoch": 1.2332396477421772, + "grad_norm": 0.21146221458911896, + "learning_rate": 0.00011513513513513513, + "loss": 0.3353, + "step": 4114 + }, + { + "epoch": 1.2335394416338767, + "grad_norm": 0.23022443056106567, + "learning_rate": 0.00011509009009009008, + "loss": 0.369, + "step": 4115 + }, + { + "epoch": 1.233839235525576, + "grad_norm": 0.24283228814601898, + "learning_rate": 0.00011504504504504504, + "loss": 0.3553, + "step": 4116 + }, + { + "epoch": 1.2341390294172756, + "grad_norm": 0.2372848093509674, + "learning_rate": 0.000115, + "loss": 0.3657, + "step": 4117 + }, + { + "epoch": 1.2344388233089751, + "grad_norm": 0.22817353904247284, + "learning_rate": 0.00011495495495495494, + "loss": 0.3542, + "step": 4118 + }, + { + "epoch": 1.2347386172006745, + "grad_norm": 0.21485529839992523, + "learning_rate": 0.0001149099099099099, + "loss": 0.3315, + "step": 4119 + }, + { + "epoch": 1.235038411092374, + "grad_norm": 0.2125268578529358, + "learning_rate": 0.00011486486486486484, + "loss": 0.367, + "step": 4120 + }, + { + "epoch": 1.2353382049840735, + "grad_norm": 0.2177802473306656, + "learning_rate": 0.0001148198198198198, + "loss": 0.3667, + "step": 4121 + }, + { + "epoch": 1.2356379988757729, + "grad_norm": 0.22439855337142944, + "learning_rate": 0.00011477477477477477, + "loss": 0.3595, + "step": 4122 + }, + { + "epoch": 1.2359377927674724, + "grad_norm": 0.22252248227596283, + "learning_rate": 0.00011472972972972972, + "loss": 0.3689, + "step": 4123 + }, + { + "epoch": 1.236237586659172, + "grad_norm": 0.21893781423568726, + "learning_rate": 0.00011468468468468467, + "loss": 0.3363, + "step": 4124 + }, + { + "epoch": 1.2365373805508713, + "grad_norm": 0.20217002928256989, + "learning_rate": 0.00011463963963963963, + "loss": 0.3522, + "step": 4125 + }, + { + "epoch": 1.2368371744425708, + "grad_norm": 0.2127171903848648, + "learning_rate": 0.00011459459459459458, + "loss": 0.345, + "step": 4126 + }, + { + "epoch": 1.2371369683342701, + "grad_norm": 0.2248011827468872, + "learning_rate": 0.00011454954954954955, + "loss": 0.3326, + "step": 4127 + }, + { + "epoch": 1.2374367622259697, + "grad_norm": 0.22507144510746002, + "learning_rate": 0.0001145045045045045, + "loss": 0.3728, + "step": 4128 + }, + { + "epoch": 1.2377365561176692, + "grad_norm": 0.22254809737205505, + "learning_rate": 0.00011445945945945945, + "loss": 0.3707, + "step": 4129 + }, + { + "epoch": 1.2380363500093685, + "grad_norm": 0.2163870483636856, + "learning_rate": 0.00011441441441441441, + "loss": 0.3686, + "step": 4130 + }, + { + "epoch": 1.238336143901068, + "grad_norm": 0.23733799159526825, + "learning_rate": 0.00011436936936936935, + "loss": 0.3502, + "step": 4131 + }, + { + "epoch": 1.2386359377927674, + "grad_norm": 0.22752176225185394, + "learning_rate": 0.00011432432432432431, + "loss": 0.3817, + "step": 4132 + }, + { + "epoch": 1.238935731684467, + "grad_norm": 0.2311381846666336, + "learning_rate": 0.00011427927927927928, + "loss": 0.3544, + "step": 4133 + }, + { + "epoch": 1.2392355255761665, + "grad_norm": 0.21874240040779114, + "learning_rate": 0.00011423423423423421, + "loss": 0.3665, + "step": 4134 + }, + { + "epoch": 1.2395353194678658, + "grad_norm": 0.21653451025485992, + "learning_rate": 0.00011418918918918918, + "loss": 0.354, + "step": 4135 + }, + { + "epoch": 1.2398351133595653, + "grad_norm": 0.21141435205936432, + "learning_rate": 0.00011414414414414414, + "loss": 0.34, + "step": 4136 + }, + { + "epoch": 1.2401349072512649, + "grad_norm": 0.21471178531646729, + "learning_rate": 0.00011409909909909908, + "loss": 0.343, + "step": 4137 + }, + { + "epoch": 1.2404347011429642, + "grad_norm": 0.22427885234355927, + "learning_rate": 0.00011405405405405404, + "loss": 0.3719, + "step": 4138 + }, + { + "epoch": 1.2407344950346637, + "grad_norm": 0.21209987998008728, + "learning_rate": 0.000114009009009009, + "loss": 0.3724, + "step": 4139 + }, + { + "epoch": 1.2410342889263632, + "grad_norm": 0.2354046106338501, + "learning_rate": 0.00011396396396396396, + "loss": 0.3735, + "step": 4140 + }, + { + "epoch": 1.2413340828180626, + "grad_norm": 0.21986010670661926, + "learning_rate": 0.0001139189189189189, + "loss": 0.3605, + "step": 4141 + }, + { + "epoch": 1.241633876709762, + "grad_norm": 0.2356344759464264, + "learning_rate": 0.00011387387387387387, + "loss": 0.3589, + "step": 4142 + }, + { + "epoch": 1.2419336706014614, + "grad_norm": 0.2403137981891632, + "learning_rate": 0.00011382882882882882, + "loss": 0.342, + "step": 4143 + }, + { + "epoch": 1.242233464493161, + "grad_norm": 0.22544018924236298, + "learning_rate": 0.00011378378378378378, + "loss": 0.3639, + "step": 4144 + }, + { + "epoch": 1.2425332583848605, + "grad_norm": 0.21682587265968323, + "learning_rate": 0.00011373873873873872, + "loss": 0.3466, + "step": 4145 + }, + { + "epoch": 1.2428330522765598, + "grad_norm": 0.21896804869174957, + "learning_rate": 0.00011369369369369368, + "loss": 0.36, + "step": 4146 + }, + { + "epoch": 1.2431328461682594, + "grad_norm": 0.2302297204732895, + "learning_rate": 0.00011364864864864865, + "loss": 0.3799, + "step": 4147 + }, + { + "epoch": 1.2434326400599587, + "grad_norm": 0.23035024106502533, + "learning_rate": 0.00011360360360360359, + "loss": 0.3488, + "step": 4148 + }, + { + "epoch": 1.2437324339516582, + "grad_norm": 0.21760596334934235, + "learning_rate": 0.00011355855855855855, + "loss": 0.3612, + "step": 4149 + }, + { + "epoch": 1.2440322278433578, + "grad_norm": 0.22977299988269806, + "learning_rate": 0.00011351351351351351, + "loss": 0.3621, + "step": 4150 + }, + { + "epoch": 1.244332021735057, + "grad_norm": 0.22122150659561157, + "learning_rate": 0.00011346846846846845, + "loss": 0.3574, + "step": 4151 + }, + { + "epoch": 1.2446318156267566, + "grad_norm": 0.21653127670288086, + "learning_rate": 0.00011342342342342341, + "loss": 0.3341, + "step": 4152 + }, + { + "epoch": 1.244931609518456, + "grad_norm": 0.20750166475772858, + "learning_rate": 0.00011337837837837838, + "loss": 0.3352, + "step": 4153 + }, + { + "epoch": 1.2452314034101555, + "grad_norm": 0.2003934681415558, + "learning_rate": 0.00011333333333333331, + "loss": 0.3224, + "step": 4154 + }, + { + "epoch": 1.245531197301855, + "grad_norm": 0.21856017410755157, + "learning_rate": 0.00011328828828828828, + "loss": 0.351, + "step": 4155 + }, + { + "epoch": 1.2458309911935543, + "grad_norm": 0.21926572918891907, + "learning_rate": 0.00011324324324324323, + "loss": 0.3587, + "step": 4156 + }, + { + "epoch": 1.2461307850852539, + "grad_norm": 0.2154683619737625, + "learning_rate": 0.00011319819819819819, + "loss": 0.3571, + "step": 4157 + }, + { + "epoch": 1.2464305789769534, + "grad_norm": 0.20265419781208038, + "learning_rate": 0.00011315315315315314, + "loss": 0.3338, + "step": 4158 + }, + { + "epoch": 1.2467303728686527, + "grad_norm": 0.22431926429271698, + "learning_rate": 0.00011310810810810809, + "loss": 0.3604, + "step": 4159 + }, + { + "epoch": 1.2470301667603523, + "grad_norm": 0.2245953381061554, + "learning_rate": 0.00011306306306306306, + "loss": 0.3575, + "step": 4160 + }, + { + "epoch": 1.2473299606520518, + "grad_norm": 0.2239590287208557, + "learning_rate": 0.00011301801801801801, + "loss": 0.3487, + "step": 4161 + }, + { + "epoch": 1.2476297545437511, + "grad_norm": 0.22053176164627075, + "learning_rate": 0.00011297297297297296, + "loss": 0.3493, + "step": 4162 + }, + { + "epoch": 1.2479295484354507, + "grad_norm": 0.22241485118865967, + "learning_rate": 0.00011292792792792792, + "loss": 0.3605, + "step": 4163 + }, + { + "epoch": 1.24822934232715, + "grad_norm": 0.22865022718906403, + "learning_rate": 0.00011288288288288289, + "loss": 0.3591, + "step": 4164 + }, + { + "epoch": 1.2485291362188495, + "grad_norm": 0.2038031369447708, + "learning_rate": 0.00011283783783783782, + "loss": 0.3484, + "step": 4165 + }, + { + "epoch": 1.248828930110549, + "grad_norm": 0.2149299681186676, + "learning_rate": 0.00011279279279279279, + "loss": 0.3583, + "step": 4166 + }, + { + "epoch": 1.2491287240022484, + "grad_norm": 0.22604189813137054, + "learning_rate": 0.00011274774774774774, + "loss": 0.375, + "step": 4167 + }, + { + "epoch": 1.249428517893948, + "grad_norm": 0.23565004765987396, + "learning_rate": 0.00011270270270270269, + "loss": 0.3691, + "step": 4168 + }, + { + "epoch": 1.2497283117856473, + "grad_norm": 0.21931523084640503, + "learning_rate": 0.00011265765765765765, + "loss": 0.3667, + "step": 4169 + }, + { + "epoch": 1.2500281056773468, + "grad_norm": 0.22204811871051788, + "learning_rate": 0.0001126126126126126, + "loss": 0.357, + "step": 4170 + }, + { + "epoch": 1.2503278995690463, + "grad_norm": 0.21145589649677277, + "learning_rate": 0.00011256756756756755, + "loss": 0.3445, + "step": 4171 + }, + { + "epoch": 1.2506276934607459, + "grad_norm": 0.22172702848911285, + "learning_rate": 0.00011252252252252251, + "loss": 0.3684, + "step": 4172 + }, + { + "epoch": 1.2509274873524452, + "grad_norm": 0.22264516353607178, + "learning_rate": 0.00011247747747747747, + "loss": 0.3675, + "step": 4173 + }, + { + "epoch": 1.2512272812441447, + "grad_norm": 0.2182130068540573, + "learning_rate": 0.00011243243243243243, + "loss": 0.3688, + "step": 4174 + }, + { + "epoch": 1.251527075135844, + "grad_norm": 0.20346961915493011, + "learning_rate": 0.00011238738738738738, + "loss": 0.3342, + "step": 4175 + }, + { + "epoch": 1.2518268690275436, + "grad_norm": 0.2261209338903427, + "learning_rate": 0.00011234234234234233, + "loss": 0.3894, + "step": 4176 + }, + { + "epoch": 1.2521266629192431, + "grad_norm": 0.22320590913295746, + "learning_rate": 0.0001122972972972973, + "loss": 0.35, + "step": 4177 + }, + { + "epoch": 1.2524264568109424, + "grad_norm": 0.22703339159488678, + "learning_rate": 0.00011225225225225224, + "loss": 0.3718, + "step": 4178 + }, + { + "epoch": 1.252726250702642, + "grad_norm": 0.21656812727451324, + "learning_rate": 0.0001122072072072072, + "loss": 0.3573, + "step": 4179 + }, + { + "epoch": 1.2530260445943413, + "grad_norm": 0.21183227002620697, + "learning_rate": 0.00011216216216216216, + "loss": 0.3774, + "step": 4180 + }, + { + "epoch": 1.2533258384860408, + "grad_norm": 0.21337759494781494, + "learning_rate": 0.0001121171171171171, + "loss": 0.3614, + "step": 4181 + }, + { + "epoch": 1.2536256323777404, + "grad_norm": 0.2244948446750641, + "learning_rate": 0.00011207207207207206, + "loss": 0.3562, + "step": 4182 + }, + { + "epoch": 1.2539254262694397, + "grad_norm": 0.20882092416286469, + "learning_rate": 0.00011202702702702702, + "loss": 0.3456, + "step": 4183 + }, + { + "epoch": 1.2542252201611392, + "grad_norm": 0.2338513433933258, + "learning_rate": 0.00011198198198198197, + "loss": 0.3714, + "step": 4184 + }, + { + "epoch": 1.2545250140528386, + "grad_norm": 0.20936037600040436, + "learning_rate": 0.00011193693693693692, + "loss": 0.3447, + "step": 4185 + }, + { + "epoch": 1.254824807944538, + "grad_norm": 0.217252716422081, + "learning_rate": 0.00011189189189189189, + "loss": 0.3592, + "step": 4186 + }, + { + "epoch": 1.2551246018362376, + "grad_norm": 0.22383899986743927, + "learning_rate": 0.00011184684684684684, + "loss": 0.3537, + "step": 4187 + }, + { + "epoch": 1.2554243957279372, + "grad_norm": 0.22584226727485657, + "learning_rate": 0.00011180180180180179, + "loss": 0.3615, + "step": 4188 + }, + { + "epoch": 1.2557241896196365, + "grad_norm": 0.2147902399301529, + "learning_rate": 0.00011175675675675675, + "loss": 0.3726, + "step": 4189 + }, + { + "epoch": 1.256023983511336, + "grad_norm": 0.23081621527671814, + "learning_rate": 0.0001117117117117117, + "loss": 0.3909, + "step": 4190 + }, + { + "epoch": 1.2563237774030354, + "grad_norm": 0.21398404240608215, + "learning_rate": 0.00011166666666666667, + "loss": 0.3763, + "step": 4191 + }, + { + "epoch": 1.256623571294735, + "grad_norm": 0.20779982209205627, + "learning_rate": 0.0001116216216216216, + "loss": 0.3367, + "step": 4192 + }, + { + "epoch": 1.2569233651864344, + "grad_norm": 0.2094753086566925, + "learning_rate": 0.00011157657657657657, + "loss": 0.3401, + "step": 4193 + }, + { + "epoch": 1.2572231590781338, + "grad_norm": 0.21873033046722412, + "learning_rate": 0.00011153153153153153, + "loss": 0.3735, + "step": 4194 + }, + { + "epoch": 1.2575229529698333, + "grad_norm": 0.21792501211166382, + "learning_rate": 0.00011148648648648647, + "loss": 0.3689, + "step": 4195 + }, + { + "epoch": 1.2578227468615326, + "grad_norm": 0.21120500564575195, + "learning_rate": 0.00011144144144144143, + "loss": 0.3443, + "step": 4196 + }, + { + "epoch": 1.2581225407532322, + "grad_norm": 0.22521494328975677, + "learning_rate": 0.0001113963963963964, + "loss": 0.3584, + "step": 4197 + }, + { + "epoch": 1.2584223346449317, + "grad_norm": 0.2140861600637436, + "learning_rate": 0.00011135135135135133, + "loss": 0.3599, + "step": 4198 + }, + { + "epoch": 1.258722128536631, + "grad_norm": 0.21226727962493896, + "learning_rate": 0.0001113063063063063, + "loss": 0.3673, + "step": 4199 + }, + { + "epoch": 1.2590219224283306, + "grad_norm": 0.22516387701034546, + "learning_rate": 0.00011126126126126126, + "loss": 0.3678, + "step": 4200 + }, + { + "epoch": 1.2593217163200299, + "grad_norm": 0.22374635934829712, + "learning_rate": 0.0001112162162162162, + "loss": 0.3548, + "step": 4201 + }, + { + "epoch": 1.2596215102117294, + "grad_norm": 0.2221246212720871, + "learning_rate": 0.00011117117117117116, + "loss": 0.3745, + "step": 4202 + }, + { + "epoch": 1.259921304103429, + "grad_norm": 0.22684535384178162, + "learning_rate": 0.00011112612612612611, + "loss": 0.3557, + "step": 4203 + }, + { + "epoch": 1.2602210979951283, + "grad_norm": 0.22062942385673523, + "learning_rate": 0.00011108108108108107, + "loss": 0.3814, + "step": 4204 + }, + { + "epoch": 1.2605208918868278, + "grad_norm": 0.21318332850933075, + "learning_rate": 0.00011103603603603602, + "loss": 0.3628, + "step": 4205 + }, + { + "epoch": 1.2608206857785271, + "grad_norm": 0.20761235058307648, + "learning_rate": 0.00011099099099099097, + "loss": 0.3486, + "step": 4206 + }, + { + "epoch": 1.2611204796702267, + "grad_norm": 0.2155384123325348, + "learning_rate": 0.00011094594594594594, + "loss": 0.364, + "step": 4207 + }, + { + "epoch": 1.2614202735619262, + "grad_norm": 0.2154616266489029, + "learning_rate": 0.0001109009009009009, + "loss": 0.3583, + "step": 4208 + }, + { + "epoch": 1.2617200674536257, + "grad_norm": 0.22318878769874573, + "learning_rate": 0.00011085585585585584, + "loss": 0.3677, + "step": 4209 + }, + { + "epoch": 1.262019861345325, + "grad_norm": 0.21294383704662323, + "learning_rate": 0.0001108108108108108, + "loss": 0.3745, + "step": 4210 + }, + { + "epoch": 1.2623196552370246, + "grad_norm": 0.2227228581905365, + "learning_rate": 0.00011076576576576577, + "loss": 0.3825, + "step": 4211 + }, + { + "epoch": 1.262619449128724, + "grad_norm": 0.21078257262706757, + "learning_rate": 0.0001107207207207207, + "loss": 0.3403, + "step": 4212 + }, + { + "epoch": 1.2629192430204235, + "grad_norm": 0.23595395684242249, + "learning_rate": 0.00011067567567567567, + "loss": 0.3718, + "step": 4213 + }, + { + "epoch": 1.263219036912123, + "grad_norm": 0.22779153287410736, + "learning_rate": 0.00011063063063063063, + "loss": 0.3546, + "step": 4214 + }, + { + "epoch": 1.2635188308038223, + "grad_norm": 0.21909108757972717, + "learning_rate": 0.00011058558558558557, + "loss": 0.3701, + "step": 4215 + }, + { + "epoch": 1.2638186246955219, + "grad_norm": 0.22383159399032593, + "learning_rate": 0.00011054054054054053, + "loss": 0.3735, + "step": 4216 + }, + { + "epoch": 1.2641184185872212, + "grad_norm": 0.22993333637714386, + "learning_rate": 0.00011049549549549548, + "loss": 0.3859, + "step": 4217 + }, + { + "epoch": 1.2644182124789207, + "grad_norm": 0.22387270629405975, + "learning_rate": 0.00011045045045045043, + "loss": 0.3517, + "step": 4218 + }, + { + "epoch": 1.2647180063706203, + "grad_norm": 0.22424739599227905, + "learning_rate": 0.0001104054054054054, + "loss": 0.3758, + "step": 4219 + }, + { + "epoch": 1.2650178002623196, + "grad_norm": 0.21295391023159027, + "learning_rate": 0.00011036036036036035, + "loss": 0.3594, + "step": 4220 + }, + { + "epoch": 1.2653175941540191, + "grad_norm": 0.2224559783935547, + "learning_rate": 0.00011031531531531531, + "loss": 0.368, + "step": 4221 + }, + { + "epoch": 1.2656173880457184, + "grad_norm": 0.22176241874694824, + "learning_rate": 0.00011027027027027026, + "loss": 0.3644, + "step": 4222 + }, + { + "epoch": 1.265917181937418, + "grad_norm": 0.2132066935300827, + "learning_rate": 0.00011022522522522521, + "loss": 0.3545, + "step": 4223 + }, + { + "epoch": 1.2662169758291175, + "grad_norm": 0.2125639170408249, + "learning_rate": 0.00011018018018018018, + "loss": 0.3694, + "step": 4224 + }, + { + "epoch": 1.266516769720817, + "grad_norm": 0.21990853548049927, + "learning_rate": 0.00011013513513513514, + "loss": 0.3772, + "step": 4225 + }, + { + "epoch": 1.2668165636125164, + "grad_norm": 0.20371896028518677, + "learning_rate": 0.00011009009009009008, + "loss": 0.3463, + "step": 4226 + }, + { + "epoch": 1.267116357504216, + "grad_norm": 0.21950723230838776, + "learning_rate": 0.00011004504504504504, + "loss": 0.3609, + "step": 4227 + }, + { + "epoch": 1.2674161513959152, + "grad_norm": 0.20541849732398987, + "learning_rate": 0.00010999999999999998, + "loss": 0.3411, + "step": 4228 + }, + { + "epoch": 1.2677159452876148, + "grad_norm": 0.2173074632883072, + "learning_rate": 0.00010995495495495494, + "loss": 0.3694, + "step": 4229 + }, + { + "epoch": 1.2680157391793143, + "grad_norm": 0.21789297461509705, + "learning_rate": 0.0001099099099099099, + "loss": 0.3732, + "step": 4230 + }, + { + "epoch": 1.2683155330710136, + "grad_norm": 0.21867066621780396, + "learning_rate": 0.00010986486486486485, + "loss": 0.371, + "step": 4231 + }, + { + "epoch": 1.2686153269627132, + "grad_norm": 0.2214251011610031, + "learning_rate": 0.0001098198198198198, + "loss": 0.3645, + "step": 4232 + }, + { + "epoch": 1.2689151208544125, + "grad_norm": 0.21950241923332214, + "learning_rate": 0.00010977477477477477, + "loss": 0.3584, + "step": 4233 + }, + { + "epoch": 1.269214914746112, + "grad_norm": 0.21484404802322388, + "learning_rate": 0.00010972972972972972, + "loss": 0.3451, + "step": 4234 + }, + { + "epoch": 1.2695147086378116, + "grad_norm": 0.21472983062267303, + "learning_rate": 0.00010968468468468467, + "loss": 0.3334, + "step": 4235 + }, + { + "epoch": 1.269814502529511, + "grad_norm": 0.224507138133049, + "learning_rate": 0.00010963963963963963, + "loss": 0.3746, + "step": 4236 + }, + { + "epoch": 1.2701142964212104, + "grad_norm": 0.22031918168067932, + "learning_rate": 0.00010959459459459458, + "loss": 0.3401, + "step": 4237 + }, + { + "epoch": 1.2704140903129097, + "grad_norm": 0.22102873027324677, + "learning_rate": 0.00010954954954954955, + "loss": 0.3778, + "step": 4238 + }, + { + "epoch": 1.2707138842046093, + "grad_norm": 0.2392221838235855, + "learning_rate": 0.00010950450450450448, + "loss": 0.3587, + "step": 4239 + }, + { + "epoch": 1.2710136780963088, + "grad_norm": 0.2183891385793686, + "learning_rate": 0.00010945945945945945, + "loss": 0.3407, + "step": 4240 + }, + { + "epoch": 1.2713134719880084, + "grad_norm": 0.22965964674949646, + "learning_rate": 0.00010941441441441441, + "loss": 0.3641, + "step": 4241 + }, + { + "epoch": 1.2716132658797077, + "grad_norm": 0.22997382283210754, + "learning_rate": 0.00010936936936936935, + "loss": 0.3698, + "step": 4242 + }, + { + "epoch": 1.2719130597714072, + "grad_norm": 0.2099880427122116, + "learning_rate": 0.00010932432432432431, + "loss": 0.3441, + "step": 4243 + }, + { + "epoch": 1.2722128536631065, + "grad_norm": 0.2355547547340393, + "learning_rate": 0.00010927927927927928, + "loss": 0.3905, + "step": 4244 + }, + { + "epoch": 1.272512647554806, + "grad_norm": 0.2386329025030136, + "learning_rate": 0.00010923423423423421, + "loss": 0.3716, + "step": 4245 + }, + { + "epoch": 1.2728124414465056, + "grad_norm": 0.21616117656230927, + "learning_rate": 0.00010918918918918918, + "loss": 0.3542, + "step": 4246 + }, + { + "epoch": 1.273112235338205, + "grad_norm": 0.22752898931503296, + "learning_rate": 0.00010914414414414414, + "loss": 0.3713, + "step": 4247 + }, + { + "epoch": 1.2734120292299045, + "grad_norm": 0.21322378516197205, + "learning_rate": 0.00010909909909909909, + "loss": 0.3541, + "step": 4248 + }, + { + "epoch": 1.2737118231216038, + "grad_norm": 0.22594040632247925, + "learning_rate": 0.00010905405405405404, + "loss": 0.3702, + "step": 4249 + }, + { + "epoch": 1.2740116170133033, + "grad_norm": 0.22437477111816406, + "learning_rate": 0.000109009009009009, + "loss": 0.3849, + "step": 4250 + }, + { + "epoch": 1.2743114109050029, + "grad_norm": 0.20456473529338837, + "learning_rate": 0.00010896396396396396, + "loss": 0.3297, + "step": 4251 + }, + { + "epoch": 1.2746112047967022, + "grad_norm": 0.22588905692100525, + "learning_rate": 0.0001089189189189189, + "loss": 0.3807, + "step": 4252 + }, + { + "epoch": 1.2749109986884017, + "grad_norm": 0.2254696935415268, + "learning_rate": 0.00010887387387387386, + "loss": 0.3501, + "step": 4253 + }, + { + "epoch": 1.275210792580101, + "grad_norm": 0.2138838917016983, + "learning_rate": 0.00010882882882882882, + "loss": 0.3368, + "step": 4254 + }, + { + "epoch": 1.2755105864718006, + "grad_norm": 0.221001535654068, + "learning_rate": 0.00010878378378378378, + "loss": 0.3531, + "step": 4255 + }, + { + "epoch": 1.2758103803635001, + "grad_norm": 0.226884663105011, + "learning_rate": 0.00010873873873873872, + "loss": 0.366, + "step": 4256 + }, + { + "epoch": 1.2761101742551997, + "grad_norm": 0.22693143784999847, + "learning_rate": 0.00010869369369369369, + "loss": 0.3906, + "step": 4257 + }, + { + "epoch": 1.276409968146899, + "grad_norm": 0.22758977115154266, + "learning_rate": 0.00010864864864864865, + "loss": 0.3821, + "step": 4258 + }, + { + "epoch": 1.2767097620385985, + "grad_norm": 0.22596494853496552, + "learning_rate": 0.00010860360360360359, + "loss": 0.3825, + "step": 4259 + }, + { + "epoch": 1.2770095559302979, + "grad_norm": 0.22047007083892822, + "learning_rate": 0.00010855855855855855, + "loss": 0.3412, + "step": 4260 + }, + { + "epoch": 1.2773093498219974, + "grad_norm": 0.22519131004810333, + "learning_rate": 0.00010851351351351351, + "loss": 0.3592, + "step": 4261 + }, + { + "epoch": 1.277609143713697, + "grad_norm": 0.22262904047966003, + "learning_rate": 0.00010846846846846845, + "loss": 0.3766, + "step": 4262 + }, + { + "epoch": 1.2779089376053963, + "grad_norm": 0.23117446899414062, + "learning_rate": 0.00010842342342342341, + "loss": 0.3897, + "step": 4263 + }, + { + "epoch": 1.2782087314970958, + "grad_norm": 0.21441631019115448, + "learning_rate": 0.00010837837837837836, + "loss": 0.3616, + "step": 4264 + }, + { + "epoch": 1.2785085253887951, + "grad_norm": 0.21647268533706665, + "learning_rate": 0.00010833333333333333, + "loss": 0.3571, + "step": 4265 + }, + { + "epoch": 1.2788083192804947, + "grad_norm": 0.23482079803943634, + "learning_rate": 0.00010828828828828828, + "loss": 0.354, + "step": 4266 + }, + { + "epoch": 1.2791081131721942, + "grad_norm": 0.20919762551784515, + "learning_rate": 0.00010824324324324323, + "loss": 0.3666, + "step": 4267 + }, + { + "epoch": 1.2794079070638935, + "grad_norm": 0.21219047904014587, + "learning_rate": 0.00010819819819819819, + "loss": 0.3602, + "step": 4268 + }, + { + "epoch": 1.279707700955593, + "grad_norm": 0.279299259185791, + "learning_rate": 0.00010815315315315314, + "loss": 0.3867, + "step": 4269 + }, + { + "epoch": 1.2800074948472924, + "grad_norm": 0.2186315357685089, + "learning_rate": 0.0001081081081081081, + "loss": 0.3494, + "step": 4270 + }, + { + "epoch": 1.280307288738992, + "grad_norm": 0.24019218981266022, + "learning_rate": 0.00010806306306306306, + "loss": 0.3813, + "step": 4271 + }, + { + "epoch": 1.2806070826306915, + "grad_norm": 0.2710072696208954, + "learning_rate": 0.00010801801801801802, + "loss": 0.4117, + "step": 4272 + }, + { + "epoch": 1.280906876522391, + "grad_norm": 0.21920080482959747, + "learning_rate": 0.00010797297297297296, + "loss": 0.3701, + "step": 4273 + }, + { + "epoch": 1.2812066704140903, + "grad_norm": 0.2656455934047699, + "learning_rate": 0.00010792792792792792, + "loss": 0.3556, + "step": 4274 + }, + { + "epoch": 1.2815064643057898, + "grad_norm": 0.24394501745700836, + "learning_rate": 0.00010788288288288289, + "loss": 0.3978, + "step": 4275 + }, + { + "epoch": 1.2818062581974892, + "grad_norm": 0.2048931121826172, + "learning_rate": 0.00010783783783783782, + "loss": 0.3491, + "step": 4276 + }, + { + "epoch": 1.2821060520891887, + "grad_norm": 0.25899815559387207, + "learning_rate": 0.00010779279279279279, + "loss": 0.3599, + "step": 4277 + }, + { + "epoch": 1.2824058459808882, + "grad_norm": 0.2182856947183609, + "learning_rate": 0.00010774774774774774, + "loss": 0.3547, + "step": 4278 + }, + { + "epoch": 1.2827056398725876, + "grad_norm": 0.22078947722911835, + "learning_rate": 0.00010770270270270269, + "loss": 0.3822, + "step": 4279 + }, + { + "epoch": 1.283005433764287, + "grad_norm": 0.21624134480953217, + "learning_rate": 0.00010765765765765765, + "loss": 0.3594, + "step": 4280 + }, + { + "epoch": 1.2833052276559864, + "grad_norm": 0.22964727878570557, + "learning_rate": 0.0001076126126126126, + "loss": 0.3908, + "step": 4281 + }, + { + "epoch": 1.283605021547686, + "grad_norm": 0.21670106053352356, + "learning_rate": 0.00010756756756756757, + "loss": 0.3864, + "step": 4282 + }, + { + "epoch": 1.2839048154393855, + "grad_norm": 0.21967041492462158, + "learning_rate": 0.00010752252252252252, + "loss": 0.3639, + "step": 4283 + }, + { + "epoch": 1.2842046093310848, + "grad_norm": 0.20801186561584473, + "learning_rate": 0.00010747747747747747, + "loss": 0.3328, + "step": 4284 + }, + { + "epoch": 1.2845044032227844, + "grad_norm": 0.21895767748355865, + "learning_rate": 0.00010743243243243243, + "loss": 0.3734, + "step": 4285 + }, + { + "epoch": 1.2848041971144837, + "grad_norm": 0.2117338627576828, + "learning_rate": 0.00010738738738738738, + "loss": 0.3443, + "step": 4286 + }, + { + "epoch": 1.2851039910061832, + "grad_norm": 0.22575978934764862, + "learning_rate": 0.00010734234234234233, + "loss": 0.3619, + "step": 4287 + }, + { + "epoch": 1.2854037848978828, + "grad_norm": 0.21902771294116974, + "learning_rate": 0.0001072972972972973, + "loss": 0.3545, + "step": 4288 + }, + { + "epoch": 1.285703578789582, + "grad_norm": 0.22649185359477997, + "learning_rate": 0.00010725225225225223, + "loss": 0.3569, + "step": 4289 + }, + { + "epoch": 1.2860033726812816, + "grad_norm": 0.2184680551290512, + "learning_rate": 0.0001072072072072072, + "loss": 0.3868, + "step": 4290 + }, + { + "epoch": 1.286303166572981, + "grad_norm": 0.2234402894973755, + "learning_rate": 0.00010716216216216216, + "loss": 0.3599, + "step": 4291 + }, + { + "epoch": 1.2866029604646805, + "grad_norm": 0.2185603380203247, + "learning_rate": 0.0001071171171171171, + "loss": 0.3288, + "step": 4292 + }, + { + "epoch": 1.28690275435638, + "grad_norm": 0.216938316822052, + "learning_rate": 0.00010707207207207206, + "loss": 0.3657, + "step": 4293 + }, + { + "epoch": 1.2872025482480796, + "grad_norm": 0.22174493968486786, + "learning_rate": 0.00010702702702702702, + "loss": 0.3486, + "step": 4294 + }, + { + "epoch": 1.2875023421397789, + "grad_norm": 0.23581112921237946, + "learning_rate": 0.00010698198198198197, + "loss": 0.3934, + "step": 4295 + }, + { + "epoch": 1.2878021360314784, + "grad_norm": 0.2387666404247284, + "learning_rate": 0.00010693693693693692, + "loss": 0.3561, + "step": 4296 + }, + { + "epoch": 1.2881019299231777, + "grad_norm": 0.21351338922977448, + "learning_rate": 0.00010689189189189189, + "loss": 0.3673, + "step": 4297 + }, + { + "epoch": 1.2884017238148773, + "grad_norm": 0.22528427839279175, + "learning_rate": 0.00010684684684684684, + "loss": 0.3493, + "step": 4298 + }, + { + "epoch": 1.2887015177065768, + "grad_norm": 0.22709199786186218, + "learning_rate": 0.0001068018018018018, + "loss": 0.3645, + "step": 4299 + }, + { + "epoch": 1.2890013115982761, + "grad_norm": 0.2260282188653946, + "learning_rate": 0.00010675675675675674, + "loss": 0.3904, + "step": 4300 + }, + { + "epoch": 1.2893011054899757, + "grad_norm": 0.23056663572788239, + "learning_rate": 0.0001067117117117117, + "loss": 0.3642, + "step": 4301 + }, + { + "epoch": 1.289600899381675, + "grad_norm": 0.21464140713214874, + "learning_rate": 0.00010666666666666667, + "loss": 0.364, + "step": 4302 + }, + { + "epoch": 1.2899006932733745, + "grad_norm": 0.20631635189056396, + "learning_rate": 0.0001066216216216216, + "loss": 0.3576, + "step": 4303 + }, + { + "epoch": 1.290200487165074, + "grad_norm": 0.22520391643047333, + "learning_rate": 0.00010657657657657657, + "loss": 0.3706, + "step": 4304 + }, + { + "epoch": 1.2905002810567734, + "grad_norm": 0.2366526871919632, + "learning_rate": 0.00010653153153153153, + "loss": 0.3667, + "step": 4305 + }, + { + "epoch": 1.290800074948473, + "grad_norm": 0.21495036780834198, + "learning_rate": 0.00010648648648648647, + "loss": 0.3808, + "step": 4306 + }, + { + "epoch": 1.2910998688401722, + "grad_norm": 0.2272660732269287, + "learning_rate": 0.00010644144144144143, + "loss": 0.3683, + "step": 4307 + }, + { + "epoch": 1.2913996627318718, + "grad_norm": 0.22838729619979858, + "learning_rate": 0.0001063963963963964, + "loss": 0.3537, + "step": 4308 + }, + { + "epoch": 1.2916994566235713, + "grad_norm": 0.20135217905044556, + "learning_rate": 0.00010635135135135133, + "loss": 0.3258, + "step": 4309 + }, + { + "epoch": 1.2919992505152709, + "grad_norm": 0.22161005437374115, + "learning_rate": 0.0001063063063063063, + "loss": 0.3766, + "step": 4310 + }, + { + "epoch": 1.2922990444069702, + "grad_norm": 0.2209654301404953, + "learning_rate": 0.00010626126126126126, + "loss": 0.3558, + "step": 4311 + }, + { + "epoch": 1.2925988382986697, + "grad_norm": 0.21990704536437988, + "learning_rate": 0.00010621621621621621, + "loss": 0.3503, + "step": 4312 + }, + { + "epoch": 1.292898632190369, + "grad_norm": 0.25278615951538086, + "learning_rate": 0.00010617117117117116, + "loss": 0.3738, + "step": 4313 + }, + { + "epoch": 1.2931984260820686, + "grad_norm": 0.22431500256061554, + "learning_rate": 0.00010612612612612611, + "loss": 0.3625, + "step": 4314 + }, + { + "epoch": 1.2934982199737681, + "grad_norm": 0.20802560448646545, + "learning_rate": 0.00010608108108108107, + "loss": 0.3367, + "step": 4315 + }, + { + "epoch": 1.2937980138654674, + "grad_norm": 0.248360276222229, + "learning_rate": 0.00010603603603603603, + "loss": 0.3693, + "step": 4316 + }, + { + "epoch": 1.294097807757167, + "grad_norm": 0.21649615466594696, + "learning_rate": 0.00010599099099099098, + "loss": 0.3542, + "step": 4317 + }, + { + "epoch": 1.2943976016488663, + "grad_norm": 0.20109300315380096, + "learning_rate": 0.00010594594594594594, + "loss": 0.3314, + "step": 4318 + }, + { + "epoch": 1.2946973955405658, + "grad_norm": 0.2151678502559662, + "learning_rate": 0.0001059009009009009, + "loss": 0.344, + "step": 4319 + }, + { + "epoch": 1.2949971894322654, + "grad_norm": 0.2119542509317398, + "learning_rate": 0.00010585585585585584, + "loss": 0.341, + "step": 4320 + }, + { + "epoch": 1.2952969833239647, + "grad_norm": 0.2245444804430008, + "learning_rate": 0.0001058108108108108, + "loss": 0.364, + "step": 4321 + }, + { + "epoch": 1.2955967772156642, + "grad_norm": 0.20092026889324188, + "learning_rate": 0.00010576576576576577, + "loss": 0.328, + "step": 4322 + }, + { + "epoch": 1.2958965711073636, + "grad_norm": 0.22229532897472382, + "learning_rate": 0.0001057207207207207, + "loss": 0.3758, + "step": 4323 + }, + { + "epoch": 1.296196364999063, + "grad_norm": 0.21315178275108337, + "learning_rate": 0.00010567567567567567, + "loss": 0.3523, + "step": 4324 + }, + { + "epoch": 1.2964961588907626, + "grad_norm": 0.22185546159744263, + "learning_rate": 0.00010563063063063062, + "loss": 0.3764, + "step": 4325 + }, + { + "epoch": 1.2967959527824622, + "grad_norm": 0.20400555431842804, + "learning_rate": 0.00010558558558558557, + "loss": 0.3061, + "step": 4326 + }, + { + "epoch": 1.2970957466741615, + "grad_norm": 0.22479741275310516, + "learning_rate": 0.00010554054054054053, + "loss": 0.3716, + "step": 4327 + }, + { + "epoch": 1.297395540565861, + "grad_norm": 0.2396492063999176, + "learning_rate": 0.00010549549549549548, + "loss": 0.3491, + "step": 4328 + }, + { + "epoch": 1.2976953344575604, + "grad_norm": 0.22863778471946716, + "learning_rate": 0.00010545045045045045, + "loss": 0.3699, + "step": 4329 + }, + { + "epoch": 1.29799512834926, + "grad_norm": 0.21430253982543945, + "learning_rate": 0.0001054054054054054, + "loss": 0.3682, + "step": 4330 + }, + { + "epoch": 1.2982949222409594, + "grad_norm": 0.23227596282958984, + "learning_rate": 0.00010536036036036035, + "loss": 0.3621, + "step": 4331 + }, + { + "epoch": 1.2985947161326588, + "grad_norm": 0.22787782549858093, + "learning_rate": 0.00010531531531531531, + "loss": 0.3599, + "step": 4332 + }, + { + "epoch": 1.2988945100243583, + "grad_norm": 0.21019242703914642, + "learning_rate": 0.00010527027027027026, + "loss": 0.3426, + "step": 4333 + }, + { + "epoch": 1.2991943039160576, + "grad_norm": 0.22081467509269714, + "learning_rate": 0.00010522522522522521, + "loss": 0.3589, + "step": 4334 + }, + { + "epoch": 1.2994940978077572, + "grad_norm": 0.21910566091537476, + "learning_rate": 0.00010518018018018018, + "loss": 0.3415, + "step": 4335 + }, + { + "epoch": 1.2997938916994567, + "grad_norm": 0.22049827873706818, + "learning_rate": 0.00010513513513513511, + "loss": 0.3602, + "step": 4336 + }, + { + "epoch": 1.300093685591156, + "grad_norm": 0.21187494695186615, + "learning_rate": 0.00010509009009009008, + "loss": 0.3446, + "step": 4337 + }, + { + "epoch": 1.3003934794828556, + "grad_norm": 0.22202420234680176, + "learning_rate": 0.00010504504504504504, + "loss": 0.3918, + "step": 4338 + }, + { + "epoch": 1.3006932733745549, + "grad_norm": 0.21001745760440826, + "learning_rate": 0.00010499999999999999, + "loss": 0.3631, + "step": 4339 + }, + { + "epoch": 1.3009930672662544, + "grad_norm": 0.21252772212028503, + "learning_rate": 0.00010495495495495494, + "loss": 0.3491, + "step": 4340 + }, + { + "epoch": 1.301292861157954, + "grad_norm": 0.22069324553012848, + "learning_rate": 0.0001049099099099099, + "loss": 0.3696, + "step": 4341 + }, + { + "epoch": 1.3015926550496535, + "grad_norm": 0.221624955534935, + "learning_rate": 0.00010486486486486486, + "loss": 0.3736, + "step": 4342 + }, + { + "epoch": 1.3018924489413528, + "grad_norm": 0.22644497454166412, + "learning_rate": 0.0001048198198198198, + "loss": 0.3855, + "step": 4343 + }, + { + "epoch": 1.3021922428330523, + "grad_norm": 0.21738597750663757, + "learning_rate": 0.00010477477477477477, + "loss": 0.3628, + "step": 4344 + }, + { + "epoch": 1.3024920367247517, + "grad_norm": 0.23884043097496033, + "learning_rate": 0.00010472972972972972, + "loss": 0.3596, + "step": 4345 + }, + { + "epoch": 1.3027918306164512, + "grad_norm": 0.21880125999450684, + "learning_rate": 0.00010468468468468468, + "loss": 0.3749, + "step": 4346 + }, + { + "epoch": 1.3030916245081507, + "grad_norm": 0.23063544929027557, + "learning_rate": 0.00010463963963963963, + "loss": 0.3672, + "step": 4347 + }, + { + "epoch": 1.30339141839985, + "grad_norm": 0.21442480385303497, + "learning_rate": 0.00010459459459459458, + "loss": 0.3772, + "step": 4348 + }, + { + "epoch": 1.3036912122915496, + "grad_norm": 0.21598872542381287, + "learning_rate": 0.00010454954954954955, + "loss": 0.3814, + "step": 4349 + }, + { + "epoch": 1.303991006183249, + "grad_norm": 0.21835830807685852, + "learning_rate": 0.00010450450450450448, + "loss": 0.3425, + "step": 4350 + }, + { + "epoch": 1.3042908000749485, + "grad_norm": 0.22444991767406464, + "learning_rate": 0.00010445945945945945, + "loss": 0.3784, + "step": 4351 + }, + { + "epoch": 1.304590593966648, + "grad_norm": 0.2318716198205948, + "learning_rate": 0.00010441441441441441, + "loss": 0.3706, + "step": 4352 + }, + { + "epoch": 1.3048903878583473, + "grad_norm": 0.21502730250358582, + "learning_rate": 0.00010436936936936935, + "loss": 0.3497, + "step": 4353 + }, + { + "epoch": 1.3051901817500469, + "grad_norm": 0.213004007935524, + "learning_rate": 0.00010432432432432431, + "loss": 0.3552, + "step": 4354 + }, + { + "epoch": 1.3054899756417462, + "grad_norm": 0.22040680050849915, + "learning_rate": 0.00010427927927927928, + "loss": 0.3719, + "step": 4355 + }, + { + "epoch": 1.3057897695334457, + "grad_norm": 0.22093693912029266, + "learning_rate": 0.00010423423423423421, + "loss": 0.349, + "step": 4356 + }, + { + "epoch": 1.3060895634251453, + "grad_norm": 0.21267752349376678, + "learning_rate": 0.00010418918918918918, + "loss": 0.3441, + "step": 4357 + }, + { + "epoch": 1.3063893573168448, + "grad_norm": 0.21792468428611755, + "learning_rate": 0.00010414414414414414, + "loss": 0.3587, + "step": 4358 + }, + { + "epoch": 1.3066891512085441, + "grad_norm": 0.2234465628862381, + "learning_rate": 0.00010409909909909909, + "loss": 0.3594, + "step": 4359 + }, + { + "epoch": 1.3069889451002437, + "grad_norm": 0.22853820025920868, + "learning_rate": 0.00010405405405405404, + "loss": 0.396, + "step": 4360 + }, + { + "epoch": 1.307288738991943, + "grad_norm": 0.21059495210647583, + "learning_rate": 0.00010400900900900899, + "loss": 0.3778, + "step": 4361 + }, + { + "epoch": 1.3075885328836425, + "grad_norm": 0.23244240880012512, + "learning_rate": 0.00010396396396396396, + "loss": 0.3898, + "step": 4362 + }, + { + "epoch": 1.307888326775342, + "grad_norm": 0.21645265817642212, + "learning_rate": 0.00010391891891891892, + "loss": 0.3388, + "step": 4363 + }, + { + "epoch": 1.3081881206670414, + "grad_norm": 0.22847887873649597, + "learning_rate": 0.00010387387387387386, + "loss": 0.37, + "step": 4364 + }, + { + "epoch": 1.308487914558741, + "grad_norm": 0.20887666940689087, + "learning_rate": 0.00010382882882882882, + "loss": 0.3496, + "step": 4365 + }, + { + "epoch": 1.3087877084504402, + "grad_norm": 0.21099483966827393, + "learning_rate": 0.00010378378378378378, + "loss": 0.362, + "step": 4366 + }, + { + "epoch": 1.3090875023421398, + "grad_norm": 0.2116171419620514, + "learning_rate": 0.00010373873873873872, + "loss": 0.3456, + "step": 4367 + }, + { + "epoch": 1.3093872962338393, + "grad_norm": 0.19885142147541046, + "learning_rate": 0.00010369369369369369, + "loss": 0.3329, + "step": 4368 + }, + { + "epoch": 1.3096870901255386, + "grad_norm": 0.21213248372077942, + "learning_rate": 0.00010364864864864865, + "loss": 0.3403, + "step": 4369 + }, + { + "epoch": 1.3099868840172382, + "grad_norm": 0.2295825481414795, + "learning_rate": 0.00010360360360360359, + "loss": 0.3823, + "step": 4370 + }, + { + "epoch": 1.3102866779089375, + "grad_norm": 0.2063182145357132, + "learning_rate": 0.00010355855855855855, + "loss": 0.3421, + "step": 4371 + }, + { + "epoch": 1.310586471800637, + "grad_norm": 0.20554274320602417, + "learning_rate": 0.0001035135135135135, + "loss": 0.3638, + "step": 4372 + }, + { + "epoch": 1.3108862656923366, + "grad_norm": 0.21071697771549225, + "learning_rate": 0.00010346846846846845, + "loss": 0.3582, + "step": 4373 + }, + { + "epoch": 1.3111860595840361, + "grad_norm": 0.21579106152057648, + "learning_rate": 0.00010342342342342341, + "loss": 0.3512, + "step": 4374 + }, + { + "epoch": 1.3114858534757354, + "grad_norm": 0.21299222111701965, + "learning_rate": 0.00010337837837837836, + "loss": 0.3424, + "step": 4375 + }, + { + "epoch": 1.311785647367435, + "grad_norm": 0.21030524373054504, + "learning_rate": 0.00010333333333333333, + "loss": 0.3581, + "step": 4376 + }, + { + "epoch": 1.3120854412591343, + "grad_norm": 0.21630938351154327, + "learning_rate": 0.00010328828828828828, + "loss": 0.3747, + "step": 4377 + }, + { + "epoch": 1.3123852351508338, + "grad_norm": 0.22339695692062378, + "learning_rate": 0.00010324324324324323, + "loss": 0.3833, + "step": 4378 + }, + { + "epoch": 1.3126850290425334, + "grad_norm": 0.21773970127105713, + "learning_rate": 0.0001031981981981982, + "loss": 0.3729, + "step": 4379 + }, + { + "epoch": 1.3129848229342327, + "grad_norm": 0.2207316905260086, + "learning_rate": 0.00010315315315315316, + "loss": 0.3553, + "step": 4380 + }, + { + "epoch": 1.3132846168259322, + "grad_norm": 0.21742987632751465, + "learning_rate": 0.0001031081081081081, + "loss": 0.3674, + "step": 4381 + }, + { + "epoch": 1.3135844107176315, + "grad_norm": 0.21643692255020142, + "learning_rate": 0.00010306306306306306, + "loss": 0.3557, + "step": 4382 + }, + { + "epoch": 1.313884204609331, + "grad_norm": 0.2156946212053299, + "learning_rate": 0.00010301801801801802, + "loss": 0.3688, + "step": 4383 + }, + { + "epoch": 1.3141839985010306, + "grad_norm": 0.21070463955402374, + "learning_rate": 0.00010297297297297296, + "loss": 0.3524, + "step": 4384 + }, + { + "epoch": 1.31448379239273, + "grad_norm": 0.21057476103305817, + "learning_rate": 0.00010292792792792792, + "loss": 0.3716, + "step": 4385 + }, + { + "epoch": 1.3147835862844295, + "grad_norm": 0.2108742743730545, + "learning_rate": 0.00010288288288288287, + "loss": 0.3596, + "step": 4386 + }, + { + "epoch": 1.3150833801761288, + "grad_norm": 0.22613100707530975, + "learning_rate": 0.00010283783783783782, + "loss": 0.3755, + "step": 4387 + }, + { + "epoch": 1.3153831740678283, + "grad_norm": 0.21166808903217316, + "learning_rate": 0.00010279279279279279, + "loss": 0.3528, + "step": 4388 + }, + { + "epoch": 1.3156829679595279, + "grad_norm": 0.22526302933692932, + "learning_rate": 0.00010274774774774774, + "loss": 0.3857, + "step": 4389 + }, + { + "epoch": 1.3159827618512272, + "grad_norm": 0.22209607064723969, + "learning_rate": 0.00010270270270270269, + "loss": 0.3613, + "step": 4390 + }, + { + "epoch": 1.3162825557429267, + "grad_norm": 0.22119589149951935, + "learning_rate": 0.00010265765765765765, + "loss": 0.3781, + "step": 4391 + }, + { + "epoch": 1.316582349634626, + "grad_norm": 0.2123972475528717, + "learning_rate": 0.0001026126126126126, + "loss": 0.3425, + "step": 4392 + }, + { + "epoch": 1.3168821435263256, + "grad_norm": 0.22913959622383118, + "learning_rate": 0.00010256756756756757, + "loss": 0.3951, + "step": 4393 + }, + { + "epoch": 1.3171819374180251, + "grad_norm": 0.2116348147392273, + "learning_rate": 0.00010252252252252252, + "loss": 0.3619, + "step": 4394 + }, + { + "epoch": 1.3174817313097247, + "grad_norm": 0.2163851112127304, + "learning_rate": 0.00010247747747747747, + "loss": 0.3378, + "step": 4395 + }, + { + "epoch": 1.317781525201424, + "grad_norm": 0.22200891375541687, + "learning_rate": 0.00010243243243243243, + "loss": 0.3685, + "step": 4396 + }, + { + "epoch": 1.3180813190931235, + "grad_norm": 0.2236941158771515, + "learning_rate": 0.00010238738738738737, + "loss": 0.3597, + "step": 4397 + }, + { + "epoch": 1.3183811129848229, + "grad_norm": 0.20215103030204773, + "learning_rate": 0.00010234234234234233, + "loss": 0.3534, + "step": 4398 + }, + { + "epoch": 1.3186809068765224, + "grad_norm": 0.2127383053302765, + "learning_rate": 0.0001022972972972973, + "loss": 0.3462, + "step": 4399 + }, + { + "epoch": 1.318980700768222, + "grad_norm": 0.21308203041553497, + "learning_rate": 0.00010225225225225223, + "loss": 0.3411, + "step": 4400 + }, + { + "epoch": 1.3192804946599213, + "grad_norm": 0.21060781180858612, + "learning_rate": 0.0001022072072072072, + "loss": 0.3585, + "step": 4401 + }, + { + "epoch": 1.3195802885516208, + "grad_norm": 0.2206219881772995, + "learning_rate": 0.00010216216216216216, + "loss": 0.3643, + "step": 4402 + }, + { + "epoch": 1.3198800824433201, + "grad_norm": 0.2104436755180359, + "learning_rate": 0.00010211711711711711, + "loss": 0.3727, + "step": 4403 + }, + { + "epoch": 1.3201798763350197, + "grad_norm": 0.21713684499263763, + "learning_rate": 0.00010207207207207206, + "loss": 0.3579, + "step": 4404 + }, + { + "epoch": 1.3204796702267192, + "grad_norm": 0.19761869311332703, + "learning_rate": 0.00010202702702702702, + "loss": 0.3367, + "step": 4405 + }, + { + "epoch": 1.3207794641184185, + "grad_norm": 0.21974486112594604, + "learning_rate": 0.00010198198198198197, + "loss": 0.3823, + "step": 4406 + }, + { + "epoch": 1.321079258010118, + "grad_norm": 0.21158014237880707, + "learning_rate": 0.00010193693693693692, + "loss": 0.3702, + "step": 4407 + }, + { + "epoch": 1.3213790519018174, + "grad_norm": 0.21842747926712036, + "learning_rate": 0.00010189189189189187, + "loss": 0.3489, + "step": 4408 + }, + { + "epoch": 1.321678845793517, + "grad_norm": 0.22330352663993835, + "learning_rate": 0.00010184684684684684, + "loss": 0.3434, + "step": 4409 + }, + { + "epoch": 1.3219786396852165, + "grad_norm": 0.20083275437355042, + "learning_rate": 0.0001018018018018018, + "loss": 0.3297, + "step": 4410 + }, + { + "epoch": 1.322278433576916, + "grad_norm": 0.2106667160987854, + "learning_rate": 0.00010175675675675674, + "loss": 0.3613, + "step": 4411 + }, + { + "epoch": 1.3225782274686153, + "grad_norm": 0.21465076506137848, + "learning_rate": 0.0001017117117117117, + "loss": 0.357, + "step": 4412 + }, + { + "epoch": 1.3228780213603148, + "grad_norm": 0.21720650792121887, + "learning_rate": 0.00010166666666666667, + "loss": 0.3498, + "step": 4413 + }, + { + "epoch": 1.3231778152520142, + "grad_norm": 0.20751236379146576, + "learning_rate": 0.0001016216216216216, + "loss": 0.3534, + "step": 4414 + }, + { + "epoch": 1.3234776091437137, + "grad_norm": 0.21749740839004517, + "learning_rate": 0.00010157657657657657, + "loss": 0.3816, + "step": 4415 + }, + { + "epoch": 1.3237774030354132, + "grad_norm": 0.21746374666690826, + "learning_rate": 0.00010153153153153153, + "loss": 0.3583, + "step": 4416 + }, + { + "epoch": 1.3240771969271126, + "grad_norm": 0.21947112679481506, + "learning_rate": 0.00010148648648648647, + "loss": 0.3692, + "step": 4417 + }, + { + "epoch": 1.324376990818812, + "grad_norm": 0.2228492647409439, + "learning_rate": 0.00010144144144144143, + "loss": 0.3693, + "step": 4418 + }, + { + "epoch": 1.3246767847105114, + "grad_norm": 0.21328027546405792, + "learning_rate": 0.0001013963963963964, + "loss": 0.3658, + "step": 4419 + }, + { + "epoch": 1.324976578602211, + "grad_norm": 0.22100898623466492, + "learning_rate": 0.00010135135135135135, + "loss": 0.3682, + "step": 4420 + }, + { + "epoch": 1.3252763724939105, + "grad_norm": 0.23616699874401093, + "learning_rate": 0.0001013063063063063, + "loss": 0.3875, + "step": 4421 + }, + { + "epoch": 1.3255761663856098, + "grad_norm": 0.2213396579027176, + "learning_rate": 0.00010126126126126125, + "loss": 0.3697, + "step": 4422 + }, + { + "epoch": 1.3258759602773094, + "grad_norm": 0.21712253987789154, + "learning_rate": 0.00010121621621621621, + "loss": 0.3588, + "step": 4423 + }, + { + "epoch": 1.3261757541690087, + "grad_norm": 0.21829205751419067, + "learning_rate": 0.00010117117117117116, + "loss": 0.3518, + "step": 4424 + }, + { + "epoch": 1.3264755480607082, + "grad_norm": 0.21578383445739746, + "learning_rate": 0.00010112612612612611, + "loss": 0.3846, + "step": 4425 + }, + { + "epoch": 1.3267753419524078, + "grad_norm": 0.21372157335281372, + "learning_rate": 0.00010108108108108108, + "loss": 0.3482, + "step": 4426 + }, + { + "epoch": 1.3270751358441073, + "grad_norm": 0.22385689616203308, + "learning_rate": 0.00010103603603603604, + "loss": 0.3743, + "step": 4427 + }, + { + "epoch": 1.3273749297358066, + "grad_norm": 0.22532759606838226, + "learning_rate": 0.00010099099099099098, + "loss": 0.3685, + "step": 4428 + }, + { + "epoch": 1.3276747236275062, + "grad_norm": 0.22591564059257507, + "learning_rate": 0.00010094594594594594, + "loss": 0.3879, + "step": 4429 + }, + { + "epoch": 1.3279745175192055, + "grad_norm": 0.2188117355108261, + "learning_rate": 0.0001009009009009009, + "loss": 0.361, + "step": 4430 + }, + { + "epoch": 1.328274311410905, + "grad_norm": 0.21723529696464539, + "learning_rate": 0.00010085585585585584, + "loss": 0.3612, + "step": 4431 + }, + { + "epoch": 1.3285741053026046, + "grad_norm": 0.21275779604911804, + "learning_rate": 0.0001008108108108108, + "loss": 0.3656, + "step": 4432 + }, + { + "epoch": 1.3288738991943039, + "grad_norm": 0.2086101621389389, + "learning_rate": 0.00010076576576576575, + "loss": 0.3599, + "step": 4433 + }, + { + "epoch": 1.3291736930860034, + "grad_norm": 0.21776525676250458, + "learning_rate": 0.0001007207207207207, + "loss": 0.3638, + "step": 4434 + }, + { + "epoch": 1.3294734869777027, + "grad_norm": 0.21323978900909424, + "learning_rate": 0.00010067567567567567, + "loss": 0.3695, + "step": 4435 + }, + { + "epoch": 1.3297732808694023, + "grad_norm": 0.2201107293367386, + "learning_rate": 0.00010063063063063062, + "loss": 0.3821, + "step": 4436 + }, + { + "epoch": 1.3300730747611018, + "grad_norm": 0.22386541962623596, + "learning_rate": 0.00010058558558558558, + "loss": 0.3608, + "step": 4437 + }, + { + "epoch": 1.3303728686528011, + "grad_norm": 0.2155165672302246, + "learning_rate": 0.00010054054054054053, + "loss": 0.3562, + "step": 4438 + }, + { + "epoch": 1.3306726625445007, + "grad_norm": 0.21298325061798096, + "learning_rate": 0.00010049549549549548, + "loss": 0.3521, + "step": 4439 + }, + { + "epoch": 1.3309724564362, + "grad_norm": 0.2154662311077118, + "learning_rate": 0.00010045045045045045, + "loss": 0.3765, + "step": 4440 + }, + { + "epoch": 1.3312722503278995, + "grad_norm": 0.20846322178840637, + "learning_rate": 0.0001004054054054054, + "loss": 0.3715, + "step": 4441 + }, + { + "epoch": 1.331572044219599, + "grad_norm": 0.21915653347969055, + "learning_rate": 0.00010036036036036035, + "loss": 0.3246, + "step": 4442 + }, + { + "epoch": 1.3318718381112986, + "grad_norm": 0.2159089297056198, + "learning_rate": 0.00010031531531531531, + "loss": 0.3636, + "step": 4443 + }, + { + "epoch": 1.332171632002998, + "grad_norm": 0.22735199332237244, + "learning_rate": 0.00010027027027027025, + "loss": 0.3687, + "step": 4444 + }, + { + "epoch": 1.3324714258946975, + "grad_norm": 0.218096524477005, + "learning_rate": 0.00010022522522522521, + "loss": 0.3914, + "step": 4445 + }, + { + "epoch": 1.3327712197863968, + "grad_norm": 0.21156595647335052, + "learning_rate": 0.00010018018018018018, + "loss": 0.3915, + "step": 4446 + }, + { + "epoch": 1.3330710136780963, + "grad_norm": 0.2183390110731125, + "learning_rate": 0.00010013513513513511, + "loss": 0.3611, + "step": 4447 + }, + { + "epoch": 1.3333708075697959, + "grad_norm": 0.21990258991718292, + "learning_rate": 0.00010009009009009008, + "loss": 0.3614, + "step": 4448 + }, + { + "epoch": 1.3336706014614952, + "grad_norm": 0.21382641792297363, + "learning_rate": 0.00010004504504504504, + "loss": 0.3541, + "step": 4449 + }, + { + "epoch": 1.3339703953531947, + "grad_norm": 0.23284634947776794, + "learning_rate": 9.999999999999999e-05, + "loss": 0.3694, + "step": 4450 + }, + { + "epoch": 1.334270189244894, + "grad_norm": 0.22020602226257324, + "learning_rate": 9.995495495495494e-05, + "loss": 0.3559, + "step": 4451 + }, + { + "epoch": 1.3345699831365936, + "grad_norm": 0.2088702917098999, + "learning_rate": 9.99099099099099e-05, + "loss": 0.3566, + "step": 4452 + }, + { + "epoch": 1.3348697770282931, + "grad_norm": 0.26016759872436523, + "learning_rate": 9.986486486486486e-05, + "loss": 0.3741, + "step": 4453 + }, + { + "epoch": 1.3351695709199924, + "grad_norm": 0.22935625910758972, + "learning_rate": 9.981981981981982e-05, + "loss": 0.3789, + "step": 4454 + }, + { + "epoch": 1.335469364811692, + "grad_norm": 0.2152242511510849, + "learning_rate": 9.977477477477477e-05, + "loss": 0.3712, + "step": 4455 + }, + { + "epoch": 1.3357691587033913, + "grad_norm": 0.2259458303451538, + "learning_rate": 9.972972972972972e-05, + "loss": 0.3712, + "step": 4456 + }, + { + "epoch": 1.3360689525950908, + "grad_norm": 0.21118693053722382, + "learning_rate": 9.968468468468468e-05, + "loss": 0.3546, + "step": 4457 + }, + { + "epoch": 1.3363687464867904, + "grad_norm": 0.21529187262058258, + "learning_rate": 9.963963963963962e-05, + "loss": 0.3721, + "step": 4458 + }, + { + "epoch": 1.33666854037849, + "grad_norm": 0.21422810852527618, + "learning_rate": 9.959459459459458e-05, + "loss": 0.3645, + "step": 4459 + }, + { + "epoch": 1.3369683342701892, + "grad_norm": 0.2009521722793579, + "learning_rate": 9.954954954954955e-05, + "loss": 0.3561, + "step": 4460 + }, + { + "epoch": 1.3372681281618888, + "grad_norm": 0.22312965989112854, + "learning_rate": 9.950450450450449e-05, + "loss": 0.3546, + "step": 4461 + }, + { + "epoch": 1.337567922053588, + "grad_norm": 0.21248088777065277, + "learning_rate": 9.945945945945945e-05, + "loss": 0.3603, + "step": 4462 + }, + { + "epoch": 1.3378677159452876, + "grad_norm": 0.21879053115844727, + "learning_rate": 9.941441441441441e-05, + "loss": 0.3847, + "step": 4463 + }, + { + "epoch": 1.3381675098369872, + "grad_norm": 0.2076069414615631, + "learning_rate": 9.936936936936935e-05, + "loss": 0.3678, + "step": 4464 + }, + { + "epoch": 1.3384673037286865, + "grad_norm": 0.19983349740505219, + "learning_rate": 9.932432432432431e-05, + "loss": 0.3483, + "step": 4465 + }, + { + "epoch": 1.338767097620386, + "grad_norm": 0.20354819297790527, + "learning_rate": 9.927927927927928e-05, + "loss": 0.3098, + "step": 4466 + }, + { + "epoch": 1.3390668915120854, + "grad_norm": 0.2120402306318283, + "learning_rate": 9.923423423423423e-05, + "loss": 0.3742, + "step": 4467 + }, + { + "epoch": 1.339366685403785, + "grad_norm": 0.20712968707084656, + "learning_rate": 9.918918918918918e-05, + "loss": 0.3682, + "step": 4468 + }, + { + "epoch": 1.3396664792954844, + "grad_norm": 0.205884650349617, + "learning_rate": 9.914414414414413e-05, + "loss": 0.3232, + "step": 4469 + }, + { + "epoch": 1.3399662731871838, + "grad_norm": 0.20404751598834991, + "learning_rate": 9.909909909909909e-05, + "loss": 0.3413, + "step": 4470 + }, + { + "epoch": 1.3402660670788833, + "grad_norm": 0.20457707345485687, + "learning_rate": 9.905405405405404e-05, + "loss": 0.3455, + "step": 4471 + }, + { + "epoch": 1.3405658609705826, + "grad_norm": 0.20822665095329285, + "learning_rate": 9.900900900900899e-05, + "loss": 0.3559, + "step": 4472 + }, + { + "epoch": 1.3408656548622822, + "grad_norm": 0.21473756432533264, + "learning_rate": 9.896396396396396e-05, + "loss": 0.3723, + "step": 4473 + }, + { + "epoch": 1.3411654487539817, + "grad_norm": 0.22079592943191528, + "learning_rate": 9.891891891891892e-05, + "loss": 0.3715, + "step": 4474 + }, + { + "epoch": 1.3414652426456812, + "grad_norm": 0.22699683904647827, + "learning_rate": 9.887387387387386e-05, + "loss": 0.4057, + "step": 4475 + }, + { + "epoch": 1.3417650365373806, + "grad_norm": 0.20466183125972748, + "learning_rate": 9.882882882882882e-05, + "loss": 0.3454, + "step": 4476 + }, + { + "epoch": 1.3420648304290799, + "grad_norm": 0.2107648253440857, + "learning_rate": 9.878378378378379e-05, + "loss": 0.3393, + "step": 4477 + }, + { + "epoch": 1.3423646243207794, + "grad_norm": 0.20453116297721863, + "learning_rate": 9.873873873873872e-05, + "loss": 0.3393, + "step": 4478 + }, + { + "epoch": 1.342664418212479, + "grad_norm": 0.21234673261642456, + "learning_rate": 9.869369369369369e-05, + "loss": 0.3625, + "step": 4479 + }, + { + "epoch": 1.3429642121041785, + "grad_norm": 0.2178509384393692, + "learning_rate": 9.864864864864865e-05, + "loss": 0.351, + "step": 4480 + }, + { + "epoch": 1.3432640059958778, + "grad_norm": 0.2194160670042038, + "learning_rate": 9.860360360360359e-05, + "loss": 0.3498, + "step": 4481 + }, + { + "epoch": 1.3435637998875773, + "grad_norm": 0.19866818189620972, + "learning_rate": 9.855855855855855e-05, + "loss": 0.3126, + "step": 4482 + }, + { + "epoch": 1.3438635937792767, + "grad_norm": 0.20963077247142792, + "learning_rate": 9.85135135135135e-05, + "loss": 0.3505, + "step": 4483 + }, + { + "epoch": 1.3441633876709762, + "grad_norm": 0.23428975045681, + "learning_rate": 9.846846846846846e-05, + "loss": 0.3655, + "step": 4484 + }, + { + "epoch": 1.3444631815626757, + "grad_norm": 0.21178551018238068, + "learning_rate": 9.842342342342342e-05, + "loss": 0.3542, + "step": 4485 + }, + { + "epoch": 1.344762975454375, + "grad_norm": 0.20787645876407623, + "learning_rate": 9.837837837837837e-05, + "loss": 0.3543, + "step": 4486 + }, + { + "epoch": 1.3450627693460746, + "grad_norm": 0.2064124494791031, + "learning_rate": 9.833333333333333e-05, + "loss": 0.3534, + "step": 4487 + }, + { + "epoch": 1.345362563237774, + "grad_norm": 0.2198820412158966, + "learning_rate": 9.828828828828828e-05, + "loss": 0.3539, + "step": 4488 + }, + { + "epoch": 1.3456623571294735, + "grad_norm": 0.21668361127376556, + "learning_rate": 9.824324324324323e-05, + "loss": 0.3707, + "step": 4489 + }, + { + "epoch": 1.345962151021173, + "grad_norm": 0.21410995721817017, + "learning_rate": 9.81981981981982e-05, + "loss": 0.3572, + "step": 4490 + }, + { + "epoch": 1.3462619449128723, + "grad_norm": 0.21897783875465393, + "learning_rate": 9.815315315315316e-05, + "loss": 0.3775, + "step": 4491 + }, + { + "epoch": 1.3465617388045719, + "grad_norm": 0.20904091000556946, + "learning_rate": 9.81081081081081e-05, + "loss": 0.3521, + "step": 4492 + }, + { + "epoch": 1.3468615326962712, + "grad_norm": 0.23392531275749207, + "learning_rate": 9.806306306306306e-05, + "loss": 0.3709, + "step": 4493 + }, + { + "epoch": 1.3471613265879707, + "grad_norm": 0.21514087915420532, + "learning_rate": 9.801801801801801e-05, + "loss": 0.3661, + "step": 4494 + }, + { + "epoch": 1.3474611204796703, + "grad_norm": 0.2174890786409378, + "learning_rate": 9.797297297297296e-05, + "loss": 0.3611, + "step": 4495 + }, + { + "epoch": 1.3477609143713698, + "grad_norm": 0.21337510645389557, + "learning_rate": 9.792792792792792e-05, + "loss": 0.3856, + "step": 4496 + }, + { + "epoch": 1.3480607082630691, + "grad_norm": 0.22505298256874084, + "learning_rate": 9.788288288288287e-05, + "loss": 0.3736, + "step": 4497 + }, + { + "epoch": 1.3483605021547687, + "grad_norm": 0.21831883490085602, + "learning_rate": 9.783783783783782e-05, + "loss": 0.3548, + "step": 4498 + }, + { + "epoch": 1.348660296046468, + "grad_norm": 0.2187795341014862, + "learning_rate": 9.779279279279279e-05, + "loss": 0.3482, + "step": 4499 + }, + { + "epoch": 1.3489600899381675, + "grad_norm": 0.21773041784763336, + "learning_rate": 9.774774774774774e-05, + "loss": 0.3732, + "step": 4500 + }, + { + "epoch": 1.3489600899381675, + "eval_loss": 0.4129393398761749, + "eval_runtime": 565.7837, + "eval_samples_per_second": 3.816, + "eval_steps_per_second": 0.477, + "step": 4500 + }, + { + "epoch": 1.349259883829867, + "grad_norm": 0.20802539587020874, + "learning_rate": 9.77027027027027e-05, + "loss": 0.3597, + "step": 4501 + }, + { + "epoch": 1.3495596777215664, + "grad_norm": 0.2075861245393753, + "learning_rate": 9.765765765765765e-05, + "loss": 0.3471, + "step": 4502 + }, + { + "epoch": 1.349859471613266, + "grad_norm": 0.2349012792110443, + "learning_rate": 9.76126126126126e-05, + "loss": 0.3839, + "step": 4503 + }, + { + "epoch": 1.3501592655049652, + "grad_norm": 0.22266164422035217, + "learning_rate": 9.756756756756757e-05, + "loss": 0.3602, + "step": 4504 + }, + { + "epoch": 1.3504590593966648, + "grad_norm": 0.23045696318149567, + "learning_rate": 9.75225225225225e-05, + "loss": 0.379, + "step": 4505 + }, + { + "epoch": 1.3507588532883643, + "grad_norm": 0.2137947827577591, + "learning_rate": 9.747747747747747e-05, + "loss": 0.3318, + "step": 4506 + }, + { + "epoch": 1.3510586471800636, + "grad_norm": 0.21073168516159058, + "learning_rate": 9.743243243243243e-05, + "loss": 0.3563, + "step": 4507 + }, + { + "epoch": 1.3513584410717632, + "grad_norm": 0.21682259440422058, + "learning_rate": 9.738738738738737e-05, + "loss": 0.356, + "step": 4508 + }, + { + "epoch": 1.3516582349634625, + "grad_norm": 0.2167338877916336, + "learning_rate": 9.734234234234233e-05, + "loss": 0.3348, + "step": 4509 + }, + { + "epoch": 1.351958028855162, + "grad_norm": 0.21236591041088104, + "learning_rate": 9.72972972972973e-05, + "loss": 0.3618, + "step": 4510 + }, + { + "epoch": 1.3522578227468616, + "grad_norm": 0.20849741995334625, + "learning_rate": 9.725225225225223e-05, + "loss": 0.3405, + "step": 4511 + }, + { + "epoch": 1.352557616638561, + "grad_norm": 0.20645050704479218, + "learning_rate": 9.72072072072072e-05, + "loss": 0.3501, + "step": 4512 + }, + { + "epoch": 1.3528574105302604, + "grad_norm": 0.2172173261642456, + "learning_rate": 9.716216216216216e-05, + "loss": 0.3414, + "step": 4513 + }, + { + "epoch": 1.35315720442196, + "grad_norm": 0.21994781494140625, + "learning_rate": 9.711711711711711e-05, + "loss": 0.3745, + "step": 4514 + }, + { + "epoch": 1.3534569983136593, + "grad_norm": 0.22667646408081055, + "learning_rate": 9.707207207207206e-05, + "loss": 0.363, + "step": 4515 + }, + { + "epoch": 1.3537567922053588, + "grad_norm": 0.21792596578598022, + "learning_rate": 9.702702702702702e-05, + "loss": 0.364, + "step": 4516 + }, + { + "epoch": 1.3540565860970584, + "grad_norm": 0.216531440615654, + "learning_rate": 9.698198198198197e-05, + "loss": 0.3672, + "step": 4517 + }, + { + "epoch": 1.3543563799887577, + "grad_norm": 0.20364072918891907, + "learning_rate": 9.693693693693694e-05, + "loss": 0.3262, + "step": 4518 + }, + { + "epoch": 1.3546561738804572, + "grad_norm": 0.22151176631450653, + "learning_rate": 9.689189189189187e-05, + "loss": 0.3748, + "step": 4519 + }, + { + "epoch": 1.3549559677721565, + "grad_norm": 0.21700510382652283, + "learning_rate": 9.684684684684684e-05, + "loss": 0.3585, + "step": 4520 + }, + { + "epoch": 1.355255761663856, + "grad_norm": 0.21197721362113953, + "learning_rate": 9.68018018018018e-05, + "loss": 0.3448, + "step": 4521 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.21541689336299896, + "learning_rate": 9.675675675675674e-05, + "loss": 0.3622, + "step": 4522 + }, + { + "epoch": 1.355855349447255, + "grad_norm": 0.2156587541103363, + "learning_rate": 9.67117117117117e-05, + "loss": 0.363, + "step": 4523 + }, + { + "epoch": 1.3561551433389545, + "grad_norm": 0.21375496685504913, + "learning_rate": 9.666666666666667e-05, + "loss": 0.3576, + "step": 4524 + }, + { + "epoch": 1.3564549372306538, + "grad_norm": 0.208513543009758, + "learning_rate": 9.66216216216216e-05, + "loss": 0.3604, + "step": 4525 + }, + { + "epoch": 1.3567547311223533, + "grad_norm": 0.2164967805147171, + "learning_rate": 9.657657657657657e-05, + "loss": 0.3814, + "step": 4526 + }, + { + "epoch": 1.3570545250140529, + "grad_norm": 0.2145373374223709, + "learning_rate": 9.653153153153153e-05, + "loss": 0.3636, + "step": 4527 + }, + { + "epoch": 1.3573543189057524, + "grad_norm": 0.20882003009319305, + "learning_rate": 9.648648648648647e-05, + "loss": 0.347, + "step": 4528 + }, + { + "epoch": 1.3576541127974517, + "grad_norm": 0.2010725885629654, + "learning_rate": 9.644144144144143e-05, + "loss": 0.3423, + "step": 4529 + }, + { + "epoch": 1.3579539066891513, + "grad_norm": 0.2104565054178238, + "learning_rate": 9.639639639639638e-05, + "loss": 0.3505, + "step": 4530 + }, + { + "epoch": 1.3582537005808506, + "grad_norm": 0.2170221358537674, + "learning_rate": 9.635135135135135e-05, + "loss": 0.3708, + "step": 4531 + }, + { + "epoch": 1.3585534944725501, + "grad_norm": 0.21563062071800232, + "learning_rate": 9.63063063063063e-05, + "loss": 0.3764, + "step": 4532 + }, + { + "epoch": 1.3588532883642497, + "grad_norm": 0.21485255658626556, + "learning_rate": 9.626126126126125e-05, + "loss": 0.3438, + "step": 4533 + }, + { + "epoch": 1.359153082255949, + "grad_norm": 0.20779499411582947, + "learning_rate": 9.621621621621621e-05, + "loss": 0.3597, + "step": 4534 + }, + { + "epoch": 1.3594528761476485, + "grad_norm": 0.22415833175182343, + "learning_rate": 9.617117117117117e-05, + "loss": 0.3897, + "step": 4535 + }, + { + "epoch": 1.3597526700393479, + "grad_norm": 0.21616344153881073, + "learning_rate": 9.612612612612611e-05, + "loss": 0.3397, + "step": 4536 + }, + { + "epoch": 1.3600524639310474, + "grad_norm": 0.20136888325214386, + "learning_rate": 9.608108108108108e-05, + "loss": 0.3543, + "step": 4537 + }, + { + "epoch": 1.360352257822747, + "grad_norm": 0.21425513923168182, + "learning_rate": 9.603603603603604e-05, + "loss": 0.365, + "step": 4538 + }, + { + "epoch": 1.3606520517144463, + "grad_norm": 0.2139771580696106, + "learning_rate": 9.599099099099098e-05, + "loss": 0.3724, + "step": 4539 + }, + { + "epoch": 1.3609518456061458, + "grad_norm": 0.20793326199054718, + "learning_rate": 9.594594594594594e-05, + "loss": 0.3242, + "step": 4540 + }, + { + "epoch": 1.3612516394978451, + "grad_norm": 0.21706359088420868, + "learning_rate": 9.590090090090089e-05, + "loss": 0.3471, + "step": 4541 + }, + { + "epoch": 1.3615514333895447, + "grad_norm": 0.21347346901893616, + "learning_rate": 9.585585585585584e-05, + "loss": 0.343, + "step": 4542 + }, + { + "epoch": 1.3618512272812442, + "grad_norm": 0.22152802348136902, + "learning_rate": 9.58108108108108e-05, + "loss": 0.3857, + "step": 4543 + }, + { + "epoch": 1.3621510211729437, + "grad_norm": 0.21097399294376373, + "learning_rate": 9.576576576576575e-05, + "loss": 0.3675, + "step": 4544 + }, + { + "epoch": 1.362450815064643, + "grad_norm": 0.22488509118556976, + "learning_rate": 9.57207207207207e-05, + "loss": 0.3796, + "step": 4545 + }, + { + "epoch": 1.3627506089563426, + "grad_norm": 0.20570115745067596, + "learning_rate": 9.567567567567567e-05, + "loss": 0.3662, + "step": 4546 + }, + { + "epoch": 1.363050402848042, + "grad_norm": 0.20886462926864624, + "learning_rate": 9.563063063063062e-05, + "loss": 0.3735, + "step": 4547 + }, + { + "epoch": 1.3633501967397414, + "grad_norm": 0.22209970653057098, + "learning_rate": 9.558558558558558e-05, + "loss": 0.3648, + "step": 4548 + }, + { + "epoch": 1.363649990631441, + "grad_norm": 0.2033938765525818, + "learning_rate": 9.554054054054053e-05, + "loss": 0.3484, + "step": 4549 + }, + { + "epoch": 1.3639497845231403, + "grad_norm": 0.21983402967453003, + "learning_rate": 9.549549549549548e-05, + "loss": 0.3743, + "step": 4550 + }, + { + "epoch": 1.3642495784148398, + "grad_norm": 0.22346578538417816, + "learning_rate": 9.545045045045045e-05, + "loss": 0.3806, + "step": 4551 + }, + { + "epoch": 1.3645493723065392, + "grad_norm": 0.26459768414497375, + "learning_rate": 9.540540540540541e-05, + "loss": 0.3382, + "step": 4552 + }, + { + "epoch": 1.3648491661982387, + "grad_norm": 0.20662111043930054, + "learning_rate": 9.536036036036035e-05, + "loss": 0.3322, + "step": 4553 + }, + { + "epoch": 1.3651489600899382, + "grad_norm": 0.21967481076717377, + "learning_rate": 9.531531531531531e-05, + "loss": 0.3699, + "step": 4554 + }, + { + "epoch": 1.3654487539816376, + "grad_norm": 0.21974623203277588, + "learning_rate": 9.527027027027025e-05, + "loss": 0.3481, + "step": 4555 + }, + { + "epoch": 1.365748547873337, + "grad_norm": 0.19991014897823334, + "learning_rate": 9.522522522522521e-05, + "loss": 0.3348, + "step": 4556 + }, + { + "epoch": 1.3660483417650364, + "grad_norm": 0.22287118434906006, + "learning_rate": 9.518018018018018e-05, + "loss": 0.3791, + "step": 4557 + }, + { + "epoch": 1.366348135656736, + "grad_norm": 0.22346952557563782, + "learning_rate": 9.513513513513513e-05, + "loss": 0.3663, + "step": 4558 + }, + { + "epoch": 1.3666479295484355, + "grad_norm": 0.2181125432252884, + "learning_rate": 9.509009009009008e-05, + "loss": 0.3678, + "step": 4559 + }, + { + "epoch": 1.366947723440135, + "grad_norm": 0.2311943769454956, + "learning_rate": 9.504504504504504e-05, + "loss": 0.3705, + "step": 4560 + }, + { + "epoch": 1.3672475173318344, + "grad_norm": 0.20911650359630585, + "learning_rate": 9.499999999999999e-05, + "loss": 0.3471, + "step": 4561 + }, + { + "epoch": 1.367547311223534, + "grad_norm": 0.22261382639408112, + "learning_rate": 9.495495495495494e-05, + "loss": 0.3691, + "step": 4562 + }, + { + "epoch": 1.3678471051152332, + "grad_norm": 0.2156069576740265, + "learning_rate": 9.49099099099099e-05, + "loss": 0.3537, + "step": 4563 + }, + { + "epoch": 1.3681468990069328, + "grad_norm": 0.21138927340507507, + "learning_rate": 9.486486486486486e-05, + "loss": 0.3507, + "step": 4564 + }, + { + "epoch": 1.3684466928986323, + "grad_norm": 0.2109546810388565, + "learning_rate": 9.481981981981982e-05, + "loss": 0.3669, + "step": 4565 + }, + { + "epoch": 1.3687464867903316, + "grad_norm": 0.2068871259689331, + "learning_rate": 9.477477477477476e-05, + "loss": 0.3529, + "step": 4566 + }, + { + "epoch": 1.3690462806820312, + "grad_norm": 0.21869945526123047, + "learning_rate": 9.472972972972972e-05, + "loss": 0.3913, + "step": 4567 + }, + { + "epoch": 1.3693460745737305, + "grad_norm": 0.2108869105577469, + "learning_rate": 9.468468468468468e-05, + "loss": 0.3622, + "step": 4568 + }, + { + "epoch": 1.36964586846543, + "grad_norm": 0.21931153535842896, + "learning_rate": 9.463963963963962e-05, + "loss": 0.3598, + "step": 4569 + }, + { + "epoch": 1.3699456623571296, + "grad_norm": 0.2235441952943802, + "learning_rate": 9.459459459459459e-05, + "loss": 0.3696, + "step": 4570 + }, + { + "epoch": 1.3702454562488289, + "grad_norm": 0.21451959013938904, + "learning_rate": 9.454954954954955e-05, + "loss": 0.3565, + "step": 4571 + }, + { + "epoch": 1.3705452501405284, + "grad_norm": 0.22164882719516754, + "learning_rate": 9.450450450450449e-05, + "loss": 0.3669, + "step": 4572 + }, + { + "epoch": 1.3708450440322277, + "grad_norm": 0.20921410620212555, + "learning_rate": 9.445945945945945e-05, + "loss": 0.3552, + "step": 4573 + }, + { + "epoch": 1.3711448379239273, + "grad_norm": 0.22980685532093048, + "learning_rate": 9.441441441441441e-05, + "loss": 0.3638, + "step": 4574 + }, + { + "epoch": 1.3714446318156268, + "grad_norm": 0.20418162643909454, + "learning_rate": 9.436936936936936e-05, + "loss": 0.3414, + "step": 4575 + }, + { + "epoch": 1.3717444257073261, + "grad_norm": 0.20581036806106567, + "learning_rate": 9.432432432432431e-05, + "loss": 0.3271, + "step": 4576 + }, + { + "epoch": 1.3720442195990257, + "grad_norm": 0.21641530096530914, + "learning_rate": 9.427927927927926e-05, + "loss": 0.35, + "step": 4577 + }, + { + "epoch": 1.372344013490725, + "grad_norm": 0.21078841388225555, + "learning_rate": 9.423423423423423e-05, + "loss": 0.3677, + "step": 4578 + }, + { + "epoch": 1.3726438073824245, + "grad_norm": 0.22394929826259613, + "learning_rate": 9.418918918918918e-05, + "loss": 0.3811, + "step": 4579 + }, + { + "epoch": 1.372943601274124, + "grad_norm": 0.21448741853237152, + "learning_rate": 9.414414414414413e-05, + "loss": 0.3627, + "step": 4580 + }, + { + "epoch": 1.3732433951658236, + "grad_norm": 0.23157335817813873, + "learning_rate": 9.409909909909909e-05, + "loss": 0.3881, + "step": 4581 + }, + { + "epoch": 1.373543189057523, + "grad_norm": 0.21925771236419678, + "learning_rate": 9.405405405405406e-05, + "loss": 0.3546, + "step": 4582 + }, + { + "epoch": 1.3738429829492225, + "grad_norm": 0.21934622526168823, + "learning_rate": 9.4009009009009e-05, + "loss": 0.3487, + "step": 4583 + }, + { + "epoch": 1.3741427768409218, + "grad_norm": 0.22106409072875977, + "learning_rate": 9.396396396396396e-05, + "loss": 0.3689, + "step": 4584 + }, + { + "epoch": 1.3744425707326213, + "grad_norm": 0.2363518327474594, + "learning_rate": 9.391891891891892e-05, + "loss": 0.36, + "step": 4585 + }, + { + "epoch": 1.3747423646243209, + "grad_norm": 0.22158819437026978, + "learning_rate": 9.387387387387386e-05, + "loss": 0.37, + "step": 4586 + }, + { + "epoch": 1.3750421585160202, + "grad_norm": 0.2105867564678192, + "learning_rate": 9.382882882882882e-05, + "loss": 0.3588, + "step": 4587 + }, + { + "epoch": 1.3753419524077197, + "grad_norm": 0.22234956920146942, + "learning_rate": 9.378378378378379e-05, + "loss": 0.3523, + "step": 4588 + }, + { + "epoch": 1.375641746299419, + "grad_norm": 0.20997658371925354, + "learning_rate": 9.373873873873872e-05, + "loss": 0.3504, + "step": 4589 + }, + { + "epoch": 1.3759415401911186, + "grad_norm": 0.22327353060245514, + "learning_rate": 9.369369369369369e-05, + "loss": 0.365, + "step": 4590 + }, + { + "epoch": 1.3762413340828181, + "grad_norm": 0.218084916472435, + "learning_rate": 9.364864864864864e-05, + "loss": 0.3526, + "step": 4591 + }, + { + "epoch": 1.3765411279745174, + "grad_norm": 0.2157234251499176, + "learning_rate": 9.36036036036036e-05, + "loss": 0.3539, + "step": 4592 + }, + { + "epoch": 1.376840921866217, + "grad_norm": 0.22182735800743103, + "learning_rate": 9.355855855855855e-05, + "loss": 0.3468, + "step": 4593 + }, + { + "epoch": 1.3771407157579163, + "grad_norm": 0.21038967370986938, + "learning_rate": 9.35135135135135e-05, + "loss": 0.3549, + "step": 4594 + }, + { + "epoch": 1.3774405096496158, + "grad_norm": 0.2077571451663971, + "learning_rate": 9.346846846846847e-05, + "loss": 0.3517, + "step": 4595 + }, + { + "epoch": 1.3777403035413154, + "grad_norm": 0.20815788209438324, + "learning_rate": 9.342342342342342e-05, + "loss": 0.3568, + "step": 4596 + }, + { + "epoch": 1.378040097433015, + "grad_norm": 0.21737034618854523, + "learning_rate": 9.337837837837837e-05, + "loss": 0.3792, + "step": 4597 + }, + { + "epoch": 1.3783398913247142, + "grad_norm": 0.2084619402885437, + "learning_rate": 9.333333333333333e-05, + "loss": 0.3389, + "step": 4598 + }, + { + "epoch": 1.3786396852164138, + "grad_norm": 0.222722589969635, + "learning_rate": 9.32882882882883e-05, + "loss": 0.3812, + "step": 4599 + }, + { + "epoch": 1.378939479108113, + "grad_norm": 0.21024678647518158, + "learning_rate": 9.324324324324323e-05, + "loss": 0.3675, + "step": 4600 + }, + { + "epoch": 1.3792392729998126, + "grad_norm": 0.21897338330745697, + "learning_rate": 9.31981981981982e-05, + "loss": 0.3664, + "step": 4601 + }, + { + "epoch": 1.3795390668915122, + "grad_norm": 0.21706995368003845, + "learning_rate": 9.315315315315313e-05, + "loss": 0.346, + "step": 4602 + }, + { + "epoch": 1.3798388607832115, + "grad_norm": 0.21143165230751038, + "learning_rate": 9.31081081081081e-05, + "loss": 0.3465, + "step": 4603 + }, + { + "epoch": 1.380138654674911, + "grad_norm": 0.2121080905199051, + "learning_rate": 9.306306306306306e-05, + "loss": 0.349, + "step": 4604 + }, + { + "epoch": 1.3804384485666104, + "grad_norm": 0.21112461388111115, + "learning_rate": 9.301801801801801e-05, + "loss": 0.3558, + "step": 4605 + }, + { + "epoch": 1.38073824245831, + "grad_norm": 0.21366949379444122, + "learning_rate": 9.297297297297296e-05, + "loss": 0.3601, + "step": 4606 + }, + { + "epoch": 1.3810380363500094, + "grad_norm": 0.21894964575767517, + "learning_rate": 9.292792792792792e-05, + "loss": 0.3619, + "step": 4607 + }, + { + "epoch": 1.3813378302417088, + "grad_norm": 0.21191251277923584, + "learning_rate": 9.288288288288287e-05, + "loss": 0.3592, + "step": 4608 + }, + { + "epoch": 1.3816376241334083, + "grad_norm": 0.22147145867347717, + "learning_rate": 9.283783783783784e-05, + "loss": 0.3592, + "step": 4609 + }, + { + "epoch": 1.3819374180251076, + "grad_norm": 0.2090921849012375, + "learning_rate": 9.279279279279279e-05, + "loss": 0.3391, + "step": 4610 + }, + { + "epoch": 1.3822372119168072, + "grad_norm": 0.20255540311336517, + "learning_rate": 9.274774774774774e-05, + "loss": 0.3383, + "step": 4611 + }, + { + "epoch": 1.3825370058085067, + "grad_norm": 0.21785931289196014, + "learning_rate": 9.27027027027027e-05, + "loss": 0.3639, + "step": 4612 + }, + { + "epoch": 1.3828367997002062, + "grad_norm": 0.21439585089683533, + "learning_rate": 9.265765765765764e-05, + "loss": 0.3805, + "step": 4613 + }, + { + "epoch": 1.3831365935919055, + "grad_norm": 0.2247477024793625, + "learning_rate": 9.26126126126126e-05, + "loss": 0.3709, + "step": 4614 + }, + { + "epoch": 1.383436387483605, + "grad_norm": 0.22263139486312866, + "learning_rate": 9.256756756756757e-05, + "loss": 0.3801, + "step": 4615 + }, + { + "epoch": 1.3837361813753044, + "grad_norm": 0.22057713568210602, + "learning_rate": 9.25225225225225e-05, + "loss": 0.3709, + "step": 4616 + }, + { + "epoch": 1.384035975267004, + "grad_norm": 0.21248671412467957, + "learning_rate": 9.247747747747747e-05, + "loss": 0.3655, + "step": 4617 + }, + { + "epoch": 1.3843357691587035, + "grad_norm": 0.22102704644203186, + "learning_rate": 9.243243243243243e-05, + "loss": 0.3594, + "step": 4618 + }, + { + "epoch": 1.3846355630504028, + "grad_norm": 0.21167469024658203, + "learning_rate": 9.238738738738737e-05, + "loss": 0.3515, + "step": 4619 + }, + { + "epoch": 1.3849353569421023, + "grad_norm": 0.225198894739151, + "learning_rate": 9.234234234234233e-05, + "loss": 0.3936, + "step": 4620 + }, + { + "epoch": 1.3852351508338017, + "grad_norm": 0.22111676633358002, + "learning_rate": 9.22972972972973e-05, + "loss": 0.3449, + "step": 4621 + }, + { + "epoch": 1.3855349447255012, + "grad_norm": 0.2069976031780243, + "learning_rate": 9.225225225225225e-05, + "loss": 0.3571, + "step": 4622 + }, + { + "epoch": 1.3858347386172007, + "grad_norm": 0.21398650109767914, + "learning_rate": 9.22072072072072e-05, + "loss": 0.3696, + "step": 4623 + }, + { + "epoch": 1.3861345325089, + "grad_norm": 0.1977337896823883, + "learning_rate": 9.216216216216216e-05, + "loss": 0.3493, + "step": 4624 + }, + { + "epoch": 1.3864343264005996, + "grad_norm": 0.21368259191513062, + "learning_rate": 9.211711711711711e-05, + "loss": 0.3657, + "step": 4625 + }, + { + "epoch": 1.386734120292299, + "grad_norm": 0.2091464400291443, + "learning_rate": 9.207207207207206e-05, + "loss": 0.3497, + "step": 4626 + }, + { + "epoch": 1.3870339141839985, + "grad_norm": 0.1985793113708496, + "learning_rate": 9.202702702702701e-05, + "loss": 0.3193, + "step": 4627 + }, + { + "epoch": 1.387333708075698, + "grad_norm": 0.2133084088563919, + "learning_rate": 9.198198198198197e-05, + "loss": 0.3502, + "step": 4628 + }, + { + "epoch": 1.3876335019673975, + "grad_norm": 0.2148822396993637, + "learning_rate": 9.193693693693694e-05, + "loss": 0.3713, + "step": 4629 + }, + { + "epoch": 1.3879332958590969, + "grad_norm": 0.20007802546024323, + "learning_rate": 9.189189189189188e-05, + "loss": 0.3348, + "step": 4630 + }, + { + "epoch": 1.3882330897507964, + "grad_norm": 0.21505482494831085, + "learning_rate": 9.184684684684684e-05, + "loss": 0.3381, + "step": 4631 + }, + { + "epoch": 1.3885328836424957, + "grad_norm": 0.21337425708770752, + "learning_rate": 9.18018018018018e-05, + "loss": 0.3535, + "step": 4632 + }, + { + "epoch": 1.3888326775341953, + "grad_norm": 0.2174317091703415, + "learning_rate": 9.175675675675674e-05, + "loss": 0.3692, + "step": 4633 + }, + { + "epoch": 1.3891324714258948, + "grad_norm": 0.2055487334728241, + "learning_rate": 9.17117117117117e-05, + "loss": 0.3505, + "step": 4634 + }, + { + "epoch": 1.3894322653175941, + "grad_norm": 0.21364519000053406, + "learning_rate": 9.166666666666667e-05, + "loss": 0.3436, + "step": 4635 + }, + { + "epoch": 1.3897320592092937, + "grad_norm": 0.2120402306318283, + "learning_rate": 9.16216216216216e-05, + "loss": 0.361, + "step": 4636 + }, + { + "epoch": 1.390031853100993, + "grad_norm": 0.210201233625412, + "learning_rate": 9.157657657657657e-05, + "loss": 0.3496, + "step": 4637 + }, + { + "epoch": 1.3903316469926925, + "grad_norm": 0.22710666060447693, + "learning_rate": 9.153153153153152e-05, + "loss": 0.3787, + "step": 4638 + }, + { + "epoch": 1.390631440884392, + "grad_norm": 0.21441099047660828, + "learning_rate": 9.148648648648648e-05, + "loss": 0.3516, + "step": 4639 + }, + { + "epoch": 1.3909312347760914, + "grad_norm": 0.21323607861995697, + "learning_rate": 9.144144144144143e-05, + "loss": 0.3751, + "step": 4640 + }, + { + "epoch": 1.391231028667791, + "grad_norm": 0.21749518811702728, + "learning_rate": 9.139639639639638e-05, + "loss": 0.3544, + "step": 4641 + }, + { + "epoch": 1.3915308225594902, + "grad_norm": 0.2149432897567749, + "learning_rate": 9.135135135135135e-05, + "loss": 0.3548, + "step": 4642 + }, + { + "epoch": 1.3918306164511898, + "grad_norm": 0.2081831991672516, + "learning_rate": 9.13063063063063e-05, + "loss": 0.3496, + "step": 4643 + }, + { + "epoch": 1.3921304103428893, + "grad_norm": 0.19373933970928192, + "learning_rate": 9.126126126126125e-05, + "loss": 0.3397, + "step": 4644 + }, + { + "epoch": 1.3924302042345889, + "grad_norm": 0.2193603813648224, + "learning_rate": 9.121621621621621e-05, + "loss": 0.3663, + "step": 4645 + }, + { + "epoch": 1.3927299981262882, + "grad_norm": 0.2071419656276703, + "learning_rate": 9.117117117117118e-05, + "loss": 0.3459, + "step": 4646 + }, + { + "epoch": 1.3930297920179877, + "grad_norm": 0.21072034537792206, + "learning_rate": 9.112612612612611e-05, + "loss": 0.3491, + "step": 4647 + }, + { + "epoch": 1.393329585909687, + "grad_norm": 0.21739129722118378, + "learning_rate": 9.108108108108108e-05, + "loss": 0.358, + "step": 4648 + }, + { + "epoch": 1.3936293798013866, + "grad_norm": 0.21196670830249786, + "learning_rate": 9.103603603603604e-05, + "loss": 0.3595, + "step": 4649 + }, + { + "epoch": 1.393929173693086, + "grad_norm": 0.20686453580856323, + "learning_rate": 9.099099099099098e-05, + "loss": 0.3514, + "step": 4650 + }, + { + "epoch": 1.3942289675847854, + "grad_norm": 0.21052870154380798, + "learning_rate": 9.094594594594594e-05, + "loss": 0.3654, + "step": 4651 + }, + { + "epoch": 1.394528761476485, + "grad_norm": 0.21836452186107635, + "learning_rate": 9.090090090090089e-05, + "loss": 0.3569, + "step": 4652 + }, + { + "epoch": 1.3948285553681843, + "grad_norm": 0.21096113324165344, + "learning_rate": 9.085585585585584e-05, + "loss": 0.3436, + "step": 4653 + }, + { + "epoch": 1.3951283492598838, + "grad_norm": 0.20840652287006378, + "learning_rate": 9.08108108108108e-05, + "loss": 0.3413, + "step": 4654 + }, + { + "epoch": 1.3954281431515834, + "grad_norm": 0.2149275839328766, + "learning_rate": 9.076576576576576e-05, + "loss": 0.3708, + "step": 4655 + }, + { + "epoch": 1.3957279370432827, + "grad_norm": 0.2034139484167099, + "learning_rate": 9.072072072072072e-05, + "loss": 0.3678, + "step": 4656 + }, + { + "epoch": 1.3960277309349822, + "grad_norm": 0.21449275314807892, + "learning_rate": 9.067567567567567e-05, + "loss": 0.3314, + "step": 4657 + }, + { + "epoch": 1.3963275248266815, + "grad_norm": 0.22322918474674225, + "learning_rate": 9.063063063063062e-05, + "loss": 0.3792, + "step": 4658 + }, + { + "epoch": 1.396627318718381, + "grad_norm": 0.1963275969028473, + "learning_rate": 9.058558558558558e-05, + "loss": 0.3371, + "step": 4659 + }, + { + "epoch": 1.3969271126100806, + "grad_norm": 0.2096429467201233, + "learning_rate": 9.054054054054053e-05, + "loss": 0.3615, + "step": 4660 + }, + { + "epoch": 1.3972269065017802, + "grad_norm": 0.2007439285516739, + "learning_rate": 9.049549549549548e-05, + "loss": 0.3225, + "step": 4661 + }, + { + "epoch": 1.3975267003934795, + "grad_norm": 0.22279343008995056, + "learning_rate": 9.045045045045045e-05, + "loss": 0.3572, + "step": 4662 + }, + { + "epoch": 1.397826494285179, + "grad_norm": 0.217242032289505, + "learning_rate": 9.040540540540539e-05, + "loss": 0.3578, + "step": 4663 + }, + { + "epoch": 1.3981262881768783, + "grad_norm": 0.2023712396621704, + "learning_rate": 9.036036036036035e-05, + "loss": 0.3249, + "step": 4664 + }, + { + "epoch": 1.3984260820685779, + "grad_norm": 0.20755234360694885, + "learning_rate": 9.031531531531531e-05, + "loss": 0.3523, + "step": 4665 + }, + { + "epoch": 1.3987258759602774, + "grad_norm": 0.20431917905807495, + "learning_rate": 9.027027027027025e-05, + "loss": 0.3305, + "step": 4666 + }, + { + "epoch": 1.3990256698519767, + "grad_norm": 0.21976415812969208, + "learning_rate": 9.022522522522521e-05, + "loss": 0.3601, + "step": 4667 + }, + { + "epoch": 1.3993254637436763, + "grad_norm": 0.21623222529888153, + "learning_rate": 9.018018018018018e-05, + "loss": 0.3684, + "step": 4668 + }, + { + "epoch": 1.3996252576353756, + "grad_norm": 0.20926150679588318, + "learning_rate": 9.013513513513513e-05, + "loss": 0.3714, + "step": 4669 + }, + { + "epoch": 1.3999250515270751, + "grad_norm": 0.22803552448749542, + "learning_rate": 9.009009009009008e-05, + "loss": 0.3691, + "step": 4670 + }, + { + "epoch": 1.4002248454187747, + "grad_norm": 0.21862058341503143, + "learning_rate": 9.004504504504504e-05, + "loss": 0.3746, + "step": 4671 + }, + { + "epoch": 1.400524639310474, + "grad_norm": 0.22191566228866577, + "learning_rate": 8.999999999999999e-05, + "loss": 0.3745, + "step": 4672 + }, + { + "epoch": 1.4008244332021735, + "grad_norm": 0.21257881820201874, + "learning_rate": 8.995495495495496e-05, + "loss": 0.3724, + "step": 4673 + }, + { + "epoch": 1.4011242270938729, + "grad_norm": 0.20726177096366882, + "learning_rate": 8.990990990990989e-05, + "loss": 0.363, + "step": 4674 + }, + { + "epoch": 1.4014240209855724, + "grad_norm": 0.21418994665145874, + "learning_rate": 8.986486486486486e-05, + "loss": 0.3771, + "step": 4675 + }, + { + "epoch": 1.401723814877272, + "grad_norm": 0.2140100598335266, + "learning_rate": 8.981981981981982e-05, + "loss": 0.3518, + "step": 4676 + }, + { + "epoch": 1.4020236087689713, + "grad_norm": 0.20784786343574524, + "learning_rate": 8.977477477477476e-05, + "loss": 0.3565, + "step": 4677 + }, + { + "epoch": 1.4023234026606708, + "grad_norm": 0.2127532958984375, + "learning_rate": 8.972972972972972e-05, + "loss": 0.3675, + "step": 4678 + }, + { + "epoch": 1.40262319655237, + "grad_norm": 0.1980638951063156, + "learning_rate": 8.968468468468469e-05, + "loss": 0.3402, + "step": 4679 + }, + { + "epoch": 1.4029229904440697, + "grad_norm": 0.21542289853096008, + "learning_rate": 8.963963963963962e-05, + "loss": 0.3871, + "step": 4680 + }, + { + "epoch": 1.4032227843357692, + "grad_norm": 0.2163427472114563, + "learning_rate": 8.959459459459459e-05, + "loss": 0.3631, + "step": 4681 + }, + { + "epoch": 1.4035225782274687, + "grad_norm": 0.2084680050611496, + "learning_rate": 8.954954954954955e-05, + "loss": 0.3426, + "step": 4682 + }, + { + "epoch": 1.403822372119168, + "grad_norm": 0.2164963334798813, + "learning_rate": 8.950450450450449e-05, + "loss": 0.372, + "step": 4683 + }, + { + "epoch": 1.4041221660108676, + "grad_norm": 0.20649103820323944, + "learning_rate": 8.945945945945945e-05, + "loss": 0.3465, + "step": 4684 + }, + { + "epoch": 1.404421959902567, + "grad_norm": 0.21171538531780243, + "learning_rate": 8.941441441441441e-05, + "loss": 0.3644, + "step": 4685 + }, + { + "epoch": 1.4047217537942664, + "grad_norm": 0.2132863998413086, + "learning_rate": 8.936936936936936e-05, + "loss": 0.3751, + "step": 4686 + }, + { + "epoch": 1.405021547685966, + "grad_norm": 0.2170388251543045, + "learning_rate": 8.932432432432431e-05, + "loss": 0.3664, + "step": 4687 + }, + { + "epoch": 1.4053213415776653, + "grad_norm": 0.21167437732219696, + "learning_rate": 8.927927927927926e-05, + "loss": 0.36, + "step": 4688 + }, + { + "epoch": 1.4056211354693648, + "grad_norm": 0.21021796762943268, + "learning_rate": 8.923423423423423e-05, + "loss": 0.3773, + "step": 4689 + }, + { + "epoch": 1.4059209293610642, + "grad_norm": 0.21205608546733856, + "learning_rate": 8.918918918918919e-05, + "loss": 0.3418, + "step": 4690 + }, + { + "epoch": 1.4062207232527637, + "grad_norm": 0.20514236390590668, + "learning_rate": 8.914414414414413e-05, + "loss": 0.3519, + "step": 4691 + }, + { + "epoch": 1.4065205171444632, + "grad_norm": 0.21129223704338074, + "learning_rate": 8.90990990990991e-05, + "loss": 0.3532, + "step": 4692 + }, + { + "epoch": 1.4068203110361626, + "grad_norm": 0.2106056809425354, + "learning_rate": 8.905405405405406e-05, + "loss": 0.3514, + "step": 4693 + }, + { + "epoch": 1.407120104927862, + "grad_norm": 0.21528342366218567, + "learning_rate": 8.9009009009009e-05, + "loss": 0.3759, + "step": 4694 + }, + { + "epoch": 1.4074198988195614, + "grad_norm": 0.21354471147060394, + "learning_rate": 8.896396396396396e-05, + "loss": 0.372, + "step": 4695 + }, + { + "epoch": 1.407719692711261, + "grad_norm": 0.2052365392446518, + "learning_rate": 8.891891891891892e-05, + "loss": 0.3421, + "step": 4696 + }, + { + "epoch": 1.4080194866029605, + "grad_norm": 0.2149754762649536, + "learning_rate": 8.887387387387386e-05, + "loss": 0.3505, + "step": 4697 + }, + { + "epoch": 1.40831928049466, + "grad_norm": 0.20838052034378052, + "learning_rate": 8.882882882882882e-05, + "loss": 0.3604, + "step": 4698 + }, + { + "epoch": 1.4086190743863594, + "grad_norm": 0.21180404722690582, + "learning_rate": 8.878378378378377e-05, + "loss": 0.342, + "step": 4699 + }, + { + "epoch": 1.408918868278059, + "grad_norm": 0.2073618769645691, + "learning_rate": 8.873873873873872e-05, + "loss": 0.3694, + "step": 4700 + }, + { + "epoch": 1.4092186621697582, + "grad_norm": 0.20314088463783264, + "learning_rate": 8.869369369369369e-05, + "loss": 0.366, + "step": 4701 + }, + { + "epoch": 1.4095184560614578, + "grad_norm": 0.22556687891483307, + "learning_rate": 8.864864864864864e-05, + "loss": 0.3434, + "step": 4702 + }, + { + "epoch": 1.4098182499531573, + "grad_norm": 0.21841377019882202, + "learning_rate": 8.86036036036036e-05, + "loss": 0.3622, + "step": 4703 + }, + { + "epoch": 1.4101180438448566, + "grad_norm": 0.21003997325897217, + "learning_rate": 8.855855855855855e-05, + "loss": 0.3658, + "step": 4704 + }, + { + "epoch": 1.4104178377365562, + "grad_norm": 0.20329275727272034, + "learning_rate": 8.85135135135135e-05, + "loss": 0.3793, + "step": 4705 + }, + { + "epoch": 1.4107176316282555, + "grad_norm": 0.22247040271759033, + "learning_rate": 8.846846846846847e-05, + "loss": 0.3687, + "step": 4706 + }, + { + "epoch": 1.411017425519955, + "grad_norm": 0.21038077771663666, + "learning_rate": 8.842342342342343e-05, + "loss": 0.3435, + "step": 4707 + }, + { + "epoch": 1.4113172194116546, + "grad_norm": 0.21580614149570465, + "learning_rate": 8.837837837837837e-05, + "loss": 0.353, + "step": 4708 + }, + { + "epoch": 1.4116170133033539, + "grad_norm": 0.20951011776924133, + "learning_rate": 8.833333333333333e-05, + "loss": 0.3314, + "step": 4709 + }, + { + "epoch": 1.4119168071950534, + "grad_norm": 0.21354609727859497, + "learning_rate": 8.828828828828827e-05, + "loss": 0.3393, + "step": 4710 + }, + { + "epoch": 1.4122166010867527, + "grad_norm": 0.20141154527664185, + "learning_rate": 8.824324324324323e-05, + "loss": 0.3392, + "step": 4711 + }, + { + "epoch": 1.4125163949784523, + "grad_norm": 0.21682001650333405, + "learning_rate": 8.81981981981982e-05, + "loss": 0.3473, + "step": 4712 + }, + { + "epoch": 1.4128161888701518, + "grad_norm": 0.24055300652980804, + "learning_rate": 8.815315315315314e-05, + "loss": 0.3727, + "step": 4713 + }, + { + "epoch": 1.4131159827618514, + "grad_norm": 0.21340611577033997, + "learning_rate": 8.81081081081081e-05, + "loss": 0.3668, + "step": 4714 + }, + { + "epoch": 1.4134157766535507, + "grad_norm": 0.22251774370670319, + "learning_rate": 8.806306306306306e-05, + "loss": 0.3509, + "step": 4715 + }, + { + "epoch": 1.4137155705452502, + "grad_norm": 0.21125243604183197, + "learning_rate": 8.801801801801801e-05, + "loss": 0.3619, + "step": 4716 + }, + { + "epoch": 1.4140153644369495, + "grad_norm": 0.22132828831672668, + "learning_rate": 8.797297297297296e-05, + "loss": 0.3874, + "step": 4717 + }, + { + "epoch": 1.414315158328649, + "grad_norm": 0.21483568847179413, + "learning_rate": 8.792792792792792e-05, + "loss": 0.3462, + "step": 4718 + }, + { + "epoch": 1.4146149522203486, + "grad_norm": 0.2184942066669464, + "learning_rate": 8.788288288288287e-05, + "loss": 0.3579, + "step": 4719 + }, + { + "epoch": 1.414914746112048, + "grad_norm": 0.2186221331357956, + "learning_rate": 8.783783783783784e-05, + "loss": 0.3633, + "step": 4720 + }, + { + "epoch": 1.4152145400037475, + "grad_norm": 0.20850953459739685, + "learning_rate": 8.779279279279279e-05, + "loss": 0.3622, + "step": 4721 + }, + { + "epoch": 1.4155143338954468, + "grad_norm": 0.21589501202106476, + "learning_rate": 8.774774774774774e-05, + "loss": 0.3308, + "step": 4722 + }, + { + "epoch": 1.4158141277871463, + "grad_norm": 0.20245422422885895, + "learning_rate": 8.77027027027027e-05, + "loss": 0.3527, + "step": 4723 + }, + { + "epoch": 1.4161139216788459, + "grad_norm": 0.21961507201194763, + "learning_rate": 8.765765765765764e-05, + "loss": 0.3637, + "step": 4724 + }, + { + "epoch": 1.4164137155705452, + "grad_norm": 0.21779102087020874, + "learning_rate": 8.76126126126126e-05, + "loss": 0.3713, + "step": 4725 + }, + { + "epoch": 1.4167135094622447, + "grad_norm": 0.23682500422000885, + "learning_rate": 8.756756756756757e-05, + "loss": 0.3533, + "step": 4726 + }, + { + "epoch": 1.417013303353944, + "grad_norm": 0.20653976500034332, + "learning_rate": 8.75225225225225e-05, + "loss": 0.3365, + "step": 4727 + }, + { + "epoch": 1.4173130972456436, + "grad_norm": 0.2318013608455658, + "learning_rate": 8.747747747747747e-05, + "loss": 0.3705, + "step": 4728 + }, + { + "epoch": 1.4176128911373431, + "grad_norm": 0.2127133309841156, + "learning_rate": 8.743243243243243e-05, + "loss": 0.3496, + "step": 4729 + }, + { + "epoch": 1.4179126850290427, + "grad_norm": 0.2204805463552475, + "learning_rate": 8.738738738738738e-05, + "loss": 0.3477, + "step": 4730 + }, + { + "epoch": 1.418212478920742, + "grad_norm": 0.21495847404003143, + "learning_rate": 8.734234234234233e-05, + "loss": 0.3638, + "step": 4731 + }, + { + "epoch": 1.4185122728124415, + "grad_norm": 0.21796728670597076, + "learning_rate": 8.72972972972973e-05, + "loss": 0.3608, + "step": 4732 + }, + { + "epoch": 1.4188120667041408, + "grad_norm": 0.2098185420036316, + "learning_rate": 8.725225225225225e-05, + "loss": 0.3585, + "step": 4733 + }, + { + "epoch": 1.4191118605958404, + "grad_norm": 0.2242172807455063, + "learning_rate": 8.72072072072072e-05, + "loss": 0.3719, + "step": 4734 + }, + { + "epoch": 1.41941165448754, + "grad_norm": 0.2186833620071411, + "learning_rate": 8.716216216216215e-05, + "loss": 0.3833, + "step": 4735 + }, + { + "epoch": 1.4197114483792392, + "grad_norm": 0.1976839154958725, + "learning_rate": 8.711711711711711e-05, + "loss": 0.3142, + "step": 4736 + }, + { + "epoch": 1.4200112422709388, + "grad_norm": 0.19263608753681183, + "learning_rate": 8.707207207207207e-05, + "loss": 0.3205, + "step": 4737 + }, + { + "epoch": 1.420311036162638, + "grad_norm": 0.21386316418647766, + "learning_rate": 8.702702702702701e-05, + "loss": 0.344, + "step": 4738 + }, + { + "epoch": 1.4206108300543376, + "grad_norm": 0.20507782697677612, + "learning_rate": 8.698198198198198e-05, + "loss": 0.3434, + "step": 4739 + }, + { + "epoch": 1.4209106239460372, + "grad_norm": 0.20067331194877625, + "learning_rate": 8.693693693693694e-05, + "loss": 0.3435, + "step": 4740 + }, + { + "epoch": 1.4212104178377365, + "grad_norm": 0.20949102938175201, + "learning_rate": 8.689189189189188e-05, + "loss": 0.358, + "step": 4741 + }, + { + "epoch": 1.421510211729436, + "grad_norm": 0.21897202730178833, + "learning_rate": 8.684684684684684e-05, + "loss": 0.357, + "step": 4742 + }, + { + "epoch": 1.4218100056211354, + "grad_norm": 0.2232789844274521, + "learning_rate": 8.68018018018018e-05, + "loss": 0.3495, + "step": 4743 + }, + { + "epoch": 1.422109799512835, + "grad_norm": 0.20372246205806732, + "learning_rate": 8.675675675675674e-05, + "loss": 0.3285, + "step": 4744 + }, + { + "epoch": 1.4224095934045344, + "grad_norm": 0.21029895544052124, + "learning_rate": 8.67117117117117e-05, + "loss": 0.3475, + "step": 4745 + }, + { + "epoch": 1.422709387296234, + "grad_norm": 0.2134687602519989, + "learning_rate": 8.666666666666665e-05, + "loss": 0.3529, + "step": 4746 + }, + { + "epoch": 1.4230091811879333, + "grad_norm": 0.21589897572994232, + "learning_rate": 8.662162162162162e-05, + "loss": 0.3446, + "step": 4747 + }, + { + "epoch": 1.4233089750796328, + "grad_norm": 0.22031275928020477, + "learning_rate": 8.657657657657657e-05, + "loss": 0.3759, + "step": 4748 + }, + { + "epoch": 1.4236087689713322, + "grad_norm": 0.20217642188072205, + "learning_rate": 8.653153153153152e-05, + "loss": 0.3324, + "step": 4749 + }, + { + "epoch": 1.4239085628630317, + "grad_norm": 0.22420530021190643, + "learning_rate": 8.648648648648648e-05, + "loss": 0.3884, + "step": 4750 + }, + { + "epoch": 1.4242083567547312, + "grad_norm": 0.2160356044769287, + "learning_rate": 8.644144144144143e-05, + "loss": 0.3546, + "step": 4751 + }, + { + "epoch": 1.4245081506464305, + "grad_norm": 0.2071686089038849, + "learning_rate": 8.639639639639638e-05, + "loss": 0.3741, + "step": 4752 + }, + { + "epoch": 1.42480794453813, + "grad_norm": 0.22385691106319427, + "learning_rate": 8.635135135135135e-05, + "loss": 0.3441, + "step": 4753 + }, + { + "epoch": 1.4251077384298294, + "grad_norm": 0.20757465064525604, + "learning_rate": 8.630630630630631e-05, + "loss": 0.3466, + "step": 4754 + }, + { + "epoch": 1.425407532321529, + "grad_norm": 0.20419491827487946, + "learning_rate": 8.626126126126125e-05, + "loss": 0.3246, + "step": 4755 + }, + { + "epoch": 1.4257073262132285, + "grad_norm": 0.2099572867155075, + "learning_rate": 8.621621621621621e-05, + "loss": 0.3642, + "step": 4756 + }, + { + "epoch": 1.4260071201049278, + "grad_norm": 0.21940596401691437, + "learning_rate": 8.617117117117118e-05, + "loss": 0.3569, + "step": 4757 + }, + { + "epoch": 1.4263069139966273, + "grad_norm": 0.21012279391288757, + "learning_rate": 8.612612612612611e-05, + "loss": 0.3658, + "step": 4758 + }, + { + "epoch": 1.4266067078883267, + "grad_norm": 0.20855596661567688, + "learning_rate": 8.608108108108108e-05, + "loss": 0.3697, + "step": 4759 + }, + { + "epoch": 1.4269065017800262, + "grad_norm": 0.22132547199726105, + "learning_rate": 8.603603603603603e-05, + "loss": 0.348, + "step": 4760 + }, + { + "epoch": 1.4272062956717257, + "grad_norm": 0.2173125445842743, + "learning_rate": 8.599099099099098e-05, + "loss": 0.3516, + "step": 4761 + }, + { + "epoch": 1.427506089563425, + "grad_norm": 0.2090543806552887, + "learning_rate": 8.594594594594594e-05, + "loss": 0.3521, + "step": 4762 + }, + { + "epoch": 1.4278058834551246, + "grad_norm": 0.21249796450138092, + "learning_rate": 8.590090090090089e-05, + "loss": 0.3688, + "step": 4763 + }, + { + "epoch": 1.428105677346824, + "grad_norm": 0.22098830342292786, + "learning_rate": 8.585585585585586e-05, + "loss": 0.3585, + "step": 4764 + }, + { + "epoch": 1.4284054712385235, + "grad_norm": 0.21827562153339386, + "learning_rate": 8.58108108108108e-05, + "loss": 0.3416, + "step": 4765 + }, + { + "epoch": 1.428705265130223, + "grad_norm": 0.22327540814876556, + "learning_rate": 8.576576576576576e-05, + "loss": 0.3714, + "step": 4766 + }, + { + "epoch": 1.4290050590219225, + "grad_norm": 0.20923393964767456, + "learning_rate": 8.572072072072072e-05, + "loss": 0.3596, + "step": 4767 + }, + { + "epoch": 1.4293048529136219, + "grad_norm": 0.22230814397335052, + "learning_rate": 8.567567567567567e-05, + "loss": 0.362, + "step": 4768 + }, + { + "epoch": 1.4296046468053214, + "grad_norm": 0.2018887847661972, + "learning_rate": 8.563063063063062e-05, + "loss": 0.3394, + "step": 4769 + }, + { + "epoch": 1.4299044406970207, + "grad_norm": 0.22293516993522644, + "learning_rate": 8.558558558558558e-05, + "loss": 0.35, + "step": 4770 + }, + { + "epoch": 1.4302042345887203, + "grad_norm": 0.20164437592029572, + "learning_rate": 8.554054054054052e-05, + "loss": 0.3486, + "step": 4771 + }, + { + "epoch": 1.4305040284804198, + "grad_norm": 0.22350816428661346, + "learning_rate": 8.549549549549548e-05, + "loss": 0.3784, + "step": 4772 + }, + { + "epoch": 1.4308038223721191, + "grad_norm": 0.2070097029209137, + "learning_rate": 8.545045045045045e-05, + "loss": 0.3488, + "step": 4773 + }, + { + "epoch": 1.4311036162638187, + "grad_norm": 0.20798823237419128, + "learning_rate": 8.540540540540539e-05, + "loss": 0.3468, + "step": 4774 + }, + { + "epoch": 1.431403410155518, + "grad_norm": 0.21909308433532715, + "learning_rate": 8.536036036036035e-05, + "loss": 0.3654, + "step": 4775 + }, + { + "epoch": 1.4317032040472175, + "grad_norm": 0.1986744999885559, + "learning_rate": 8.531531531531531e-05, + "loss": 0.3392, + "step": 4776 + }, + { + "epoch": 1.432002997938917, + "grad_norm": 0.2093539983034134, + "learning_rate": 8.527027027027026e-05, + "loss": 0.3651, + "step": 4777 + }, + { + "epoch": 1.4323027918306164, + "grad_norm": 0.20547617971897125, + "learning_rate": 8.522522522522521e-05, + "loss": 0.3425, + "step": 4778 + }, + { + "epoch": 1.432602585722316, + "grad_norm": 0.21190671622753143, + "learning_rate": 8.518018018018018e-05, + "loss": 0.3336, + "step": 4779 + }, + { + "epoch": 1.4329023796140152, + "grad_norm": 0.20590968430042267, + "learning_rate": 8.513513513513513e-05, + "loss": 0.3588, + "step": 4780 + }, + { + "epoch": 1.4332021735057148, + "grad_norm": 0.20922647416591644, + "learning_rate": 8.509009009009008e-05, + "loss": 0.3222, + "step": 4781 + }, + { + "epoch": 1.4335019673974143, + "grad_norm": 0.209987074136734, + "learning_rate": 8.504504504504503e-05, + "loss": 0.3416, + "step": 4782 + }, + { + "epoch": 1.4338017612891139, + "grad_norm": 0.2170223444700241, + "learning_rate": 8.499999999999999e-05, + "loss": 0.3554, + "step": 4783 + }, + { + "epoch": 1.4341015551808132, + "grad_norm": 0.2070566564798355, + "learning_rate": 8.495495495495496e-05, + "loss": 0.3472, + "step": 4784 + }, + { + "epoch": 1.4344013490725127, + "grad_norm": 0.222892165184021, + "learning_rate": 8.490990990990989e-05, + "loss": 0.3625, + "step": 4785 + }, + { + "epoch": 1.434701142964212, + "grad_norm": 0.21506567299365997, + "learning_rate": 8.486486486486486e-05, + "loss": 0.3613, + "step": 4786 + }, + { + "epoch": 1.4350009368559116, + "grad_norm": 0.19889682531356812, + "learning_rate": 8.481981981981982e-05, + "loss": 0.3184, + "step": 4787 + }, + { + "epoch": 1.435300730747611, + "grad_norm": 0.2095138281583786, + "learning_rate": 8.477477477477476e-05, + "loss": 0.3507, + "step": 4788 + }, + { + "epoch": 1.4356005246393104, + "grad_norm": 0.21511593461036682, + "learning_rate": 8.472972972972972e-05, + "loss": 0.3577, + "step": 4789 + }, + { + "epoch": 1.43590031853101, + "grad_norm": 0.21792061626911163, + "learning_rate": 8.468468468468469e-05, + "loss": 0.3768, + "step": 4790 + }, + { + "epoch": 1.4362001124227093, + "grad_norm": 0.22104412317276, + "learning_rate": 8.463963963963962e-05, + "loss": 0.4004, + "step": 4791 + }, + { + "epoch": 1.4364999063144088, + "grad_norm": 0.21523074805736542, + "learning_rate": 8.459459459459459e-05, + "loss": 0.3608, + "step": 4792 + }, + { + "epoch": 1.4367997002061084, + "grad_norm": 0.22235362231731415, + "learning_rate": 8.454954954954955e-05, + "loss": 0.3556, + "step": 4793 + }, + { + "epoch": 1.4370994940978077, + "grad_norm": 0.20765119791030884, + "learning_rate": 8.45045045045045e-05, + "loss": 0.3514, + "step": 4794 + }, + { + "epoch": 1.4373992879895072, + "grad_norm": 0.2087615430355072, + "learning_rate": 8.445945945945945e-05, + "loss": 0.3556, + "step": 4795 + }, + { + "epoch": 1.4376990818812065, + "grad_norm": 0.20913684368133545, + "learning_rate": 8.44144144144144e-05, + "loss": 0.3271, + "step": 4796 + }, + { + "epoch": 1.437998875772906, + "grad_norm": 0.20327967405319214, + "learning_rate": 8.436936936936936e-05, + "loss": 0.3427, + "step": 4797 + }, + { + "epoch": 1.4382986696646056, + "grad_norm": 0.21335206925868988, + "learning_rate": 8.432432432432432e-05, + "loss": 0.3577, + "step": 4798 + }, + { + "epoch": 1.4385984635563052, + "grad_norm": 0.21867401897907257, + "learning_rate": 8.427927927927927e-05, + "loss": 0.3687, + "step": 4799 + }, + { + "epoch": 1.4388982574480045, + "grad_norm": 0.21505360305309296, + "learning_rate": 8.423423423423423e-05, + "loss": 0.3362, + "step": 4800 + }, + { + "epoch": 1.439198051339704, + "grad_norm": 0.20245307683944702, + "learning_rate": 8.418918918918919e-05, + "loss": 0.3432, + "step": 4801 + }, + { + "epoch": 1.4394978452314033, + "grad_norm": 0.20703735947608948, + "learning_rate": 8.414414414414413e-05, + "loss": 0.34, + "step": 4802 + }, + { + "epoch": 1.4397976391231029, + "grad_norm": 0.20130011439323425, + "learning_rate": 8.40990990990991e-05, + "loss": 0.3335, + "step": 4803 + }, + { + "epoch": 1.4400974330148024, + "grad_norm": 0.20115235447883606, + "learning_rate": 8.405405405405406e-05, + "loss": 0.3479, + "step": 4804 + }, + { + "epoch": 1.4403972269065017, + "grad_norm": 0.20624947547912598, + "learning_rate": 8.4009009009009e-05, + "loss": 0.3533, + "step": 4805 + }, + { + "epoch": 1.4406970207982013, + "grad_norm": 0.19826063513755798, + "learning_rate": 8.396396396396396e-05, + "loss": 0.3412, + "step": 4806 + }, + { + "epoch": 1.4409968146899006, + "grad_norm": 0.21055233478546143, + "learning_rate": 8.391891891891891e-05, + "loss": 0.3611, + "step": 4807 + }, + { + "epoch": 1.4412966085816001, + "grad_norm": 0.2094549536705017, + "learning_rate": 8.387387387387386e-05, + "loss": 0.3353, + "step": 4808 + }, + { + "epoch": 1.4415964024732997, + "grad_norm": 0.20850618183612823, + "learning_rate": 8.382882882882882e-05, + "loss": 0.3345, + "step": 4809 + }, + { + "epoch": 1.441896196364999, + "grad_norm": 0.2107923924922943, + "learning_rate": 8.378378378378377e-05, + "loss": 0.3531, + "step": 4810 + }, + { + "epoch": 1.4421959902566985, + "grad_norm": 0.21351805329322815, + "learning_rate": 8.373873873873874e-05, + "loss": 0.3561, + "step": 4811 + }, + { + "epoch": 1.4424957841483979, + "grad_norm": 0.21343296766281128, + "learning_rate": 8.369369369369369e-05, + "loss": 0.3648, + "step": 4812 + }, + { + "epoch": 1.4427955780400974, + "grad_norm": 0.20149439573287964, + "learning_rate": 8.364864864864864e-05, + "loss": 0.327, + "step": 4813 + }, + { + "epoch": 1.443095371931797, + "grad_norm": 0.20820508897304535, + "learning_rate": 8.36036036036036e-05, + "loss": 0.3756, + "step": 4814 + }, + { + "epoch": 1.4433951658234965, + "grad_norm": 0.21000804007053375, + "learning_rate": 8.355855855855855e-05, + "loss": 0.3539, + "step": 4815 + }, + { + "epoch": 1.4436949597151958, + "grad_norm": 0.21678487956523895, + "learning_rate": 8.35135135135135e-05, + "loss": 0.3662, + "step": 4816 + }, + { + "epoch": 1.4439947536068953, + "grad_norm": 0.2124273180961609, + "learning_rate": 8.346846846846847e-05, + "loss": 0.3702, + "step": 4817 + }, + { + "epoch": 1.4442945474985946, + "grad_norm": 0.2035249024629593, + "learning_rate": 8.34234234234234e-05, + "loss": 0.3401, + "step": 4818 + }, + { + "epoch": 1.4445943413902942, + "grad_norm": 0.2147194743156433, + "learning_rate": 8.337837837837837e-05, + "loss": 0.3565, + "step": 4819 + }, + { + "epoch": 1.4448941352819937, + "grad_norm": 0.20793719589710236, + "learning_rate": 8.333333333333333e-05, + "loss": 0.3507, + "step": 4820 + }, + { + "epoch": 1.445193929173693, + "grad_norm": 0.20299087464809418, + "learning_rate": 8.328828828828827e-05, + "loss": 0.3528, + "step": 4821 + }, + { + "epoch": 1.4454937230653926, + "grad_norm": 0.20743577182292938, + "learning_rate": 8.324324324324323e-05, + "loss": 0.3585, + "step": 4822 + }, + { + "epoch": 1.445793516957092, + "grad_norm": 0.21002289652824402, + "learning_rate": 8.31981981981982e-05, + "loss": 0.3496, + "step": 4823 + }, + { + "epoch": 1.4460933108487914, + "grad_norm": 0.2117779701948166, + "learning_rate": 8.315315315315315e-05, + "loss": 0.3683, + "step": 4824 + }, + { + "epoch": 1.446393104740491, + "grad_norm": 0.21782147884368896, + "learning_rate": 8.31081081081081e-05, + "loss": 0.3545, + "step": 4825 + }, + { + "epoch": 1.4466928986321903, + "grad_norm": 0.20695964992046356, + "learning_rate": 8.306306306306306e-05, + "loss": 0.3553, + "step": 4826 + }, + { + "epoch": 1.4469926925238898, + "grad_norm": 0.19961470365524292, + "learning_rate": 8.301801801801801e-05, + "loss": 0.3447, + "step": 4827 + }, + { + "epoch": 1.4472924864155892, + "grad_norm": 0.2148684710264206, + "learning_rate": 8.297297297297297e-05, + "loss": 0.3538, + "step": 4828 + }, + { + "epoch": 1.4475922803072887, + "grad_norm": 0.21752296388149261, + "learning_rate": 8.292792792792792e-05, + "loss": 0.3677, + "step": 4829 + }, + { + "epoch": 1.4478920741989882, + "grad_norm": 0.20284780859947205, + "learning_rate": 8.288288288288287e-05, + "loss": 0.3581, + "step": 4830 + }, + { + "epoch": 1.4481918680906878, + "grad_norm": 0.21310214698314667, + "learning_rate": 8.283783783783784e-05, + "loss": 0.3538, + "step": 4831 + }, + { + "epoch": 1.448491661982387, + "grad_norm": 0.20693115890026093, + "learning_rate": 8.279279279279278e-05, + "loss": 0.3669, + "step": 4832 + }, + { + "epoch": 1.4487914558740866, + "grad_norm": 0.2130792886018753, + "learning_rate": 8.274774774774774e-05, + "loss": 0.3514, + "step": 4833 + }, + { + "epoch": 1.449091249765786, + "grad_norm": 0.21330896019935608, + "learning_rate": 8.27027027027027e-05, + "loss": 0.3523, + "step": 4834 + }, + { + "epoch": 1.4493910436574855, + "grad_norm": 0.20298290252685547, + "learning_rate": 8.265765765765764e-05, + "loss": 0.3388, + "step": 4835 + }, + { + "epoch": 1.449690837549185, + "grad_norm": 0.20353178679943085, + "learning_rate": 8.26126126126126e-05, + "loss": 0.3461, + "step": 4836 + }, + { + "epoch": 1.4499906314408844, + "grad_norm": 0.2044645994901657, + "learning_rate": 8.256756756756757e-05, + "loss": 0.3302, + "step": 4837 + }, + { + "epoch": 1.450290425332584, + "grad_norm": 0.2172108143568039, + "learning_rate": 8.25225225225225e-05, + "loss": 0.3573, + "step": 4838 + }, + { + "epoch": 1.4505902192242832, + "grad_norm": 0.20277266204357147, + "learning_rate": 8.247747747747747e-05, + "loss": 0.345, + "step": 4839 + }, + { + "epoch": 1.4508900131159828, + "grad_norm": 0.22137466073036194, + "learning_rate": 8.243243243243243e-05, + "loss": 0.3754, + "step": 4840 + }, + { + "epoch": 1.4511898070076823, + "grad_norm": 0.20912005007266998, + "learning_rate": 8.238738738738738e-05, + "loss": 0.353, + "step": 4841 + }, + { + "epoch": 1.4514896008993816, + "grad_norm": 0.1990968883037567, + "learning_rate": 8.234234234234233e-05, + "loss": 0.3336, + "step": 4842 + }, + { + "epoch": 1.4517893947910812, + "grad_norm": 0.21815185248851776, + "learning_rate": 8.229729729729728e-05, + "loss": 0.3884, + "step": 4843 + }, + { + "epoch": 1.4520891886827805, + "grad_norm": 0.21321304142475128, + "learning_rate": 8.225225225225225e-05, + "loss": 0.3624, + "step": 4844 + }, + { + "epoch": 1.45238898257448, + "grad_norm": 0.20276996493339539, + "learning_rate": 8.220720720720721e-05, + "loss": 0.342, + "step": 4845 + }, + { + "epoch": 1.4526887764661796, + "grad_norm": 0.19751514494419098, + "learning_rate": 8.216216216216215e-05, + "loss": 0.3381, + "step": 4846 + }, + { + "epoch": 1.452988570357879, + "grad_norm": 0.2045748233795166, + "learning_rate": 8.211711711711711e-05, + "loss": 0.3579, + "step": 4847 + }, + { + "epoch": 1.4532883642495784, + "grad_norm": 0.2179010659456253, + "learning_rate": 8.207207207207208e-05, + "loss": 0.3746, + "step": 4848 + }, + { + "epoch": 1.453588158141278, + "grad_norm": 0.20252852141857147, + "learning_rate": 8.202702702702701e-05, + "loss": 0.3512, + "step": 4849 + }, + { + "epoch": 1.4538879520329773, + "grad_norm": 0.2126975953578949, + "learning_rate": 8.198198198198198e-05, + "loss": 0.3473, + "step": 4850 + }, + { + "epoch": 1.4541877459246768, + "grad_norm": 0.2115095853805542, + "learning_rate": 8.193693693693694e-05, + "loss": 0.376, + "step": 4851 + }, + { + "epoch": 1.4544875398163764, + "grad_norm": 0.21630194783210754, + "learning_rate": 8.189189189189188e-05, + "loss": 0.3726, + "step": 4852 + }, + { + "epoch": 1.4547873337080757, + "grad_norm": 0.21030595898628235, + "learning_rate": 8.184684684684684e-05, + "loss": 0.363, + "step": 4853 + }, + { + "epoch": 1.4550871275997752, + "grad_norm": 0.20921146869659424, + "learning_rate": 8.18018018018018e-05, + "loss": 0.3435, + "step": 4854 + }, + { + "epoch": 1.4553869214914745, + "grad_norm": 0.22109493613243103, + "learning_rate": 8.175675675675674e-05, + "loss": 0.3523, + "step": 4855 + }, + { + "epoch": 1.455686715383174, + "grad_norm": 0.203207865357399, + "learning_rate": 8.17117117117117e-05, + "loss": 0.3276, + "step": 4856 + }, + { + "epoch": 1.4559865092748736, + "grad_norm": 0.21486714482307434, + "learning_rate": 8.166666666666665e-05, + "loss": 0.3696, + "step": 4857 + }, + { + "epoch": 1.456286303166573, + "grad_norm": 0.20264849066734314, + "learning_rate": 8.162162162162162e-05, + "loss": 0.3339, + "step": 4858 + }, + { + "epoch": 1.4565860970582725, + "grad_norm": 0.20054593682289124, + "learning_rate": 8.157657657657657e-05, + "loss": 0.3353, + "step": 4859 + }, + { + "epoch": 1.4568858909499718, + "grad_norm": 0.20756486058235168, + "learning_rate": 8.153153153153152e-05, + "loss": 0.3571, + "step": 4860 + }, + { + "epoch": 1.4571856848416713, + "grad_norm": 0.21069984138011932, + "learning_rate": 8.148648648648648e-05, + "loss": 0.3495, + "step": 4861 + }, + { + "epoch": 1.4574854787333709, + "grad_norm": 0.21038687229156494, + "learning_rate": 8.144144144144145e-05, + "loss": 0.3639, + "step": 4862 + }, + { + "epoch": 1.4577852726250702, + "grad_norm": 0.21815302968025208, + "learning_rate": 8.139639639639638e-05, + "loss": 0.3768, + "step": 4863 + }, + { + "epoch": 1.4580850665167697, + "grad_norm": 0.20300470292568207, + "learning_rate": 8.135135135135135e-05, + "loss": 0.349, + "step": 4864 + }, + { + "epoch": 1.458384860408469, + "grad_norm": 0.21326471865177155, + "learning_rate": 8.130630630630631e-05, + "loss": 0.364, + "step": 4865 + }, + { + "epoch": 1.4586846543001686, + "grad_norm": 0.2107071727514267, + "learning_rate": 8.126126126126125e-05, + "loss": 0.3701, + "step": 4866 + }, + { + "epoch": 1.4589844481918681, + "grad_norm": 0.21025468409061432, + "learning_rate": 8.121621621621621e-05, + "loss": 0.3736, + "step": 4867 + }, + { + "epoch": 1.4592842420835677, + "grad_norm": 0.2088591605424881, + "learning_rate": 8.117117117117116e-05, + "loss": 0.3498, + "step": 4868 + }, + { + "epoch": 1.459584035975267, + "grad_norm": 0.22014079988002777, + "learning_rate": 8.112612612612611e-05, + "loss": 0.3635, + "step": 4869 + }, + { + "epoch": 1.4598838298669665, + "grad_norm": 0.20154501497745514, + "learning_rate": 8.108108108108108e-05, + "loss": 0.3429, + "step": 4870 + }, + { + "epoch": 1.4601836237586658, + "grad_norm": 0.2102486938238144, + "learning_rate": 8.103603603603603e-05, + "loss": 0.3574, + "step": 4871 + }, + { + "epoch": 1.4604834176503654, + "grad_norm": 0.2018607258796692, + "learning_rate": 8.099099099099098e-05, + "loss": 0.3337, + "step": 4872 + }, + { + "epoch": 1.460783211542065, + "grad_norm": 0.20702728629112244, + "learning_rate": 8.094594594594594e-05, + "loss": 0.3473, + "step": 4873 + }, + { + "epoch": 1.4610830054337642, + "grad_norm": 0.20739267766475677, + "learning_rate": 8.090090090090089e-05, + "loss": 0.3506, + "step": 4874 + }, + { + "epoch": 1.4613827993254638, + "grad_norm": 0.20889143645763397, + "learning_rate": 8.085585585585586e-05, + "loss": 0.3711, + "step": 4875 + }, + { + "epoch": 1.461682593217163, + "grad_norm": 0.20718500018119812, + "learning_rate": 8.08108108108108e-05, + "loss": 0.3618, + "step": 4876 + }, + { + "epoch": 1.4619823871088626, + "grad_norm": 0.22436094284057617, + "learning_rate": 8.076576576576576e-05, + "loss": 0.3752, + "step": 4877 + }, + { + "epoch": 1.4622821810005622, + "grad_norm": 0.21661464869976044, + "learning_rate": 8.072072072072072e-05, + "loss": 0.3945, + "step": 4878 + }, + { + "epoch": 1.4625819748922615, + "grad_norm": 0.21557655930519104, + "learning_rate": 8.067567567567566e-05, + "loss": 0.33, + "step": 4879 + }, + { + "epoch": 1.462881768783961, + "grad_norm": 0.20167309045791626, + "learning_rate": 8.063063063063062e-05, + "loss": 0.3333, + "step": 4880 + }, + { + "epoch": 1.4631815626756604, + "grad_norm": 0.20474180579185486, + "learning_rate": 8.058558558558558e-05, + "loss": 0.3634, + "step": 4881 + }, + { + "epoch": 1.46348135656736, + "grad_norm": 0.2092469483613968, + "learning_rate": 8.054054054054052e-05, + "loss": 0.3638, + "step": 4882 + }, + { + "epoch": 1.4637811504590594, + "grad_norm": 0.21168501675128937, + "learning_rate": 8.049549549549549e-05, + "loss": 0.3653, + "step": 4883 + }, + { + "epoch": 1.464080944350759, + "grad_norm": 0.2195328325033188, + "learning_rate": 8.045045045045045e-05, + "loss": 0.3888, + "step": 4884 + }, + { + "epoch": 1.4643807382424583, + "grad_norm": 0.2088628113269806, + "learning_rate": 8.04054054054054e-05, + "loss": 0.3611, + "step": 4885 + }, + { + "epoch": 1.4646805321341578, + "grad_norm": 0.20401327311992645, + "learning_rate": 8.036036036036035e-05, + "loss": 0.3487, + "step": 4886 + }, + { + "epoch": 1.4649803260258571, + "grad_norm": 0.2168956845998764, + "learning_rate": 8.031531531531531e-05, + "loss": 0.3709, + "step": 4887 + }, + { + "epoch": 1.4652801199175567, + "grad_norm": 0.199428528547287, + "learning_rate": 8.027027027027026e-05, + "loss": 0.3379, + "step": 4888 + }, + { + "epoch": 1.4655799138092562, + "grad_norm": 0.20674806833267212, + "learning_rate": 8.022522522522521e-05, + "loss": 0.3648, + "step": 4889 + }, + { + "epoch": 1.4658797077009555, + "grad_norm": 0.1945922076702118, + "learning_rate": 8.018018018018018e-05, + "loss": 0.3477, + "step": 4890 + }, + { + "epoch": 1.466179501592655, + "grad_norm": 0.2134675234556198, + "learning_rate": 8.013513513513513e-05, + "loss": 0.3796, + "step": 4891 + }, + { + "epoch": 1.4664792954843544, + "grad_norm": 0.2022518515586853, + "learning_rate": 8.009009009009009e-05, + "loss": 0.3458, + "step": 4892 + }, + { + "epoch": 1.466779089376054, + "grad_norm": 0.21644069254398346, + "learning_rate": 8.004504504504503e-05, + "loss": 0.3625, + "step": 4893 + }, + { + "epoch": 1.4670788832677535, + "grad_norm": 0.21158376336097717, + "learning_rate": 7.999999999999999e-05, + "loss": 0.3616, + "step": 4894 + }, + { + "epoch": 1.4673786771594528, + "grad_norm": 0.21450527012348175, + "learning_rate": 7.995495495495496e-05, + "loss": 0.3662, + "step": 4895 + }, + { + "epoch": 1.4676784710511523, + "grad_norm": 0.21180300414562225, + "learning_rate": 7.99099099099099e-05, + "loss": 0.3544, + "step": 4896 + }, + { + "epoch": 1.4679782649428517, + "grad_norm": 0.20244859158992767, + "learning_rate": 7.986486486486486e-05, + "loss": 0.3475, + "step": 4897 + }, + { + "epoch": 1.4682780588345512, + "grad_norm": 0.21008580923080444, + "learning_rate": 7.981981981981982e-05, + "loss": 0.3545, + "step": 4898 + }, + { + "epoch": 1.4685778527262507, + "grad_norm": 0.21382389962673187, + "learning_rate": 7.977477477477476e-05, + "loss": 0.3796, + "step": 4899 + }, + { + "epoch": 1.4688776466179503, + "grad_norm": 0.21943014860153198, + "learning_rate": 7.972972972972972e-05, + "loss": 0.3769, + "step": 4900 + }, + { + "epoch": 1.4691774405096496, + "grad_norm": 0.20606811344623566, + "learning_rate": 7.968468468468469e-05, + "loss": 0.3511, + "step": 4901 + }, + { + "epoch": 1.4694772344013491, + "grad_norm": 0.2154122292995453, + "learning_rate": 7.963963963963964e-05, + "loss": 0.367, + "step": 4902 + }, + { + "epoch": 1.4697770282930485, + "grad_norm": 0.21306243538856506, + "learning_rate": 7.959459459459459e-05, + "loss": 0.3667, + "step": 4903 + }, + { + "epoch": 1.470076822184748, + "grad_norm": 0.20610538125038147, + "learning_rate": 7.954954954954954e-05, + "loss": 0.3608, + "step": 4904 + }, + { + "epoch": 1.4703766160764475, + "grad_norm": 0.20182529091835022, + "learning_rate": 7.95045045045045e-05, + "loss": 0.3404, + "step": 4905 + }, + { + "epoch": 1.4706764099681469, + "grad_norm": 0.22032389044761658, + "learning_rate": 7.945945945945945e-05, + "loss": 0.371, + "step": 4906 + }, + { + "epoch": 1.4709762038598464, + "grad_norm": 0.22178107500076294, + "learning_rate": 7.94144144144144e-05, + "loss": 0.348, + "step": 4907 + }, + { + "epoch": 1.4712759977515457, + "grad_norm": 0.2226249873638153, + "learning_rate": 7.936936936936937e-05, + "loss": 0.3342, + "step": 4908 + }, + { + "epoch": 1.4715757916432453, + "grad_norm": 0.21837739646434784, + "learning_rate": 7.932432432432433e-05, + "loss": 0.3316, + "step": 4909 + }, + { + "epoch": 1.4718755855349448, + "grad_norm": 0.2183731645345688, + "learning_rate": 7.927927927927927e-05, + "loss": 0.3615, + "step": 4910 + }, + { + "epoch": 1.4721753794266441, + "grad_norm": 0.22412636876106262, + "learning_rate": 7.923423423423423e-05, + "loss": 0.3466, + "step": 4911 + }, + { + "epoch": 1.4724751733183437, + "grad_norm": 0.21817882359027863, + "learning_rate": 7.91891891891892e-05, + "loss": 0.3699, + "step": 4912 + }, + { + "epoch": 1.472774967210043, + "grad_norm": 0.2297516018152237, + "learning_rate": 7.914414414414413e-05, + "loss": 0.3617, + "step": 4913 + }, + { + "epoch": 1.4730747611017425, + "grad_norm": 0.21283744275569916, + "learning_rate": 7.90990990990991e-05, + "loss": 0.3578, + "step": 4914 + }, + { + "epoch": 1.473374554993442, + "grad_norm": 0.21200676262378693, + "learning_rate": 7.905405405405404e-05, + "loss": 0.3389, + "step": 4915 + }, + { + "epoch": 1.4736743488851416, + "grad_norm": 0.22450967133045197, + "learning_rate": 7.9009009009009e-05, + "loss": 0.352, + "step": 4916 + }, + { + "epoch": 1.473974142776841, + "grad_norm": 0.23047909140586853, + "learning_rate": 7.896396396396396e-05, + "loss": 0.371, + "step": 4917 + }, + { + "epoch": 1.4742739366685405, + "grad_norm": 0.2088591456413269, + "learning_rate": 7.891891891891891e-05, + "loss": 0.3543, + "step": 4918 + }, + { + "epoch": 1.4745737305602398, + "grad_norm": 0.22470034658908844, + "learning_rate": 7.887387387387387e-05, + "loss": 0.3477, + "step": 4919 + }, + { + "epoch": 1.4748735244519393, + "grad_norm": 0.22082637250423431, + "learning_rate": 7.882882882882882e-05, + "loss": 0.361, + "step": 4920 + }, + { + "epoch": 1.4751733183436389, + "grad_norm": 0.22368364036083221, + "learning_rate": 7.878378378378377e-05, + "loss": 0.3703, + "step": 4921 + }, + { + "epoch": 1.4754731122353382, + "grad_norm": 0.22301022708415985, + "learning_rate": 7.873873873873874e-05, + "loss": 0.3558, + "step": 4922 + }, + { + "epoch": 1.4757729061270377, + "grad_norm": 0.21291525661945343, + "learning_rate": 7.869369369369369e-05, + "loss": 0.3634, + "step": 4923 + }, + { + "epoch": 1.476072700018737, + "grad_norm": 0.2003738433122635, + "learning_rate": 7.864864864864864e-05, + "loss": 0.3432, + "step": 4924 + }, + { + "epoch": 1.4763724939104366, + "grad_norm": 0.21728284657001495, + "learning_rate": 7.86036036036036e-05, + "loss": 0.3527, + "step": 4925 + }, + { + "epoch": 1.476672287802136, + "grad_norm": 0.21595244109630585, + "learning_rate": 7.855855855855857e-05, + "loss": 0.3567, + "step": 4926 + }, + { + "epoch": 1.4769720816938354, + "grad_norm": 0.21208882331848145, + "learning_rate": 7.85135135135135e-05, + "loss": 0.3611, + "step": 4927 + }, + { + "epoch": 1.477271875585535, + "grad_norm": 0.21161866188049316, + "learning_rate": 7.846846846846847e-05, + "loss": 0.3587, + "step": 4928 + }, + { + "epoch": 1.4775716694772343, + "grad_norm": 0.2157314121723175, + "learning_rate": 7.84234234234234e-05, + "loss": 0.3595, + "step": 4929 + }, + { + "epoch": 1.4778714633689338, + "grad_norm": 0.22329649329185486, + "learning_rate": 7.837837837837837e-05, + "loss": 0.3563, + "step": 4930 + }, + { + "epoch": 1.4781712572606334, + "grad_norm": 0.20477770268917084, + "learning_rate": 7.833333333333333e-05, + "loss": 0.3265, + "step": 4931 + }, + { + "epoch": 1.478471051152333, + "grad_norm": 0.20577353239059448, + "learning_rate": 7.828828828828828e-05, + "loss": 0.3553, + "step": 4932 + }, + { + "epoch": 1.4787708450440322, + "grad_norm": 0.22000664472579956, + "learning_rate": 7.824324324324323e-05, + "loss": 0.3523, + "step": 4933 + }, + { + "epoch": 1.4790706389357318, + "grad_norm": 0.21802487969398499, + "learning_rate": 7.81981981981982e-05, + "loss": 0.3964, + "step": 4934 + }, + { + "epoch": 1.479370432827431, + "grad_norm": 0.21147708594799042, + "learning_rate": 7.815315315315315e-05, + "loss": 0.3463, + "step": 4935 + }, + { + "epoch": 1.4796702267191306, + "grad_norm": 0.21361719071865082, + "learning_rate": 7.81081081081081e-05, + "loss": 0.3512, + "step": 4936 + }, + { + "epoch": 1.4799700206108302, + "grad_norm": 0.20637863874435425, + "learning_rate": 7.806306306306306e-05, + "loss": 0.3357, + "step": 4937 + }, + { + "epoch": 1.4802698145025295, + "grad_norm": 0.20978131890296936, + "learning_rate": 7.801801801801801e-05, + "loss": 0.3778, + "step": 4938 + }, + { + "epoch": 1.480569608394229, + "grad_norm": 0.22668473422527313, + "learning_rate": 7.797297297297297e-05, + "loss": 0.3844, + "step": 4939 + }, + { + "epoch": 1.4808694022859283, + "grad_norm": 0.21885189414024353, + "learning_rate": 7.792792792792791e-05, + "loss": 0.3337, + "step": 4940 + }, + { + "epoch": 1.4811691961776279, + "grad_norm": 0.19497647881507874, + "learning_rate": 7.788288288288287e-05, + "loss": 0.3193, + "step": 4941 + }, + { + "epoch": 1.4814689900693274, + "grad_norm": 0.2125978320837021, + "learning_rate": 7.783783783783784e-05, + "loss": 0.3722, + "step": 4942 + }, + { + "epoch": 1.4817687839610267, + "grad_norm": 0.20573614537715912, + "learning_rate": 7.779279279279278e-05, + "loss": 0.3535, + "step": 4943 + }, + { + "epoch": 1.4820685778527263, + "grad_norm": 0.20022504031658173, + "learning_rate": 7.774774774774774e-05, + "loss": 0.358, + "step": 4944 + }, + { + "epoch": 1.4823683717444256, + "grad_norm": 0.19802720844745636, + "learning_rate": 7.77027027027027e-05, + "loss": 0.3268, + "step": 4945 + }, + { + "epoch": 1.4826681656361251, + "grad_norm": 0.2035313844680786, + "learning_rate": 7.765765765765764e-05, + "loss": 0.3432, + "step": 4946 + }, + { + "epoch": 1.4829679595278247, + "grad_norm": 0.19907894730567932, + "learning_rate": 7.76126126126126e-05, + "loss": 0.3341, + "step": 4947 + }, + { + "epoch": 1.4832677534195242, + "grad_norm": 0.22520247101783752, + "learning_rate": 7.756756756756757e-05, + "loss": 0.3673, + "step": 4948 + }, + { + "epoch": 1.4835675473112235, + "grad_norm": 0.21920591592788696, + "learning_rate": 7.752252252252252e-05, + "loss": 0.3789, + "step": 4949 + }, + { + "epoch": 1.483867341202923, + "grad_norm": 0.2152647227048874, + "learning_rate": 7.747747747747747e-05, + "loss": 0.3697, + "step": 4950 + }, + { + "epoch": 1.4841671350946224, + "grad_norm": 0.20558972656726837, + "learning_rate": 7.743243243243242e-05, + "loss": 0.3421, + "step": 4951 + }, + { + "epoch": 1.484466928986322, + "grad_norm": 0.21162059903144836, + "learning_rate": 7.738738738738738e-05, + "loss": 0.3531, + "step": 4952 + }, + { + "epoch": 1.4847667228780215, + "grad_norm": 0.20541666448116302, + "learning_rate": 7.734234234234233e-05, + "loss": 0.3557, + "step": 4953 + }, + { + "epoch": 1.4850665167697208, + "grad_norm": 0.21199767291545868, + "learning_rate": 7.729729729729728e-05, + "loss": 0.3593, + "step": 4954 + }, + { + "epoch": 1.4853663106614203, + "grad_norm": 0.21618947386741638, + "learning_rate": 7.725225225225225e-05, + "loss": 0.3663, + "step": 4955 + }, + { + "epoch": 1.4856661045531196, + "grad_norm": 0.20753389596939087, + "learning_rate": 7.720720720720721e-05, + "loss": 0.3508, + "step": 4956 + }, + { + "epoch": 1.4859658984448192, + "grad_norm": 0.22099077701568604, + "learning_rate": 7.716216216216215e-05, + "loss": 0.3516, + "step": 4957 + }, + { + "epoch": 1.4862656923365187, + "grad_norm": 0.2257046103477478, + "learning_rate": 7.711711711711711e-05, + "loss": 0.3551, + "step": 4958 + }, + { + "epoch": 1.486565486228218, + "grad_norm": 0.2128523290157318, + "learning_rate": 7.707207207207208e-05, + "loss": 0.36, + "step": 4959 + }, + { + "epoch": 1.4868652801199176, + "grad_norm": 0.2099178284406662, + "learning_rate": 7.702702702702701e-05, + "loss": 0.3645, + "step": 4960 + }, + { + "epoch": 1.487165074011617, + "grad_norm": 0.20770512521266937, + "learning_rate": 7.698198198198198e-05, + "loss": 0.36, + "step": 4961 + }, + { + "epoch": 1.4874648679033164, + "grad_norm": 0.20869390666484833, + "learning_rate": 7.693693693693694e-05, + "loss": 0.3542, + "step": 4962 + }, + { + "epoch": 1.487764661795016, + "grad_norm": 0.1958959698677063, + "learning_rate": 7.689189189189188e-05, + "loss": 0.3298, + "step": 4963 + }, + { + "epoch": 1.4880644556867153, + "grad_norm": 0.22579431533813477, + "learning_rate": 7.684684684684684e-05, + "loss": 0.3886, + "step": 4964 + }, + { + "epoch": 1.4883642495784148, + "grad_norm": 0.21450291574001312, + "learning_rate": 7.680180180180179e-05, + "loss": 0.3742, + "step": 4965 + }, + { + "epoch": 1.4886640434701142, + "grad_norm": 0.21159665286540985, + "learning_rate": 7.675675675675675e-05, + "loss": 0.362, + "step": 4966 + }, + { + "epoch": 1.4889638373618137, + "grad_norm": 0.2096509039402008, + "learning_rate": 7.67117117117117e-05, + "loss": 0.373, + "step": 4967 + }, + { + "epoch": 1.4892636312535132, + "grad_norm": 0.21596965193748474, + "learning_rate": 7.666666666666666e-05, + "loss": 0.3584, + "step": 4968 + }, + { + "epoch": 1.4895634251452128, + "grad_norm": 0.20583121478557587, + "learning_rate": 7.662162162162162e-05, + "loss": 0.344, + "step": 4969 + }, + { + "epoch": 1.489863219036912, + "grad_norm": 0.2186458855867386, + "learning_rate": 7.657657657657657e-05, + "loss": 0.3943, + "step": 4970 + }, + { + "epoch": 1.4901630129286116, + "grad_norm": 0.21535223722457886, + "learning_rate": 7.653153153153152e-05, + "loss": 0.3337, + "step": 4971 + }, + { + "epoch": 1.490462806820311, + "grad_norm": 0.21319331228733063, + "learning_rate": 7.648648648648648e-05, + "loss": 0.3697, + "step": 4972 + }, + { + "epoch": 1.4907626007120105, + "grad_norm": 0.21556057035923004, + "learning_rate": 7.644144144144145e-05, + "loss": 0.3758, + "step": 4973 + }, + { + "epoch": 1.49106239460371, + "grad_norm": 0.2040267437696457, + "learning_rate": 7.639639639639638e-05, + "loss": 0.3605, + "step": 4974 + }, + { + "epoch": 1.4913621884954094, + "grad_norm": 0.22170892357826233, + "learning_rate": 7.635135135135135e-05, + "loss": 0.3907, + "step": 4975 + }, + { + "epoch": 1.491661982387109, + "grad_norm": 0.21407882869243622, + "learning_rate": 7.630630630630629e-05, + "loss": 0.3591, + "step": 4976 + }, + { + "epoch": 1.4919617762788082, + "grad_norm": 0.20388971269130707, + "learning_rate": 7.626126126126125e-05, + "loss": 0.3491, + "step": 4977 + }, + { + "epoch": 1.4922615701705078, + "grad_norm": 0.21065190434455872, + "learning_rate": 7.621621621621621e-05, + "loss": 0.3413, + "step": 4978 + }, + { + "epoch": 1.4925613640622073, + "grad_norm": 0.21436721086502075, + "learning_rate": 7.617117117117116e-05, + "loss": 0.3576, + "step": 4979 + }, + { + "epoch": 1.4928611579539066, + "grad_norm": 0.205039381980896, + "learning_rate": 7.612612612612611e-05, + "loss": 0.3722, + "step": 4980 + }, + { + "epoch": 1.4931609518456062, + "grad_norm": 0.2189406454563141, + "learning_rate": 7.608108108108108e-05, + "loss": 0.3728, + "step": 4981 + }, + { + "epoch": 1.4934607457373055, + "grad_norm": 0.21848881244659424, + "learning_rate": 7.603603603603603e-05, + "loss": 0.3592, + "step": 4982 + }, + { + "epoch": 1.493760539629005, + "grad_norm": 0.21232780814170837, + "learning_rate": 7.599099099099099e-05, + "loss": 0.3704, + "step": 4983 + }, + { + "epoch": 1.4940603335207046, + "grad_norm": 0.2145267128944397, + "learning_rate": 7.594594594594594e-05, + "loss": 0.368, + "step": 4984 + }, + { + "epoch": 1.494360127412404, + "grad_norm": 0.22516867518424988, + "learning_rate": 7.590090090090089e-05, + "loss": 0.3801, + "step": 4985 + }, + { + "epoch": 1.4946599213041034, + "grad_norm": 0.21354909241199493, + "learning_rate": 7.585585585585586e-05, + "loss": 0.3709, + "step": 4986 + }, + { + "epoch": 1.494959715195803, + "grad_norm": 0.21226109564304352, + "learning_rate": 7.581081081081079e-05, + "loss": 0.3591, + "step": 4987 + }, + { + "epoch": 1.4952595090875023, + "grad_norm": 0.21242061257362366, + "learning_rate": 7.576576576576576e-05, + "loss": 0.3492, + "step": 4988 + }, + { + "epoch": 1.4955593029792018, + "grad_norm": 0.20486439764499664, + "learning_rate": 7.572072072072072e-05, + "loss": 0.3624, + "step": 4989 + }, + { + "epoch": 1.4958590968709014, + "grad_norm": 0.20076008141040802, + "learning_rate": 7.567567567567566e-05, + "loss": 0.3296, + "step": 4990 + }, + { + "epoch": 1.4961588907626007, + "grad_norm": 0.21576011180877686, + "learning_rate": 7.563063063063062e-05, + "loss": 0.3839, + "step": 4991 + }, + { + "epoch": 1.4964586846543002, + "grad_norm": 0.20013126730918884, + "learning_rate": 7.558558558558559e-05, + "loss": 0.3606, + "step": 4992 + }, + { + "epoch": 1.4967584785459995, + "grad_norm": 0.2180420160293579, + "learning_rate": 7.554054054054052e-05, + "loss": 0.3652, + "step": 4993 + }, + { + "epoch": 1.497058272437699, + "grad_norm": 0.21717427670955658, + "learning_rate": 7.549549549549549e-05, + "loss": 0.3481, + "step": 4994 + }, + { + "epoch": 1.4973580663293986, + "grad_norm": 0.2011396586894989, + "learning_rate": 7.545045045045045e-05, + "loss": 0.3143, + "step": 4995 + }, + { + "epoch": 1.497657860221098, + "grad_norm": 0.2071286290884018, + "learning_rate": 7.54054054054054e-05, + "loss": 0.3822, + "step": 4996 + }, + { + "epoch": 1.4979576541127975, + "grad_norm": 0.20190057158470154, + "learning_rate": 7.536036036036035e-05, + "loss": 0.3512, + "step": 4997 + }, + { + "epoch": 1.4982574480044968, + "grad_norm": 0.20391745865345, + "learning_rate": 7.531531531531531e-05, + "loss": 0.3613, + "step": 4998 + }, + { + "epoch": 1.4985572418961963, + "grad_norm": 0.20113232731819153, + "learning_rate": 7.527027027027026e-05, + "loss": 0.3377, + "step": 4999 + }, + { + "epoch": 1.4988570357878959, + "grad_norm": 0.20599274337291718, + "learning_rate": 7.522522522522523e-05, + "loss": 0.34, + "step": 5000 + }, + { + "epoch": 1.4988570357878959, + "eval_loss": 0.409088671207428, + "eval_runtime": 565.6657, + "eval_samples_per_second": 3.817, + "eval_steps_per_second": 0.477, + "step": 5000 + }, + { + "epoch": 1.4991568296795954, + "grad_norm": 0.21036118268966675, + "learning_rate": 7.518018018018017e-05, + "loss": 0.349, + "step": 5001 + }, + { + "epoch": 1.4994566235712947, + "grad_norm": 0.20376059412956238, + "learning_rate": 7.513513513513513e-05, + "loss": 0.3423, + "step": 5002 + }, + { + "epoch": 1.4997564174629943, + "grad_norm": 0.20630986988544464, + "learning_rate": 7.509009009009009e-05, + "loss": 0.3514, + "step": 5003 + }, + { + "epoch": 1.5000562113546936, + "grad_norm": 0.20768223702907562, + "learning_rate": 7.504504504504503e-05, + "loss": 0.3557, + "step": 5004 + }, + { + "epoch": 1.5003560052463931, + "grad_norm": 0.22584180533885956, + "learning_rate": 7.5e-05, + "loss": 0.3641, + "step": 5005 + }, + { + "epoch": 1.5006557991380927, + "grad_norm": 0.20455649495124817, + "learning_rate": 7.495495495495494e-05, + "loss": 0.3396, + "step": 5006 + }, + { + "epoch": 1.500955593029792, + "grad_norm": 0.21096675097942352, + "learning_rate": 7.490990990990991e-05, + "loss": 0.3527, + "step": 5007 + }, + { + "epoch": 1.5012553869214915, + "grad_norm": 0.20545406639575958, + "learning_rate": 7.486486486486486e-05, + "loss": 0.3494, + "step": 5008 + }, + { + "epoch": 1.5015551808131908, + "grad_norm": 0.20965568721294403, + "learning_rate": 7.481981981981981e-05, + "loss": 0.3613, + "step": 5009 + }, + { + "epoch": 1.5018549747048904, + "grad_norm": 0.21544276177883148, + "learning_rate": 7.477477477477476e-05, + "loss": 0.3565, + "step": 5010 + }, + { + "epoch": 1.50215476859659, + "grad_norm": 0.20230863988399506, + "learning_rate": 7.472972972972972e-05, + "loss": 0.3376, + "step": 5011 + }, + { + "epoch": 1.5024545624882895, + "grad_norm": 0.20163989067077637, + "learning_rate": 7.468468468468467e-05, + "loss": 0.3276, + "step": 5012 + }, + { + "epoch": 1.5027543563799888, + "grad_norm": 0.21091212332248688, + "learning_rate": 7.463963963963964e-05, + "loss": 0.3613, + "step": 5013 + }, + { + "epoch": 1.503054150271688, + "grad_norm": 0.21473614871501923, + "learning_rate": 7.459459459459459e-05, + "loss": 0.345, + "step": 5014 + }, + { + "epoch": 1.5033539441633876, + "grad_norm": 0.19872497022151947, + "learning_rate": 7.454954954954955e-05, + "loss": 0.349, + "step": 5015 + }, + { + "epoch": 1.5036537380550872, + "grad_norm": 0.21866753697395325, + "learning_rate": 7.45045045045045e-05, + "loss": 0.3704, + "step": 5016 + }, + { + "epoch": 1.5039535319467867, + "grad_norm": 0.19531461596488953, + "learning_rate": 7.445945945945945e-05, + "loss": 0.3293, + "step": 5017 + }, + { + "epoch": 1.504253325838486, + "grad_norm": 0.20132407546043396, + "learning_rate": 7.441441441441442e-05, + "loss": 0.3535, + "step": 5018 + }, + { + "epoch": 1.5045531197301854, + "grad_norm": 0.20986546576023102, + "learning_rate": 7.436936936936937e-05, + "loss": 0.3635, + "step": 5019 + }, + { + "epoch": 1.504852913621885, + "grad_norm": 0.21108418703079224, + "learning_rate": 7.432432432432432e-05, + "loss": 0.3592, + "step": 5020 + }, + { + "epoch": 1.5051527075135844, + "grad_norm": 0.2011735737323761, + "learning_rate": 7.427927927927927e-05, + "loss": 0.3311, + "step": 5021 + }, + { + "epoch": 1.505452501405284, + "grad_norm": 0.20430687069892883, + "learning_rate": 7.423423423423423e-05, + "loss": 0.3586, + "step": 5022 + }, + { + "epoch": 1.5057522952969833, + "grad_norm": 0.19233015179634094, + "learning_rate": 7.418918918918918e-05, + "loss": 0.3203, + "step": 5023 + }, + { + "epoch": 1.5060520891886828, + "grad_norm": 0.21224820613861084, + "learning_rate": 7.414414414414413e-05, + "loss": 0.352, + "step": 5024 + }, + { + "epoch": 1.5063518830803821, + "grad_norm": 0.2162107676267624, + "learning_rate": 7.40990990990991e-05, + "loss": 0.3716, + "step": 5025 + }, + { + "epoch": 1.5066516769720817, + "grad_norm": 0.21990056335926056, + "learning_rate": 7.405405405405405e-05, + "loss": 0.371, + "step": 5026 + }, + { + "epoch": 1.5069514708637812, + "grad_norm": 0.2139321267604828, + "learning_rate": 7.4009009009009e-05, + "loss": 0.3406, + "step": 5027 + }, + { + "epoch": 1.5072512647554808, + "grad_norm": 0.21176601946353912, + "learning_rate": 7.396396396396396e-05, + "loss": 0.3863, + "step": 5028 + }, + { + "epoch": 1.50755105864718, + "grad_norm": 0.21457381546497345, + "learning_rate": 7.391891891891891e-05, + "loss": 0.3651, + "step": 5029 + }, + { + "epoch": 1.5078508525388794, + "grad_norm": 0.22571976482868195, + "learning_rate": 7.387387387387387e-05, + "loss": 0.3626, + "step": 5030 + }, + { + "epoch": 1.508150646430579, + "grad_norm": 0.20284788310527802, + "learning_rate": 7.382882882882882e-05, + "loss": 0.3339, + "step": 5031 + }, + { + "epoch": 1.5084504403222785, + "grad_norm": 0.2027716338634491, + "learning_rate": 7.378378378378379e-05, + "loss": 0.3439, + "step": 5032 + }, + { + "epoch": 1.508750234213978, + "grad_norm": 0.21533340215682983, + "learning_rate": 7.373873873873874e-05, + "loss": 0.3524, + "step": 5033 + }, + { + "epoch": 1.5090500281056773, + "grad_norm": 0.2093924582004547, + "learning_rate": 7.369369369369369e-05, + "loss": 0.37, + "step": 5034 + }, + { + "epoch": 1.5093498219973767, + "grad_norm": 0.20701274275779724, + "learning_rate": 7.364864864864864e-05, + "loss": 0.3559, + "step": 5035 + }, + { + "epoch": 1.5096496158890762, + "grad_norm": 0.20343904197216034, + "learning_rate": 7.36036036036036e-05, + "loss": 0.3713, + "step": 5036 + }, + { + "epoch": 1.5099494097807757, + "grad_norm": 0.20688888430595398, + "learning_rate": 7.355855855855855e-05, + "loss": 0.3541, + "step": 5037 + }, + { + "epoch": 1.5102492036724753, + "grad_norm": 0.21339473128318787, + "learning_rate": 7.35135135135135e-05, + "loss": 0.3674, + "step": 5038 + }, + { + "epoch": 1.5105489975641746, + "grad_norm": 0.2124233990907669, + "learning_rate": 7.346846846846845e-05, + "loss": 0.3723, + "step": 5039 + }, + { + "epoch": 1.5108487914558741, + "grad_norm": 0.20129559934139252, + "learning_rate": 7.342342342342342e-05, + "loss": 0.3266, + "step": 5040 + }, + { + "epoch": 1.5111485853475735, + "grad_norm": 0.23245026171207428, + "learning_rate": 7.337837837837837e-05, + "loss": 0.3649, + "step": 5041 + }, + { + "epoch": 1.511448379239273, + "grad_norm": 0.2057090848684311, + "learning_rate": 7.333333333333332e-05, + "loss": 0.3694, + "step": 5042 + }, + { + "epoch": 1.5117481731309725, + "grad_norm": 0.20243839919567108, + "learning_rate": 7.328828828828828e-05, + "loss": 0.3258, + "step": 5043 + }, + { + "epoch": 1.512047967022672, + "grad_norm": 0.20519401133060455, + "learning_rate": 7.324324324324323e-05, + "loss": 0.3383, + "step": 5044 + }, + { + "epoch": 1.5123477609143714, + "grad_norm": 0.2056913673877716, + "learning_rate": 7.31981981981982e-05, + "loss": 0.3543, + "step": 5045 + }, + { + "epoch": 1.5126475548060707, + "grad_norm": 0.20329561829566956, + "learning_rate": 7.315315315315315e-05, + "loss": 0.339, + "step": 5046 + }, + { + "epoch": 1.5129473486977703, + "grad_norm": 0.20037339627742767, + "learning_rate": 7.310810810810811e-05, + "loss": 0.3371, + "step": 5047 + }, + { + "epoch": 1.5132471425894698, + "grad_norm": 0.21403111517429352, + "learning_rate": 7.306306306306306e-05, + "loss": 0.3854, + "step": 5048 + }, + { + "epoch": 1.5135469364811693, + "grad_norm": 0.20956586301326752, + "learning_rate": 7.301801801801801e-05, + "loss": 0.3696, + "step": 5049 + }, + { + "epoch": 1.5138467303728687, + "grad_norm": 0.208208367228508, + "learning_rate": 7.297297297297297e-05, + "loss": 0.3495, + "step": 5050 + }, + { + "epoch": 1.514146524264568, + "grad_norm": 0.2096438705921173, + "learning_rate": 7.292792792792792e-05, + "loss": 0.3275, + "step": 5051 + }, + { + "epoch": 1.5144463181562675, + "grad_norm": 0.22821904718875885, + "learning_rate": 7.288288288288288e-05, + "loss": 0.3513, + "step": 5052 + }, + { + "epoch": 1.514746112047967, + "grad_norm": 0.20898018777370453, + "learning_rate": 7.283783783783783e-05, + "loss": 0.366, + "step": 5053 + }, + { + "epoch": 1.5150459059396666, + "grad_norm": 0.21257898211479187, + "learning_rate": 7.279279279279279e-05, + "loss": 0.3604, + "step": 5054 + }, + { + "epoch": 1.515345699831366, + "grad_norm": 0.2063610702753067, + "learning_rate": 7.274774774774774e-05, + "loss": 0.3513, + "step": 5055 + }, + { + "epoch": 1.5156454937230652, + "grad_norm": 0.21631956100463867, + "learning_rate": 7.270270270270269e-05, + "loss": 0.3586, + "step": 5056 + }, + { + "epoch": 1.5159452876147648, + "grad_norm": 0.21602363884449005, + "learning_rate": 7.265765765765765e-05, + "loss": 0.3659, + "step": 5057 + }, + { + "epoch": 1.5162450815064643, + "grad_norm": 0.20453397929668427, + "learning_rate": 7.26126126126126e-05, + "loss": 0.346, + "step": 5058 + }, + { + "epoch": 1.5165448753981638, + "grad_norm": 0.20310211181640625, + "learning_rate": 7.256756756756755e-05, + "loss": 0.3352, + "step": 5059 + }, + { + "epoch": 1.5168446692898632, + "grad_norm": 0.2058761715888977, + "learning_rate": 7.252252252252252e-05, + "loss": 0.3479, + "step": 5060 + }, + { + "epoch": 1.5171444631815627, + "grad_norm": 0.20629523694515228, + "learning_rate": 7.247747747747747e-05, + "loss": 0.3784, + "step": 5061 + }, + { + "epoch": 1.517444257073262, + "grad_norm": 0.2118360996246338, + "learning_rate": 7.243243243243243e-05, + "loss": 0.3586, + "step": 5062 + }, + { + "epoch": 1.5177440509649616, + "grad_norm": 0.22766007483005524, + "learning_rate": 7.238738738738738e-05, + "loss": 0.3553, + "step": 5063 + }, + { + "epoch": 1.518043844856661, + "grad_norm": 0.21020479500293732, + "learning_rate": 7.234234234234233e-05, + "loss": 0.3518, + "step": 5064 + }, + { + "epoch": 1.5183436387483606, + "grad_norm": 0.19974002242088318, + "learning_rate": 7.22972972972973e-05, + "loss": 0.3441, + "step": 5065 + }, + { + "epoch": 1.51864343264006, + "grad_norm": 0.20955109596252441, + "learning_rate": 7.225225225225225e-05, + "loss": 0.3786, + "step": 5066 + }, + { + "epoch": 1.5189432265317593, + "grad_norm": 0.19488339126110077, + "learning_rate": 7.22072072072072e-05, + "loss": 0.3461, + "step": 5067 + }, + { + "epoch": 1.5192430204234588, + "grad_norm": 0.20264093577861786, + "learning_rate": 7.216216216216216e-05, + "loss": 0.3526, + "step": 5068 + }, + { + "epoch": 1.5195428143151584, + "grad_norm": 0.20072831213474274, + "learning_rate": 7.211711711711711e-05, + "loss": 0.3539, + "step": 5069 + }, + { + "epoch": 1.519842608206858, + "grad_norm": 0.19965948164463043, + "learning_rate": 7.207207207207206e-05, + "loss": 0.3529, + "step": 5070 + }, + { + "epoch": 1.5201424020985572, + "grad_norm": 0.21437616646289825, + "learning_rate": 7.202702702702701e-05, + "loss": 0.3363, + "step": 5071 + }, + { + "epoch": 1.5204421959902565, + "grad_norm": 0.20634238421916962, + "learning_rate": 7.198198198198198e-05, + "loss": 0.3653, + "step": 5072 + }, + { + "epoch": 1.520741989881956, + "grad_norm": 0.20045985281467438, + "learning_rate": 7.193693693693693e-05, + "loss": 0.3508, + "step": 5073 + }, + { + "epoch": 1.5210417837736556, + "grad_norm": 0.2013823390007019, + "learning_rate": 7.189189189189189e-05, + "loss": 0.3434, + "step": 5074 + }, + { + "epoch": 1.5213415776653552, + "grad_norm": 0.20428241789340973, + "learning_rate": 7.184684684684684e-05, + "loss": 0.3478, + "step": 5075 + }, + { + "epoch": 1.5216413715570545, + "grad_norm": 0.20228740572929382, + "learning_rate": 7.180180180180179e-05, + "loss": 0.3502, + "step": 5076 + }, + { + "epoch": 1.521941165448754, + "grad_norm": 0.21271152794361115, + "learning_rate": 7.175675675675676e-05, + "loss": 0.3606, + "step": 5077 + }, + { + "epoch": 1.5222409593404533, + "grad_norm": 0.20367246866226196, + "learning_rate": 7.17117117117117e-05, + "loss": 0.3658, + "step": 5078 + }, + { + "epoch": 1.5225407532321529, + "grad_norm": 0.20234622061252594, + "learning_rate": 7.166666666666667e-05, + "loss": 0.3669, + "step": 5079 + }, + { + "epoch": 1.5228405471238524, + "grad_norm": 0.20251305401325226, + "learning_rate": 7.162162162162162e-05, + "loss": 0.353, + "step": 5080 + }, + { + "epoch": 1.523140341015552, + "grad_norm": 0.2070438116788864, + "learning_rate": 7.157657657657657e-05, + "loss": 0.3654, + "step": 5081 + }, + { + "epoch": 1.5234401349072513, + "grad_norm": 0.21502596139907837, + "learning_rate": 7.153153153153152e-05, + "loss": 0.3666, + "step": 5082 + }, + { + "epoch": 1.5237399287989506, + "grad_norm": 0.2074059396982193, + "learning_rate": 7.148648648648648e-05, + "loss": 0.3868, + "step": 5083 + }, + { + "epoch": 1.5240397226906501, + "grad_norm": 0.20912374556064606, + "learning_rate": 7.144144144144143e-05, + "loss": 0.3563, + "step": 5084 + }, + { + "epoch": 1.5243395165823497, + "grad_norm": 0.19438451528549194, + "learning_rate": 7.139639639639638e-05, + "loss": 0.3388, + "step": 5085 + }, + { + "epoch": 1.5246393104740492, + "grad_norm": 0.20785526931285858, + "learning_rate": 7.135135135135135e-05, + "loss": 0.3535, + "step": 5086 + }, + { + "epoch": 1.5249391043657485, + "grad_norm": 0.21372053027153015, + "learning_rate": 7.13063063063063e-05, + "loss": 0.385, + "step": 5087 + }, + { + "epoch": 1.5252388982574479, + "grad_norm": 0.2027290165424347, + "learning_rate": 7.126126126126125e-05, + "loss": 0.3561, + "step": 5088 + }, + { + "epoch": 1.5255386921491474, + "grad_norm": 0.21128539741039276, + "learning_rate": 7.121621621621621e-05, + "loss": 0.3608, + "step": 5089 + }, + { + "epoch": 1.525838486040847, + "grad_norm": 0.2072833627462387, + "learning_rate": 7.117117117117116e-05, + "loss": 0.3572, + "step": 5090 + }, + { + "epoch": 1.5261382799325465, + "grad_norm": 0.21812467277050018, + "learning_rate": 7.112612612612611e-05, + "loss": 0.3916, + "step": 5091 + }, + { + "epoch": 1.5264380738242458, + "grad_norm": 0.1990809589624405, + "learning_rate": 7.108108108108108e-05, + "loss": 0.3457, + "step": 5092 + }, + { + "epoch": 1.5267378677159453, + "grad_norm": 0.21215678751468658, + "learning_rate": 7.103603603603603e-05, + "loss": 0.361, + "step": 5093 + }, + { + "epoch": 1.5270376616076446, + "grad_norm": 0.20100034773349762, + "learning_rate": 7.099099099099099e-05, + "loss": 0.346, + "step": 5094 + }, + { + "epoch": 1.5273374554993442, + "grad_norm": 0.25413811206817627, + "learning_rate": 7.094594594594594e-05, + "loss": 0.339, + "step": 5095 + }, + { + "epoch": 1.5276372493910437, + "grad_norm": 0.19817399978637695, + "learning_rate": 7.090090090090089e-05, + "loss": 0.3539, + "step": 5096 + }, + { + "epoch": 1.5279370432827433, + "grad_norm": 0.20904068648815155, + "learning_rate": 7.085585585585586e-05, + "loss": 0.3499, + "step": 5097 + }, + { + "epoch": 1.5282368371744426, + "grad_norm": 0.21889416873455048, + "learning_rate": 7.081081081081081e-05, + "loss": 0.3659, + "step": 5098 + }, + { + "epoch": 1.528536631066142, + "grad_norm": 0.2134886085987091, + "learning_rate": 7.076576576576576e-05, + "loss": 0.3583, + "step": 5099 + }, + { + "epoch": 1.5288364249578414, + "grad_norm": 0.20167680084705353, + "learning_rate": 7.072072072072071e-05, + "loss": 0.3578, + "step": 5100 + }, + { + "epoch": 1.529136218849541, + "grad_norm": 0.21113313734531403, + "learning_rate": 7.067567567567567e-05, + "loss": 0.329, + "step": 5101 + }, + { + "epoch": 1.5294360127412405, + "grad_norm": 0.1989641785621643, + "learning_rate": 7.063063063063062e-05, + "loss": 0.3332, + "step": 5102 + }, + { + "epoch": 1.5297358066329398, + "grad_norm": 0.20343445241451263, + "learning_rate": 7.058558558558557e-05, + "loss": 0.3505, + "step": 5103 + }, + { + "epoch": 1.5300356005246392, + "grad_norm": 0.21245552599430084, + "learning_rate": 7.054054054054054e-05, + "loss": 0.3468, + "step": 5104 + }, + { + "epoch": 1.5303353944163387, + "grad_norm": 0.209833025932312, + "learning_rate": 7.049549549549549e-05, + "loss": 0.3684, + "step": 5105 + }, + { + "epoch": 1.5306351883080382, + "grad_norm": 0.19674982130527496, + "learning_rate": 7.045045045045045e-05, + "loss": 0.3338, + "step": 5106 + }, + { + "epoch": 1.5309349821997378, + "grad_norm": 0.2160000056028366, + "learning_rate": 7.04054054054054e-05, + "loss": 0.3603, + "step": 5107 + }, + { + "epoch": 1.531234776091437, + "grad_norm": 0.19963768124580383, + "learning_rate": 7.036036036036035e-05, + "loss": 0.3641, + "step": 5108 + }, + { + "epoch": 1.5315345699831366, + "grad_norm": 0.1939668208360672, + "learning_rate": 7.031531531531531e-05, + "loss": 0.3339, + "step": 5109 + }, + { + "epoch": 1.531834363874836, + "grad_norm": 0.2016298770904541, + "learning_rate": 7.027027027027026e-05, + "loss": 0.3602, + "step": 5110 + }, + { + "epoch": 1.5321341577665355, + "grad_norm": 0.21612954139709473, + "learning_rate": 7.022522522522522e-05, + "loss": 0.3846, + "step": 5111 + }, + { + "epoch": 1.532433951658235, + "grad_norm": 0.1958528757095337, + "learning_rate": 7.018018018018018e-05, + "loss": 0.3414, + "step": 5112 + }, + { + "epoch": 1.5327337455499346, + "grad_norm": 0.19213689863681793, + "learning_rate": 7.013513513513513e-05, + "loss": 0.3327, + "step": 5113 + }, + { + "epoch": 1.533033539441634, + "grad_norm": 0.2063470482826233, + "learning_rate": 7.009009009009008e-05, + "loss": 0.3538, + "step": 5114 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.21728667616844177, + "learning_rate": 7.004504504504504e-05, + "loss": 0.3551, + "step": 5115 + }, + { + "epoch": 1.5336331272250328, + "grad_norm": 0.20425064861774445, + "learning_rate": 7e-05, + "loss": 0.3249, + "step": 5116 + }, + { + "epoch": 1.5339329211167323, + "grad_norm": 0.20801694691181183, + "learning_rate": 6.995495495495494e-05, + "loss": 0.3611, + "step": 5117 + }, + { + "epoch": 1.5342327150084318, + "grad_norm": 0.19504021108150482, + "learning_rate": 6.99099099099099e-05, + "loss": 0.348, + "step": 5118 + }, + { + "epoch": 1.5345325089001312, + "grad_norm": 0.21642310917377472, + "learning_rate": 6.986486486486486e-05, + "loss": 0.3574, + "step": 5119 + }, + { + "epoch": 1.5348323027918305, + "grad_norm": 0.2077680230140686, + "learning_rate": 6.981981981981981e-05, + "loss": 0.3348, + "step": 5120 + }, + { + "epoch": 1.53513209668353, + "grad_norm": 0.2039538472890854, + "learning_rate": 6.977477477477477e-05, + "loss": 0.3443, + "step": 5121 + }, + { + "epoch": 1.5354318905752296, + "grad_norm": 0.20255398750305176, + "learning_rate": 6.972972972972972e-05, + "loss": 0.3563, + "step": 5122 + }, + { + "epoch": 1.535731684466929, + "grad_norm": 0.21409064531326294, + "learning_rate": 6.968468468468469e-05, + "loss": 0.3481, + "step": 5123 + }, + { + "epoch": 1.5360314783586284, + "grad_norm": 0.20551519095897675, + "learning_rate": 6.963963963963964e-05, + "loss": 0.3597, + "step": 5124 + }, + { + "epoch": 1.536331272250328, + "grad_norm": 0.2170633226633072, + "learning_rate": 6.959459459459459e-05, + "loss": 0.3652, + "step": 5125 + }, + { + "epoch": 1.5366310661420273, + "grad_norm": 0.20070041716098785, + "learning_rate": 6.954954954954955e-05, + "loss": 0.3412, + "step": 5126 + }, + { + "epoch": 1.5369308600337268, + "grad_norm": 0.22287438809871674, + "learning_rate": 6.95045045045045e-05, + "loss": 0.3788, + "step": 5127 + }, + { + "epoch": 1.5372306539254263, + "grad_norm": 0.2027093470096588, + "learning_rate": 6.945945945945945e-05, + "loss": 0.3439, + "step": 5128 + }, + { + "epoch": 1.537530447817126, + "grad_norm": 0.19838665425777435, + "learning_rate": 6.94144144144144e-05, + "loss": 0.3213, + "step": 5129 + }, + { + "epoch": 1.5378302417088252, + "grad_norm": 0.20357826352119446, + "learning_rate": 6.936936936936937e-05, + "loss": 0.35, + "step": 5130 + }, + { + "epoch": 1.5381300356005245, + "grad_norm": 0.2061091959476471, + "learning_rate": 6.932432432432432e-05, + "loss": 0.3341, + "step": 5131 + }, + { + "epoch": 1.538429829492224, + "grad_norm": 0.205326110124588, + "learning_rate": 6.927927927927927e-05, + "loss": 0.3473, + "step": 5132 + }, + { + "epoch": 1.5387296233839236, + "grad_norm": 0.21463227272033691, + "learning_rate": 6.923423423423423e-05, + "loss": 0.3708, + "step": 5133 + }, + { + "epoch": 1.5390294172756231, + "grad_norm": 0.1967410147190094, + "learning_rate": 6.918918918918918e-05, + "loss": 0.3439, + "step": 5134 + }, + { + "epoch": 1.5393292111673225, + "grad_norm": 0.20796793699264526, + "learning_rate": 6.914414414414413e-05, + "loss": 0.3573, + "step": 5135 + }, + { + "epoch": 1.5396290050590218, + "grad_norm": 0.2030736207962036, + "learning_rate": 6.90990990990991e-05, + "loss": 0.3634, + "step": 5136 + }, + { + "epoch": 1.5399287989507213, + "grad_norm": 0.20618915557861328, + "learning_rate": 6.905405405405405e-05, + "loss": 0.3503, + "step": 5137 + }, + { + "epoch": 1.5402285928424209, + "grad_norm": 0.21252769231796265, + "learning_rate": 6.900900900900901e-05, + "loss": 0.3655, + "step": 5138 + }, + { + "epoch": 1.5405283867341204, + "grad_norm": 0.2006307691335678, + "learning_rate": 6.896396396396396e-05, + "loss": 0.3445, + "step": 5139 + }, + { + "epoch": 1.5408281806258197, + "grad_norm": 0.19492262601852417, + "learning_rate": 6.891891891891891e-05, + "loss": 0.3275, + "step": 5140 + }, + { + "epoch": 1.541127974517519, + "grad_norm": 0.20005106925964355, + "learning_rate": 6.887387387387387e-05, + "loss": 0.3486, + "step": 5141 + }, + { + "epoch": 1.5414277684092186, + "grad_norm": 0.21888670325279236, + "learning_rate": 6.882882882882882e-05, + "loss": 0.3486, + "step": 5142 + }, + { + "epoch": 1.5417275623009181, + "grad_norm": 0.19712640345096588, + "learning_rate": 6.878378378378377e-05, + "loss": 0.3372, + "step": 5143 + }, + { + "epoch": 1.5420273561926177, + "grad_norm": 0.19769833981990814, + "learning_rate": 6.873873873873874e-05, + "loss": 0.3589, + "step": 5144 + }, + { + "epoch": 1.542327150084317, + "grad_norm": 0.2131979912519455, + "learning_rate": 6.869369369369369e-05, + "loss": 0.3463, + "step": 5145 + }, + { + "epoch": 1.5426269439760165, + "grad_norm": 0.20438240468502045, + "learning_rate": 6.864864864864864e-05, + "loss": 0.3632, + "step": 5146 + }, + { + "epoch": 1.5429267378677158, + "grad_norm": 0.2102312594652176, + "learning_rate": 6.860360360360359e-05, + "loss": 0.3709, + "step": 5147 + }, + { + "epoch": 1.5432265317594154, + "grad_norm": 0.20745466649532318, + "learning_rate": 6.855855855855855e-05, + "loss": 0.373, + "step": 5148 + }, + { + "epoch": 1.543526325651115, + "grad_norm": 0.20155175030231476, + "learning_rate": 6.85135135135135e-05, + "loss": 0.3678, + "step": 5149 + }, + { + "epoch": 1.5438261195428145, + "grad_norm": 0.21392785012722015, + "learning_rate": 6.846846846846845e-05, + "loss": 0.3831, + "step": 5150 + }, + { + "epoch": 1.5441259134345138, + "grad_norm": 0.20007534325122833, + "learning_rate": 6.842342342342342e-05, + "loss": 0.352, + "step": 5151 + }, + { + "epoch": 1.544425707326213, + "grad_norm": 0.20434188842773438, + "learning_rate": 6.837837837837837e-05, + "loss": 0.3534, + "step": 5152 + }, + { + "epoch": 1.5447255012179126, + "grad_norm": 0.2060280591249466, + "learning_rate": 6.833333333333333e-05, + "loss": 0.3425, + "step": 5153 + }, + { + "epoch": 1.5450252951096122, + "grad_norm": 0.20043106377124786, + "learning_rate": 6.828828828828828e-05, + "loss": 0.3471, + "step": 5154 + }, + { + "epoch": 1.5453250890013117, + "grad_norm": 0.20862244069576263, + "learning_rate": 6.824324324324325e-05, + "loss": 0.3743, + "step": 5155 + }, + { + "epoch": 1.545624882893011, + "grad_norm": 0.20181861519813538, + "learning_rate": 6.81981981981982e-05, + "loss": 0.3345, + "step": 5156 + }, + { + "epoch": 1.5459246767847103, + "grad_norm": 0.19661659002304077, + "learning_rate": 6.815315315315315e-05, + "loss": 0.3485, + "step": 5157 + }, + { + "epoch": 1.54622447067641, + "grad_norm": 0.19832420349121094, + "learning_rate": 6.810810810810811e-05, + "loss": 0.3398, + "step": 5158 + }, + { + "epoch": 1.5465242645681094, + "grad_norm": 0.19693104922771454, + "learning_rate": 6.806306306306306e-05, + "loss": 0.3493, + "step": 5159 + }, + { + "epoch": 1.546824058459809, + "grad_norm": 0.20385287702083588, + "learning_rate": 6.801801801801801e-05, + "loss": 0.3658, + "step": 5160 + }, + { + "epoch": 1.5471238523515083, + "grad_norm": 0.19576039910316467, + "learning_rate": 6.797297297297296e-05, + "loss": 0.3485, + "step": 5161 + }, + { + "epoch": 1.5474236462432078, + "grad_norm": 0.19672639667987823, + "learning_rate": 6.792792792792793e-05, + "loss": 0.3395, + "step": 5162 + }, + { + "epoch": 1.5477234401349071, + "grad_norm": 0.19690831005573273, + "learning_rate": 6.788288288288288e-05, + "loss": 0.3501, + "step": 5163 + }, + { + "epoch": 1.5480232340266067, + "grad_norm": 0.20069430768489838, + "learning_rate": 6.783783783783783e-05, + "loss": 0.3425, + "step": 5164 + }, + { + "epoch": 1.5483230279183062, + "grad_norm": 0.20951126515865326, + "learning_rate": 6.779279279279278e-05, + "loss": 0.3594, + "step": 5165 + }, + { + "epoch": 1.5486228218100058, + "grad_norm": 0.2090735137462616, + "learning_rate": 6.774774774774774e-05, + "loss": 0.3967, + "step": 5166 + }, + { + "epoch": 1.548922615701705, + "grad_norm": 0.2065633237361908, + "learning_rate": 6.770270270270269e-05, + "loss": 0.3609, + "step": 5167 + }, + { + "epoch": 1.5492224095934044, + "grad_norm": 0.20373541116714478, + "learning_rate": 6.765765765765765e-05, + "loss": 0.3446, + "step": 5168 + }, + { + "epoch": 1.549522203485104, + "grad_norm": 0.20226061344146729, + "learning_rate": 6.76126126126126e-05, + "loss": 0.351, + "step": 5169 + }, + { + "epoch": 1.5498219973768035, + "grad_norm": 0.19589018821716309, + "learning_rate": 6.756756756756757e-05, + "loss": 0.3165, + "step": 5170 + }, + { + "epoch": 1.550121791268503, + "grad_norm": 0.19932496547698975, + "learning_rate": 6.752252252252252e-05, + "loss": 0.3503, + "step": 5171 + }, + { + "epoch": 1.5504215851602023, + "grad_norm": 0.20237968862056732, + "learning_rate": 6.747747747747747e-05, + "loss": 0.3637, + "step": 5172 + }, + { + "epoch": 1.5507213790519017, + "grad_norm": 0.20368225872516632, + "learning_rate": 6.743243243243243e-05, + "loss": 0.3595, + "step": 5173 + }, + { + "epoch": 1.5510211729436012, + "grad_norm": 0.20362980663776398, + "learning_rate": 6.738738738738738e-05, + "loss": 0.3522, + "step": 5174 + }, + { + "epoch": 1.5513209668353007, + "grad_norm": 0.20698122680187225, + "learning_rate": 6.734234234234233e-05, + "loss": 0.3547, + "step": 5175 + }, + { + "epoch": 1.5516207607270003, + "grad_norm": 0.20629571378231049, + "learning_rate": 6.72972972972973e-05, + "loss": 0.3708, + "step": 5176 + }, + { + "epoch": 1.5519205546186996, + "grad_norm": 0.20679518580436707, + "learning_rate": 6.725225225225225e-05, + "loss": 0.3418, + "step": 5177 + }, + { + "epoch": 1.5522203485103991, + "grad_norm": 0.2120271623134613, + "learning_rate": 6.72072072072072e-05, + "loss": 0.3554, + "step": 5178 + }, + { + "epoch": 1.5525201424020985, + "grad_norm": 0.2077147364616394, + "learning_rate": 6.716216216216215e-05, + "loss": 0.3508, + "step": 5179 + }, + { + "epoch": 1.552819936293798, + "grad_norm": 0.199779212474823, + "learning_rate": 6.711711711711711e-05, + "loss": 0.3535, + "step": 5180 + }, + { + "epoch": 1.5531197301854975, + "grad_norm": 0.20204727351665497, + "learning_rate": 6.707207207207206e-05, + "loss": 0.3523, + "step": 5181 + }, + { + "epoch": 1.553419524077197, + "grad_norm": 0.21412204205989838, + "learning_rate": 6.702702702702701e-05, + "loss": 0.3569, + "step": 5182 + }, + { + "epoch": 1.5537193179688964, + "grad_norm": 0.21525458991527557, + "learning_rate": 6.698198198198198e-05, + "loss": 0.3581, + "step": 5183 + }, + { + "epoch": 1.5540191118605957, + "grad_norm": 0.20961333811283112, + "learning_rate": 6.693693693693693e-05, + "loss": 0.3573, + "step": 5184 + }, + { + "epoch": 1.5543189057522953, + "grad_norm": 0.20356057584285736, + "learning_rate": 6.689189189189189e-05, + "loss": 0.3278, + "step": 5185 + }, + { + "epoch": 1.5546186996439948, + "grad_norm": 0.20768241584300995, + "learning_rate": 6.684684684684684e-05, + "loss": 0.3584, + "step": 5186 + }, + { + "epoch": 1.5549184935356943, + "grad_norm": 0.2090872973203659, + "learning_rate": 6.68018018018018e-05, + "loss": 0.3532, + "step": 5187 + }, + { + "epoch": 1.5552182874273937, + "grad_norm": 0.21495124697685242, + "learning_rate": 6.675675675675676e-05, + "loss": 0.3308, + "step": 5188 + }, + { + "epoch": 1.555518081319093, + "grad_norm": 0.19636289775371552, + "learning_rate": 6.67117117117117e-05, + "loss": 0.3379, + "step": 5189 + }, + { + "epoch": 1.5558178752107925, + "grad_norm": 0.20264169573783875, + "learning_rate": 6.666666666666666e-05, + "loss": 0.3437, + "step": 5190 + }, + { + "epoch": 1.556117669102492, + "grad_norm": 0.2122523933649063, + "learning_rate": 6.662162162162162e-05, + "loss": 0.364, + "step": 5191 + }, + { + "epoch": 1.5564174629941916, + "grad_norm": 0.208254873752594, + "learning_rate": 6.657657657657657e-05, + "loss": 0.3399, + "step": 5192 + }, + { + "epoch": 1.556717256885891, + "grad_norm": 0.21715570986270905, + "learning_rate": 6.653153153153152e-05, + "loss": 0.3684, + "step": 5193 + }, + { + "epoch": 1.5570170507775905, + "grad_norm": 0.21001993119716644, + "learning_rate": 6.648648648648648e-05, + "loss": 0.3473, + "step": 5194 + }, + { + "epoch": 1.5573168446692898, + "grad_norm": 0.20231187343597412, + "learning_rate": 6.644144144144144e-05, + "loss": 0.3334, + "step": 5195 + }, + { + "epoch": 1.5576166385609893, + "grad_norm": 0.20209276676177979, + "learning_rate": 6.639639639639639e-05, + "loss": 0.3453, + "step": 5196 + }, + { + "epoch": 1.5579164324526888, + "grad_norm": 0.1989961862564087, + "learning_rate": 6.635135135135134e-05, + "loss": 0.3562, + "step": 5197 + }, + { + "epoch": 1.5582162263443884, + "grad_norm": 0.19940902292728424, + "learning_rate": 6.63063063063063e-05, + "loss": 0.3448, + "step": 5198 + }, + { + "epoch": 1.5585160202360877, + "grad_norm": 0.19980254769325256, + "learning_rate": 6.626126126126125e-05, + "loss": 0.3424, + "step": 5199 + }, + { + "epoch": 1.558815814127787, + "grad_norm": 0.2070973515510559, + "learning_rate": 6.621621621621621e-05, + "loss": 0.3654, + "step": 5200 + }, + { + "epoch": 1.5591156080194866, + "grad_norm": 0.20411080121994019, + "learning_rate": 6.617117117117116e-05, + "loss": 0.3549, + "step": 5201 + }, + { + "epoch": 1.559415401911186, + "grad_norm": 0.20583924651145935, + "learning_rate": 6.612612612612613e-05, + "loss": 0.3567, + "step": 5202 + }, + { + "epoch": 1.5597151958028856, + "grad_norm": 0.2240842878818512, + "learning_rate": 6.608108108108108e-05, + "loss": 0.3793, + "step": 5203 + }, + { + "epoch": 1.560014989694585, + "grad_norm": 0.1986006647348404, + "learning_rate": 6.603603603603603e-05, + "loss": 0.3539, + "step": 5204 + }, + { + "epoch": 1.5603147835862843, + "grad_norm": 0.20004892349243164, + "learning_rate": 6.599099099099099e-05, + "loss": 0.3512, + "step": 5205 + }, + { + "epoch": 1.5606145774779838, + "grad_norm": 0.19041694700717926, + "learning_rate": 6.594594594594594e-05, + "loss": 0.3341, + "step": 5206 + }, + { + "epoch": 1.5609143713696834, + "grad_norm": 0.208636075258255, + "learning_rate": 6.590090090090089e-05, + "loss": 0.3746, + "step": 5207 + }, + { + "epoch": 1.561214165261383, + "grad_norm": 0.20262156426906586, + "learning_rate": 6.585585585585584e-05, + "loss": 0.3455, + "step": 5208 + }, + { + "epoch": 1.5615139591530822, + "grad_norm": 0.21155297756195068, + "learning_rate": 6.581081081081081e-05, + "loss": 0.3844, + "step": 5209 + }, + { + "epoch": 1.5618137530447818, + "grad_norm": 0.20110183954238892, + "learning_rate": 6.576576576576576e-05, + "loss": 0.3615, + "step": 5210 + }, + { + "epoch": 1.562113546936481, + "grad_norm": 0.20476794242858887, + "learning_rate": 6.572072072072071e-05, + "loss": 0.344, + "step": 5211 + }, + { + "epoch": 1.5624133408281806, + "grad_norm": 0.21922746300697327, + "learning_rate": 6.567567567567567e-05, + "loss": 0.3739, + "step": 5212 + }, + { + "epoch": 1.5627131347198802, + "grad_norm": 0.20494097471237183, + "learning_rate": 6.563063063063062e-05, + "loss": 0.3659, + "step": 5213 + }, + { + "epoch": 1.5630129286115797, + "grad_norm": 0.21062615513801575, + "learning_rate": 6.558558558558557e-05, + "loss": 0.3713, + "step": 5214 + }, + { + "epoch": 1.563312722503279, + "grad_norm": 0.22318191826343536, + "learning_rate": 6.554054054054054e-05, + "loss": 0.3676, + "step": 5215 + }, + { + "epoch": 1.5636125163949783, + "grad_norm": 0.19631384313106537, + "learning_rate": 6.549549549549549e-05, + "loss": 0.3633, + "step": 5216 + }, + { + "epoch": 1.5639123102866779, + "grad_norm": 0.2068559229373932, + "learning_rate": 6.545045045045045e-05, + "loss": 0.3758, + "step": 5217 + }, + { + "epoch": 1.5642121041783774, + "grad_norm": 0.2027987688779831, + "learning_rate": 6.54054054054054e-05, + "loss": 0.3514, + "step": 5218 + }, + { + "epoch": 1.564511898070077, + "grad_norm": 0.20169684290885925, + "learning_rate": 6.536036036036036e-05, + "loss": 0.3362, + "step": 5219 + }, + { + "epoch": 1.5648116919617763, + "grad_norm": 0.2035885900259018, + "learning_rate": 6.531531531531531e-05, + "loss": 0.351, + "step": 5220 + }, + { + "epoch": 1.5651114858534756, + "grad_norm": 0.20314770936965942, + "learning_rate": 6.527027027027027e-05, + "loss": 0.3644, + "step": 5221 + }, + { + "epoch": 1.5654112797451751, + "grad_norm": 0.19844964146614075, + "learning_rate": 6.522522522522522e-05, + "loss": 0.339, + "step": 5222 + }, + { + "epoch": 1.5657110736368747, + "grad_norm": 0.2092464417219162, + "learning_rate": 6.518018018018018e-05, + "loss": 0.3653, + "step": 5223 + }, + { + "epoch": 1.5660108675285742, + "grad_norm": 0.22174417972564697, + "learning_rate": 6.513513513513513e-05, + "loss": 0.3876, + "step": 5224 + }, + { + "epoch": 1.5663106614202735, + "grad_norm": 0.2033134251832962, + "learning_rate": 6.509009009009008e-05, + "loss": 0.3635, + "step": 5225 + }, + { + "epoch": 1.566610455311973, + "grad_norm": 0.20326606929302216, + "learning_rate": 6.504504504504503e-05, + "loss": 0.3377, + "step": 5226 + }, + { + "epoch": 1.5669102492036724, + "grad_norm": 0.2041843980550766, + "learning_rate": 6.5e-05, + "loss": 0.3396, + "step": 5227 + }, + { + "epoch": 1.567210043095372, + "grad_norm": 0.2067858725786209, + "learning_rate": 6.495495495495494e-05, + "loss": 0.3544, + "step": 5228 + }, + { + "epoch": 1.5675098369870715, + "grad_norm": 0.208845317363739, + "learning_rate": 6.490990990990991e-05, + "loss": 0.3445, + "step": 5229 + }, + { + "epoch": 1.567809630878771, + "grad_norm": 0.2178359478712082, + "learning_rate": 6.486486486486486e-05, + "loss": 0.3334, + "step": 5230 + }, + { + "epoch": 1.5681094247704703, + "grad_norm": 0.20035913586616516, + "learning_rate": 6.481981981981981e-05, + "loss": 0.3561, + "step": 5231 + }, + { + "epoch": 1.5684092186621696, + "grad_norm": 0.20526796579360962, + "learning_rate": 6.477477477477477e-05, + "loss": 0.3622, + "step": 5232 + }, + { + "epoch": 1.5687090125538692, + "grad_norm": 0.2058054804801941, + "learning_rate": 6.472972972972972e-05, + "loss": 0.3576, + "step": 5233 + }, + { + "epoch": 1.5690088064455687, + "grad_norm": 0.20559555292129517, + "learning_rate": 6.468468468468469e-05, + "loss": 0.3695, + "step": 5234 + }, + { + "epoch": 1.5693086003372683, + "grad_norm": 0.21687480807304382, + "learning_rate": 6.463963963963964e-05, + "loss": 0.3643, + "step": 5235 + }, + { + "epoch": 1.5696083942289676, + "grad_norm": 0.20145055651664734, + "learning_rate": 6.459459459459459e-05, + "loss": 0.342, + "step": 5236 + }, + { + "epoch": 1.569908188120667, + "grad_norm": 0.2083156406879425, + "learning_rate": 6.454954954954955e-05, + "loss": 0.3694, + "step": 5237 + }, + { + "epoch": 1.5702079820123664, + "grad_norm": 0.2008083611726761, + "learning_rate": 6.45045045045045e-05, + "loss": 0.358, + "step": 5238 + }, + { + "epoch": 1.570507775904066, + "grad_norm": 0.20801851153373718, + "learning_rate": 6.445945945945945e-05, + "loss": 0.3444, + "step": 5239 + }, + { + "epoch": 1.5708075697957655, + "grad_norm": 0.20249147713184357, + "learning_rate": 6.44144144144144e-05, + "loss": 0.3594, + "step": 5240 + }, + { + "epoch": 1.5711073636874648, + "grad_norm": 0.20679299533367157, + "learning_rate": 6.436936936936937e-05, + "loss": 0.3604, + "step": 5241 + }, + { + "epoch": 1.5714071575791642, + "grad_norm": 0.20185597240924835, + "learning_rate": 6.432432432432432e-05, + "loss": 0.3361, + "step": 5242 + }, + { + "epoch": 1.5717069514708637, + "grad_norm": 0.20025381445884705, + "learning_rate": 6.427927927927927e-05, + "loss": 0.3515, + "step": 5243 + }, + { + "epoch": 1.5720067453625632, + "grad_norm": 0.19662436842918396, + "learning_rate": 6.423423423423423e-05, + "loss": 0.3188, + "step": 5244 + }, + { + "epoch": 1.5723065392542628, + "grad_norm": 0.20173777639865875, + "learning_rate": 6.418918918918918e-05, + "loss": 0.3509, + "step": 5245 + }, + { + "epoch": 1.572606333145962, + "grad_norm": 0.21695034205913544, + "learning_rate": 6.414414414414413e-05, + "loss": 0.3589, + "step": 5246 + }, + { + "epoch": 1.5729061270376616, + "grad_norm": 0.21321168541908264, + "learning_rate": 6.40990990990991e-05, + "loss": 0.371, + "step": 5247 + }, + { + "epoch": 1.573205920929361, + "grad_norm": 0.21553568542003632, + "learning_rate": 6.405405405405405e-05, + "loss": 0.3673, + "step": 5248 + }, + { + "epoch": 1.5735057148210605, + "grad_norm": 0.21144214272499084, + "learning_rate": 6.400900900900901e-05, + "loss": 0.3649, + "step": 5249 + }, + { + "epoch": 1.57380550871276, + "grad_norm": 0.20765072107315063, + "learning_rate": 6.396396396396396e-05, + "loss": 0.3642, + "step": 5250 + }, + { + "epoch": 1.5741053026044596, + "grad_norm": 0.2129376381635666, + "learning_rate": 6.391891891891891e-05, + "loss": 0.3612, + "step": 5251 + }, + { + "epoch": 1.574405096496159, + "grad_norm": 0.20372062921524048, + "learning_rate": 6.387387387387387e-05, + "loss": 0.3579, + "step": 5252 + }, + { + "epoch": 1.5747048903878582, + "grad_norm": 0.1967153400182724, + "learning_rate": 6.382882882882882e-05, + "loss": 0.3413, + "step": 5253 + }, + { + "epoch": 1.5750046842795578, + "grad_norm": 0.19723547995090485, + "learning_rate": 6.378378378378377e-05, + "loss": 0.3423, + "step": 5254 + }, + { + "epoch": 1.5753044781712573, + "grad_norm": 0.20694804191589355, + "learning_rate": 6.373873873873874e-05, + "loss": 0.3431, + "step": 5255 + }, + { + "epoch": 1.5756042720629568, + "grad_norm": 0.209686741232872, + "learning_rate": 6.369369369369369e-05, + "loss": 0.3634, + "step": 5256 + }, + { + "epoch": 1.5759040659546562, + "grad_norm": 0.20350812375545502, + "learning_rate": 6.364864864864864e-05, + "loss": 0.3499, + "step": 5257 + }, + { + "epoch": 1.5762038598463555, + "grad_norm": 0.19632326066493988, + "learning_rate": 6.360360360360359e-05, + "loss": 0.3564, + "step": 5258 + }, + { + "epoch": 1.576503653738055, + "grad_norm": 0.19848290085792542, + "learning_rate": 6.355855855855855e-05, + "loss": 0.3611, + "step": 5259 + }, + { + "epoch": 1.5768034476297546, + "grad_norm": 0.22999663650989532, + "learning_rate": 6.35135135135135e-05, + "loss": 0.3523, + "step": 5260 + }, + { + "epoch": 1.577103241521454, + "grad_norm": 0.20401522517204285, + "learning_rate": 6.346846846846847e-05, + "loss": 0.3361, + "step": 5261 + }, + { + "epoch": 1.5774030354131534, + "grad_norm": 0.19645123183727264, + "learning_rate": 6.342342342342342e-05, + "loss": 0.3451, + "step": 5262 + }, + { + "epoch": 1.577702829304853, + "grad_norm": 0.2110058218240738, + "learning_rate": 6.337837837837837e-05, + "loss": 0.3687, + "step": 5263 + }, + { + "epoch": 1.5780026231965523, + "grad_norm": 0.20960693061351776, + "learning_rate": 6.333333333333333e-05, + "loss": 0.3732, + "step": 5264 + }, + { + "epoch": 1.5783024170882518, + "grad_norm": 0.20382042229175568, + "learning_rate": 6.328828828828828e-05, + "loss": 0.3454, + "step": 5265 + }, + { + "epoch": 1.5786022109799513, + "grad_norm": 0.20214159786701202, + "learning_rate": 6.324324324324325e-05, + "loss": 0.3532, + "step": 5266 + }, + { + "epoch": 1.5789020048716509, + "grad_norm": 0.20783282816410065, + "learning_rate": 6.31981981981982e-05, + "loss": 0.3861, + "step": 5267 + }, + { + "epoch": 1.5792017987633502, + "grad_norm": 0.19975163042545319, + "learning_rate": 6.315315315315315e-05, + "loss": 0.3567, + "step": 5268 + }, + { + "epoch": 1.5795015926550495, + "grad_norm": 0.20556297898292542, + "learning_rate": 6.31081081081081e-05, + "loss": 0.3658, + "step": 5269 + }, + { + "epoch": 1.579801386546749, + "grad_norm": 0.20620745420455933, + "learning_rate": 6.306306306306306e-05, + "loss": 0.3593, + "step": 5270 + }, + { + "epoch": 1.5801011804384486, + "grad_norm": 0.2031751573085785, + "learning_rate": 6.301801801801801e-05, + "loss": 0.3426, + "step": 5271 + }, + { + "epoch": 1.5804009743301481, + "grad_norm": 0.2001752108335495, + "learning_rate": 6.297297297297296e-05, + "loss": 0.3696, + "step": 5272 + }, + { + "epoch": 1.5807007682218475, + "grad_norm": 0.20031659305095673, + "learning_rate": 6.292792792792793e-05, + "loss": 0.3616, + "step": 5273 + }, + { + "epoch": 1.5810005621135468, + "grad_norm": 0.20846503973007202, + "learning_rate": 6.288288288288288e-05, + "loss": 0.3689, + "step": 5274 + }, + { + "epoch": 1.5813003560052463, + "grad_norm": 0.21977601945400238, + "learning_rate": 6.283783783783783e-05, + "loss": 0.374, + "step": 5275 + }, + { + "epoch": 1.5816001498969459, + "grad_norm": 0.20855718851089478, + "learning_rate": 6.279279279279279e-05, + "loss": 0.3654, + "step": 5276 + }, + { + "epoch": 1.5818999437886454, + "grad_norm": 0.22264760732650757, + "learning_rate": 6.274774774774774e-05, + "loss": 0.3838, + "step": 5277 + }, + { + "epoch": 1.5821997376803447, + "grad_norm": 0.2073168158531189, + "learning_rate": 6.27027027027027e-05, + "loss": 0.3424, + "step": 5278 + }, + { + "epoch": 1.5824995315720443, + "grad_norm": 0.20966382324695587, + "learning_rate": 6.265765765765765e-05, + "loss": 0.3666, + "step": 5279 + }, + { + "epoch": 1.5827993254637436, + "grad_norm": 0.19850985705852509, + "learning_rate": 6.26126126126126e-05, + "loss": 0.3304, + "step": 5280 + }, + { + "epoch": 1.5830991193554431, + "grad_norm": 0.2018197774887085, + "learning_rate": 6.256756756756757e-05, + "loss": 0.3496, + "step": 5281 + }, + { + "epoch": 1.5833989132471427, + "grad_norm": 0.21336401998996735, + "learning_rate": 6.252252252252252e-05, + "loss": 0.3907, + "step": 5282 + }, + { + "epoch": 1.5836987071388422, + "grad_norm": 0.2028983235359192, + "learning_rate": 6.247747747747747e-05, + "loss": 0.3528, + "step": 5283 + }, + { + "epoch": 1.5839985010305415, + "grad_norm": 0.20132046937942505, + "learning_rate": 6.243243243243243e-05, + "loss": 0.3358, + "step": 5284 + }, + { + "epoch": 1.5842982949222408, + "grad_norm": 0.2028990089893341, + "learning_rate": 6.238738738738738e-05, + "loss": 0.3534, + "step": 5285 + }, + { + "epoch": 1.5845980888139404, + "grad_norm": 0.20739594101905823, + "learning_rate": 6.234234234234233e-05, + "loss": 0.3595, + "step": 5286 + }, + { + "epoch": 1.58489788270564, + "grad_norm": 0.2152402400970459, + "learning_rate": 6.229729729729728e-05, + "loss": 0.3585, + "step": 5287 + }, + { + "epoch": 1.5851976765973395, + "grad_norm": 0.2000325322151184, + "learning_rate": 6.225225225225225e-05, + "loss": 0.3307, + "step": 5288 + }, + { + "epoch": 1.5854974704890388, + "grad_norm": 0.1998959332704544, + "learning_rate": 6.22072072072072e-05, + "loss": 0.3517, + "step": 5289 + }, + { + "epoch": 1.585797264380738, + "grad_norm": 0.1964779496192932, + "learning_rate": 6.216216216216215e-05, + "loss": 0.331, + "step": 5290 + }, + { + "epoch": 1.5860970582724376, + "grad_norm": 0.21091748774051666, + "learning_rate": 6.211711711711711e-05, + "loss": 0.3483, + "step": 5291 + }, + { + "epoch": 1.5863968521641372, + "grad_norm": 0.20698702335357666, + "learning_rate": 6.207207207207206e-05, + "loss": 0.3513, + "step": 5292 + }, + { + "epoch": 1.5866966460558367, + "grad_norm": 0.21151360869407654, + "learning_rate": 6.202702702702703e-05, + "loss": 0.3753, + "step": 5293 + }, + { + "epoch": 1.586996439947536, + "grad_norm": 0.20693033933639526, + "learning_rate": 6.198198198198198e-05, + "loss": 0.3428, + "step": 5294 + }, + { + "epoch": 1.5872962338392356, + "grad_norm": 0.20240649580955505, + "learning_rate": 6.193693693693693e-05, + "loss": 0.3344, + "step": 5295 + }, + { + "epoch": 1.5875960277309349, + "grad_norm": 0.2075674682855606, + "learning_rate": 6.189189189189189e-05, + "loss": 0.358, + "step": 5296 + }, + { + "epoch": 1.5878958216226344, + "grad_norm": 0.2162725031375885, + "learning_rate": 6.184684684684684e-05, + "loss": 0.3737, + "step": 5297 + }, + { + "epoch": 1.588195615514334, + "grad_norm": 0.20333421230316162, + "learning_rate": 6.180180180180179e-05, + "loss": 0.3486, + "step": 5298 + }, + { + "epoch": 1.5884954094060335, + "grad_norm": 0.2127985805273056, + "learning_rate": 6.175675675675676e-05, + "loss": 0.3723, + "step": 5299 + }, + { + "epoch": 1.5887952032977328, + "grad_norm": 0.20037934184074402, + "learning_rate": 6.17117117117117e-05, + "loss": 0.3639, + "step": 5300 + }, + { + "epoch": 1.5890949971894321, + "grad_norm": 0.21006622910499573, + "learning_rate": 6.166666666666666e-05, + "loss": 0.3539, + "step": 5301 + }, + { + "epoch": 1.5893947910811317, + "grad_norm": 0.21559639275074005, + "learning_rate": 6.162162162162162e-05, + "loss": 0.3706, + "step": 5302 + }, + { + "epoch": 1.5896945849728312, + "grad_norm": 0.19809530675411224, + "learning_rate": 6.157657657657657e-05, + "loss": 0.3262, + "step": 5303 + }, + { + "epoch": 1.5899943788645308, + "grad_norm": 0.2111746221780777, + "learning_rate": 6.153153153153152e-05, + "loss": 0.367, + "step": 5304 + }, + { + "epoch": 1.59029417275623, + "grad_norm": 0.2136707603931427, + "learning_rate": 6.148648648648647e-05, + "loss": 0.3775, + "step": 5305 + }, + { + "epoch": 1.5905939666479294, + "grad_norm": 0.21586540341377258, + "learning_rate": 6.144144144144144e-05, + "loss": 0.3507, + "step": 5306 + }, + { + "epoch": 1.590893760539629, + "grad_norm": 0.21576684713363647, + "learning_rate": 6.139639639639639e-05, + "loss": 0.3788, + "step": 5307 + }, + { + "epoch": 1.5911935544313285, + "grad_norm": 0.2173517942428589, + "learning_rate": 6.135135135135135e-05, + "loss": 0.3512, + "step": 5308 + }, + { + "epoch": 1.591493348323028, + "grad_norm": 0.19617997109889984, + "learning_rate": 6.13063063063063e-05, + "loss": 0.3464, + "step": 5309 + }, + { + "epoch": 1.5917931422147273, + "grad_norm": 0.1915069967508316, + "learning_rate": 6.126126126126126e-05, + "loss": 0.3131, + "step": 5310 + }, + { + "epoch": 1.5920929361064269, + "grad_norm": 0.21343593299388885, + "learning_rate": 6.121621621621621e-05, + "loss": 0.3542, + "step": 5311 + }, + { + "epoch": 1.5923927299981262, + "grad_norm": 0.19464311003684998, + "learning_rate": 6.117117117117116e-05, + "loss": 0.3474, + "step": 5312 + }, + { + "epoch": 1.5926925238898257, + "grad_norm": 0.2090366929769516, + "learning_rate": 6.112612612612613e-05, + "loss": 0.3418, + "step": 5313 + }, + { + "epoch": 1.5929923177815253, + "grad_norm": 0.2039409577846527, + "learning_rate": 6.108108108108108e-05, + "loss": 0.3397, + "step": 5314 + }, + { + "epoch": 1.5932921116732248, + "grad_norm": 0.1947229951620102, + "learning_rate": 6.103603603603603e-05, + "loss": 0.328, + "step": 5315 + }, + { + "epoch": 1.5935919055649241, + "grad_norm": 0.20912787318229675, + "learning_rate": 6.099099099099098e-05, + "loss": 0.344, + "step": 5316 + }, + { + "epoch": 1.5938916994566235, + "grad_norm": 0.21064409613609314, + "learning_rate": 6.094594594594594e-05, + "loss": 0.3577, + "step": 5317 + }, + { + "epoch": 1.594191493348323, + "grad_norm": 0.2055872529745102, + "learning_rate": 6.0900900900900893e-05, + "loss": 0.365, + "step": 5318 + }, + { + "epoch": 1.5944912872400225, + "grad_norm": 0.20798040926456451, + "learning_rate": 6.085585585585585e-05, + "loss": 0.3503, + "step": 5319 + }, + { + "epoch": 1.594791081131722, + "grad_norm": 0.20974093675613403, + "learning_rate": 6.081081081081081e-05, + "loss": 0.3587, + "step": 5320 + }, + { + "epoch": 1.5950908750234214, + "grad_norm": 0.19876185059547424, + "learning_rate": 6.0765765765765765e-05, + "loss": 0.3497, + "step": 5321 + }, + { + "epoch": 1.5953906689151207, + "grad_norm": 0.1975713074207306, + "learning_rate": 6.0720720720720715e-05, + "loss": 0.3295, + "step": 5322 + }, + { + "epoch": 1.5956904628068203, + "grad_norm": 0.20481306314468384, + "learning_rate": 6.0675675675675665e-05, + "loss": 0.3272, + "step": 5323 + }, + { + "epoch": 1.5959902566985198, + "grad_norm": 0.20750102400779724, + "learning_rate": 6.063063063063063e-05, + "loss": 0.356, + "step": 5324 + }, + { + "epoch": 1.5962900505902193, + "grad_norm": 0.21194814145565033, + "learning_rate": 6.058558558558558e-05, + "loss": 0.3695, + "step": 5325 + }, + { + "epoch": 1.5965898444819187, + "grad_norm": 0.1984264999628067, + "learning_rate": 6.054054054054053e-05, + "loss": 0.3514, + "step": 5326 + }, + { + "epoch": 1.5968896383736182, + "grad_norm": 0.2006942182779312, + "learning_rate": 6.0495495495495494e-05, + "loss": 0.3538, + "step": 5327 + }, + { + "epoch": 1.5971894322653175, + "grad_norm": 0.20723217725753784, + "learning_rate": 6.0450450450450444e-05, + "loss": 0.3668, + "step": 5328 + }, + { + "epoch": 1.597489226157017, + "grad_norm": 0.21457070112228394, + "learning_rate": 6.04054054054054e-05, + "loss": 0.3976, + "step": 5329 + }, + { + "epoch": 1.5977890200487166, + "grad_norm": 0.19332286715507507, + "learning_rate": 6.036036036036035e-05, + "loss": 0.3395, + "step": 5330 + }, + { + "epoch": 1.598088813940416, + "grad_norm": 0.20550230145454407, + "learning_rate": 6.0315315315315315e-05, + "loss": 0.3395, + "step": 5331 + }, + { + "epoch": 1.5983886078321154, + "grad_norm": 0.19099922478199005, + "learning_rate": 6.0270270270270266e-05, + "loss": 0.3172, + "step": 5332 + }, + { + "epoch": 1.5986884017238148, + "grad_norm": 0.20576585829257965, + "learning_rate": 6.0225225225225216e-05, + "loss": 0.3544, + "step": 5333 + }, + { + "epoch": 1.5989881956155143, + "grad_norm": 0.20336727797985077, + "learning_rate": 6.018018018018017e-05, + "loss": 0.3568, + "step": 5334 + }, + { + "epoch": 1.5992879895072138, + "grad_norm": 0.21562445163726807, + "learning_rate": 6.013513513513513e-05, + "loss": 0.363, + "step": 5335 + }, + { + "epoch": 1.5995877833989134, + "grad_norm": 0.20894883573055267, + "learning_rate": 6.009009009009009e-05, + "loss": 0.357, + "step": 5336 + }, + { + "epoch": 1.5998875772906127, + "grad_norm": 0.21003037691116333, + "learning_rate": 6.004504504504504e-05, + "loss": 0.349, + "step": 5337 + }, + { + "epoch": 1.600187371182312, + "grad_norm": 0.1921328455209732, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.3329, + "step": 5338 + }, + { + "epoch": 1.6004871650740116, + "grad_norm": 0.21384379267692566, + "learning_rate": 5.995495495495495e-05, + "loss": 0.361, + "step": 5339 + }, + { + "epoch": 1.600786958965711, + "grad_norm": 0.2054745852947235, + "learning_rate": 5.99099099099099e-05, + "loss": 0.3383, + "step": 5340 + }, + { + "epoch": 1.6010867528574106, + "grad_norm": 0.20083633065223694, + "learning_rate": 5.986486486486486e-05, + "loss": 0.3442, + "step": 5341 + }, + { + "epoch": 1.60138654674911, + "grad_norm": 0.19561520218849182, + "learning_rate": 5.9819819819819816e-05, + "loss": 0.3416, + "step": 5342 + }, + { + "epoch": 1.6016863406408093, + "grad_norm": 0.22439317405223846, + "learning_rate": 5.977477477477477e-05, + "loss": 0.3699, + "step": 5343 + }, + { + "epoch": 1.6019861345325088, + "grad_norm": 0.20617817342281342, + "learning_rate": 5.9729729729729724e-05, + "loss": 0.3572, + "step": 5344 + }, + { + "epoch": 1.6022859284242084, + "grad_norm": 0.21624356508255005, + "learning_rate": 5.968468468468468e-05, + "loss": 0.3911, + "step": 5345 + }, + { + "epoch": 1.602585722315908, + "grad_norm": 0.21435832977294922, + "learning_rate": 5.963963963963964e-05, + "loss": 0.3497, + "step": 5346 + }, + { + "epoch": 1.6028855162076072, + "grad_norm": 0.20786425471305847, + "learning_rate": 5.959459459459459e-05, + "loss": 0.335, + "step": 5347 + }, + { + "epoch": 1.6031853100993068, + "grad_norm": 0.20502492785453796, + "learning_rate": 5.954954954954954e-05, + "loss": 0.3595, + "step": 5348 + }, + { + "epoch": 1.603485103991006, + "grad_norm": 0.20398326218128204, + "learning_rate": 5.95045045045045e-05, + "loss": 0.3562, + "step": 5349 + }, + { + "epoch": 1.6037848978827056, + "grad_norm": 0.19771265983581543, + "learning_rate": 5.945945945945945e-05, + "loss": 0.338, + "step": 5350 + }, + { + "epoch": 1.6040846917744052, + "grad_norm": 0.20617644488811493, + "learning_rate": 5.941441441441441e-05, + "loss": 0.3617, + "step": 5351 + }, + { + "epoch": 1.6043844856661047, + "grad_norm": 0.20850981771945953, + "learning_rate": 5.936936936936936e-05, + "loss": 0.3757, + "step": 5352 + }, + { + "epoch": 1.604684279557804, + "grad_norm": 0.20652638375759125, + "learning_rate": 5.9324324324324324e-05, + "loss": 0.3576, + "step": 5353 + }, + { + "epoch": 1.6049840734495033, + "grad_norm": 0.20817598700523376, + "learning_rate": 5.9279279279279274e-05, + "loss": 0.3645, + "step": 5354 + }, + { + "epoch": 1.6052838673412029, + "grad_norm": 0.19501417875289917, + "learning_rate": 5.9234234234234225e-05, + "loss": 0.346, + "step": 5355 + }, + { + "epoch": 1.6055836612329024, + "grad_norm": 0.20347361266613007, + "learning_rate": 5.918918918918919e-05, + "loss": 0.3521, + "step": 5356 + }, + { + "epoch": 1.605883455124602, + "grad_norm": 0.19727610051631927, + "learning_rate": 5.914414414414414e-05, + "loss": 0.3488, + "step": 5357 + }, + { + "epoch": 1.6061832490163013, + "grad_norm": 0.1962079256772995, + "learning_rate": 5.909909909909909e-05, + "loss": 0.3359, + "step": 5358 + }, + { + "epoch": 1.6064830429080006, + "grad_norm": 0.19446706771850586, + "learning_rate": 5.9054054054054046e-05, + "loss": 0.3439, + "step": 5359 + }, + { + "epoch": 1.6067828367997001, + "grad_norm": 0.2100493609905243, + "learning_rate": 5.9009009009009003e-05, + "loss": 0.3559, + "step": 5360 + }, + { + "epoch": 1.6070826306913997, + "grad_norm": 0.20190827548503876, + "learning_rate": 5.896396396396396e-05, + "loss": 0.3572, + "step": 5361 + }, + { + "epoch": 1.6073824245830992, + "grad_norm": 0.19307351112365723, + "learning_rate": 5.891891891891891e-05, + "loss": 0.3234, + "step": 5362 + }, + { + "epoch": 1.6076822184747985, + "grad_norm": 0.2018936276435852, + "learning_rate": 5.8873873873873875e-05, + "loss": 0.3509, + "step": 5363 + }, + { + "epoch": 1.607982012366498, + "grad_norm": 0.19602860510349274, + "learning_rate": 5.8828828828828825e-05, + "loss": 0.3462, + "step": 5364 + }, + { + "epoch": 1.6082818062581974, + "grad_norm": 0.19937603175640106, + "learning_rate": 5.8783783783783775e-05, + "loss": 0.3514, + "step": 5365 + }, + { + "epoch": 1.608581600149897, + "grad_norm": 0.20160555839538574, + "learning_rate": 5.873873873873873e-05, + "loss": 0.3628, + "step": 5366 + }, + { + "epoch": 1.6088813940415965, + "grad_norm": 0.20256981253623962, + "learning_rate": 5.869369369369369e-05, + "loss": 0.3315, + "step": 5367 + }, + { + "epoch": 1.609181187933296, + "grad_norm": 0.21089547872543335, + "learning_rate": 5.864864864864865e-05, + "loss": 0.3704, + "step": 5368 + }, + { + "epoch": 1.6094809818249953, + "grad_norm": 0.20721639692783356, + "learning_rate": 5.86036036036036e-05, + "loss": 0.3485, + "step": 5369 + }, + { + "epoch": 1.6097807757166946, + "grad_norm": 0.19964303076267242, + "learning_rate": 5.855855855855855e-05, + "loss": 0.3449, + "step": 5370 + }, + { + "epoch": 1.6100805696083942, + "grad_norm": 0.21870988607406616, + "learning_rate": 5.851351351351351e-05, + "loss": 0.3566, + "step": 5371 + }, + { + "epoch": 1.6103803635000937, + "grad_norm": 0.21295110881328583, + "learning_rate": 5.846846846846846e-05, + "loss": 0.3539, + "step": 5372 + }, + { + "epoch": 1.6106801573917933, + "grad_norm": 0.19819046556949615, + "learning_rate": 5.842342342342342e-05, + "loss": 0.3508, + "step": 5373 + }, + { + "epoch": 1.6109799512834926, + "grad_norm": 0.21221822500228882, + "learning_rate": 5.8378378378378376e-05, + "loss": 0.3716, + "step": 5374 + }, + { + "epoch": 1.611279745175192, + "grad_norm": 0.20993143320083618, + "learning_rate": 5.8333333333333326e-05, + "loss": 0.3714, + "step": 5375 + }, + { + "epoch": 1.6115795390668914, + "grad_norm": 0.20888566970825195, + "learning_rate": 5.828828828828828e-05, + "loss": 0.3561, + "step": 5376 + }, + { + "epoch": 1.611879332958591, + "grad_norm": 0.1925889551639557, + "learning_rate": 5.824324324324323e-05, + "loss": 0.3458, + "step": 5377 + }, + { + "epoch": 1.6121791268502905, + "grad_norm": 0.19851090013980865, + "learning_rate": 5.81981981981982e-05, + "loss": 0.3354, + "step": 5378 + }, + { + "epoch": 1.6124789207419898, + "grad_norm": 0.20619840919971466, + "learning_rate": 5.815315315315315e-05, + "loss": 0.3571, + "step": 5379 + }, + { + "epoch": 1.6127787146336894, + "grad_norm": 0.20241157710552216, + "learning_rate": 5.81081081081081e-05, + "loss": 0.3478, + "step": 5380 + }, + { + "epoch": 1.6130785085253887, + "grad_norm": 0.2011449635028839, + "learning_rate": 5.806306306306306e-05, + "loss": 0.354, + "step": 5381 + }, + { + "epoch": 1.6133783024170882, + "grad_norm": 0.19988194108009338, + "learning_rate": 5.801801801801801e-05, + "loss": 0.3399, + "step": 5382 + }, + { + "epoch": 1.6136780963087878, + "grad_norm": 0.21191838383674622, + "learning_rate": 5.797297297297297e-05, + "loss": 0.3748, + "step": 5383 + }, + { + "epoch": 1.6139778902004873, + "grad_norm": 0.20201589167118073, + "learning_rate": 5.792792792792792e-05, + "loss": 0.3636, + "step": 5384 + }, + { + "epoch": 1.6142776840921866, + "grad_norm": 0.22016596794128418, + "learning_rate": 5.788288288288288e-05, + "loss": 0.3713, + "step": 5385 + }, + { + "epoch": 1.614577477983886, + "grad_norm": 0.20050211250782013, + "learning_rate": 5.7837837837837834e-05, + "loss": 0.337, + "step": 5386 + }, + { + "epoch": 1.6148772718755855, + "grad_norm": 0.20136626064777374, + "learning_rate": 5.7792792792792784e-05, + "loss": 0.3672, + "step": 5387 + }, + { + "epoch": 1.615177065767285, + "grad_norm": 0.20241838693618774, + "learning_rate": 5.774774774774775e-05, + "loss": 0.3585, + "step": 5388 + }, + { + "epoch": 1.6154768596589846, + "grad_norm": 0.20410926640033722, + "learning_rate": 5.77027027027027e-05, + "loss": 0.352, + "step": 5389 + }, + { + "epoch": 1.615776653550684, + "grad_norm": 0.20114073157310486, + "learning_rate": 5.7657657657657655e-05, + "loss": 0.3386, + "step": 5390 + }, + { + "epoch": 1.6160764474423832, + "grad_norm": 0.20846368372440338, + "learning_rate": 5.7612612612612606e-05, + "loss": 0.3623, + "step": 5391 + }, + { + "epoch": 1.6163762413340828, + "grad_norm": 0.20690296590328217, + "learning_rate": 5.756756756756756e-05, + "loss": 0.3526, + "step": 5392 + }, + { + "epoch": 1.6166760352257823, + "grad_norm": 0.21066324412822723, + "learning_rate": 5.752252252252252e-05, + "loss": 0.3576, + "step": 5393 + }, + { + "epoch": 1.6169758291174818, + "grad_norm": 0.1903742104768753, + "learning_rate": 5.747747747747747e-05, + "loss": 0.3385, + "step": 5394 + }, + { + "epoch": 1.6172756230091812, + "grad_norm": 0.19906085729599, + "learning_rate": 5.743243243243242e-05, + "loss": 0.333, + "step": 5395 + }, + { + "epoch": 1.6175754169008807, + "grad_norm": 0.20286022126674652, + "learning_rate": 5.7387387387387384e-05, + "loss": 0.3405, + "step": 5396 + }, + { + "epoch": 1.61787521079258, + "grad_norm": 0.2012995481491089, + "learning_rate": 5.7342342342342335e-05, + "loss": 0.3622, + "step": 5397 + }, + { + "epoch": 1.6181750046842795, + "grad_norm": 0.20434437692165375, + "learning_rate": 5.729729729729729e-05, + "loss": 0.3456, + "step": 5398 + }, + { + "epoch": 1.618474798575979, + "grad_norm": 0.21659024059772491, + "learning_rate": 5.725225225225225e-05, + "loss": 0.3764, + "step": 5399 + }, + { + "epoch": 1.6187745924676786, + "grad_norm": 0.20373186469078064, + "learning_rate": 5.7207207207207206e-05, + "loss": 0.3489, + "step": 5400 + } + ], + "logging_steps": 1, + "max_steps": 6670, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4368809647228741e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}